|
Public Member Functions |
| PLSA (const lemur::api::Index &dbIndex, int numCats, lemur::utility::HashFreqVector **train, lemur::utility::HashFreqVector **validate, int numIter, int numRestarts, double betastart, double betastop, double anneal, double betaMod) |
| building with provided train/test partitions
|
| PLSA (const lemur::api::Index &dbIndex, int testPercentage, int numCats, int numIter, int numRestarts, double betastart, double betastop, double anneal, double betaMod) |
| building without provided train/test partitions
|
| PLSA (const lemur::api::Index &dbIndex) |
| for using prebuilt tables.
|
virtual | ~PLSA () |
void | iterateWithRestarts () |
| Start things going.
|
double * | get_p_z () const |
double ** | get_p_w_z () const |
| P(w|z) matrix.
|
double ** | get_p_d_z () const |
| P(d|z) matrix.
|
double | getProb (int d, int w) const |
| P(d,w).
|
int | numWords () const |
| number of terms
|
int | numDocs () const |
| number of docs
|
int | numCats () const |
| number of categories
|
bool | readArrays () |
Private Types |
enum | pType { P_Z = 0,
P_W_Z = 1,
P_D_Z = 2
} |
| Read/write array options.
|
Private Member Functions |
void | setPrevToCurrent () |
| Copy current iteration data to previous iteration data.
|
void | setCurrentToBest () |
| Copy best iteration data to current iteration data.
|
void | setBestToCurrent () |
| Copy current iteration data to best iteration data.
|
void | setBestToPrev () |
| Copy previous iteration data to best iteration data.
|
void | setPrevToBest () |
| Copy best iteration data to previous iteration data.
|
double | getAverageLikelihood () |
double | getAverageLikelihoodPrev () |
double | jointEstimate (int indexD, int indexW) |
| Estimates P(d,w) using previous parameter estimates.
|
double | jointEstimateCurrent (int indexD, int indexW) |
| Estimates P(d,w) using current parameter estimates.
|
double | jointEstimateBest (int indexD, int indexW) |
| Estimates P(d,w) using best parameter estimates.
|
double | jointEstimateBeta (int indexD, int indexW) |
void | iterate () |
| main routine for model training.
|
void | initializeParameters () |
| Initialize the prev probability arrays to random values.
|
double | doLogLikelihood (jointfuncType, lemur::utility::HashFreqVector **&myData) |
double | logLikelihood () |
| Calculate the training data log-likelihood using prev parameters.
|
double | validateDataLogLikelihood () |
| Calculate the hold out data log-likelihood using prev parameters.
|
double | validateCurrentLogLikelihood () |
| Calculate the hold out data log-likelihood using current parameters.
|
double | bestDataLogLikelihood () |
| Calculate the hold out data log-likelihood using the best parameters.
|
double | interleavedIterationEM () |
| performs one EM iteration, returns log likelihood of training data
|
void | selectTestTrain (int testPercent) |
| Select training/test events.
|
void | init () |
| Initialize attributes.
|
void | initR () |
| Initialize R and w->d inverted list.
|
void | writeArrays () |
| write out all the arrays to file.
|
bool | readArray (ifstream &infile, enum pType which) |
| Read a probability array (matrix) from a file.
|
void | writeArray (ofstream &ofile, enum pType which) |
| Write a probability array (matrix) to a file.
|
Private Attributes |
const lemur::api::Index & | ind |
| Index to use.
|
int | sizeZ |
| number of categories
|
int | sizeD |
| number of documents
|
int | sizeW |
| number of words
|
lemur::utility::HashFreqVector ** | data |
| train d->w freq list
|
lemur::utility::HashFreqVector ** | testData |
| test (validation) d->w freq list
|
set< int, less< int > > * | invIndex |
| w->d inverted index for M step of P(w | z)
|
double | startBeta |
| Starting value of beta for TEM.
|
double | beta |
| Current value of beta for TEM.
|
double | betaMin |
| Minimum value of beta for TEM.
|
double | betaModifier |
| eta for TEM (beta = eta * beta;)
|
double | annealcue |
| annealcue value (delta)
|
int | R |
| used in M step for p_z
|
int | numberOfIterations |
| How many iterations.
|
int | numberOfRestarts |
| How many restarts.
|
double | bestTestLL |
| Best log likelihood on the test data so far.
|
double | bestA |
| Best average log likelihood on the test data so far.
|
bool | bestOnly |
| have we only loaded existing tables from files?
|
bool | ownMem |
| did we allocate the test/train vectors?
|
double * | p_z_current |
| P(z) vector current iteration.
|
double ** | p_w_z_current |
| P(w|z) matrix current iteration.
|
double ** | p_d_z_current |
| P(d|z) matrix current iteration.
|
double * | p_z_prev |
| P(z) vector previous iteration.
|
double ** | p_w_z_prev |
| P(w|z) matrix previous iteration.
|
double ** | p_d_z_prev |
| P(d|z) matrix previous iteration.
|
double * | p_z_best |
| P(z) vector best iteration.
|
double ** | p_w_z_best |
| P(w|z) matrix best iteration.
|
double ** | p_d_z_best |
| P(d|z) matrix best iteration.
|