Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

SimpleKLDocModel.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  *
00003  *  Original source copyright (c) 2001, Carnegie Mellon University.
00004  *  See copyright.cmu for details.
00005  *  Modifications copyright (c) 2002, University of Massachusetts.
00006  *  See copyright.umass for details.
00007  *
00008  *==========================================================================
00009  */
00010 
00011 #ifndef _SIMPLEKLDOCMODEL_HPP
00012 #define _SIMPLEKLDOCMODEL_HPP
00013 
00014 #include "DocumentRep.hpp"
00015 #include "Index.hpp"
00016 #include "UnigramLM.hpp"
00017 
00018 
00020 namespace SimpleKLParameter {
        // Enums, parameter structs and default values for the document models
        // declared later in this header.

        // Smoothing method identifiers. The four names correspond to the four
        // document model classes declared in this header (Jelinek-Mercer,
        // Dirichlet prior, absolute discount, two-stage).
00021   enum SmoothMethod  {JELINEKMERCER=0, DIRICHLETPRIOR=1, ABSOLUTEDISCOUNT=2, 
00022                       TWOSTAGE=3};
00023  
        // Smoothing strategy. Every document model in this header branches on
        // this value in unseenCoeff() and seenProb().
00024   enum SmoothStrategy  {INTERPOLATE=0, BACKOFF=1};
00025 
        // Type of QueryModelParam::fbMethod. The meaning of each value is not
        // established by this header -- confirm against the code that reads it.
00026   enum QueryUpdateMethod {MIXTURE = 0, DIVMIN=1, MARKOVCHAIN=2, RM1=3, RM2=4};
00027 
        // Type of QueryModelParam::adjScoreMethod. Unlike the other enums in
        // this namespace, the values start at 1 rather than 0.
00028   enum adjustedScoreMethods {QUERYLIKELIHOOD = 1, CROSSENTROPY = 2, 
00029                              NEGATIVEKLD = 3};
00030 
        // Bundle of document smoothing settings: a method, a strategy, and one
        // numeric parameter per method family.
        // NOTE(review): presumably ADDelta is the absolute-discount delta,
        // JMLambda the Jelinek-Mercer lambda and DirPrior the Dirichlet prior
        // (mu); the struct is not consumed in this header -- confirm.
00031   struct DocSmoothParam {
00033     enum SmoothMethod smthMethod;
00035     enum SmoothStrategy smthStrategy;
00037     double ADDelta;
00039     double JMLambda;
00041     double DirPrior;
00042   };
00043 
        // Default values whose names mirror the DocSmoothParam fields.
        // NOTE(review): these are non-const `static` variables at namespace
        // scope in a header, so every translation unit that includes this file
        // gets its own separate, modifiable copy.
00044   static enum SmoothMethod defaultSmoothMethod = DIRICHLETPRIOR;
00045   static enum SmoothStrategy defaultSmoothStrategy = INTERPOLATE;
00046   static double defaultADDelta = 0.7;
00047   static double defaultJMLambda = 0.5;
00048   static double defaultDirPrior = 1000;
00049 
        // Bundle of query model settings.
        // NOTE(review): the field meanings are not established by this header.
        // The "fb" prefix presumably stands for feedback and "em" for
        // expectation-maximization -- confirm against the code that consumes
        // this struct.
00050   struct QueryModelParam {
00051     enum adjustedScoreMethods adjScoreMethod;
00053     double qryNoise;
00054 
00056     enum QueryUpdateMethod fbMethod;
00058     double fbCoeff;
00060     int fbTermCount;
00062     double fbPrTh;
00064     double fbPrSumTh;
00066     double fbMixtureNoise;
00068     int emIterations;
00069   };
00070 
        // Default values whose names mirror the QueryModelParam fields
        // (defaultFBMixNoise presumably pairs with fbMixtureNoise). The same
        // per-translation-unit copy caveat applies as for the defaults above.
        // There is no default here for adjScoreMethod.
00071   static enum QueryUpdateMethod defaultFBMethod = MIXTURE;
00072   static double defaultFBCoeff = 0.5;
00073   static int defaultFBTermCount =50;
00074   static double defaultFBPrTh = 0.001;
00075   static double defaultFBPrSumTh = 1;
00076   static double defaultFBMixNoise = 0.5;
00077   static int defaultEMIterations = 50;
00078   static double defaultQryNoise = 0; //maximum likelihood estimator
00079 }
00080 
00081 namespace lemur 
00082 {
00083   namespace retrieval
00084   {
00085     
00086 
00088 
00101     class SimpleKLDocModel : public lemur::api::DocumentRep {
            // Abstract document model. It declares the pure virtual
            // unseenCoeff() and seenProb() that each smoothing-specific
            // subclass in this header implements, and defines termWeight()
            // and scoreConstant() in terms of them.
00102     public:
            // Passes docID and dl to the DocumentRep base and records
            // collectLM, prMass and strat.
            // collectLM is stored as a reference, so the referenced model must
            // outlive this object. prMass may be NULL (the default); the
            // subclasses in this header dereference it only in their BACKOFF
            // branches, so BACKOFF needs a non-NULL array.
            // Note the parameter order (docID, collectLM, dl, ...) differs from
            // the subclasses' constructors, which take (docID, dl, collectLM, ...).
00103       SimpleKLDocModel(lemur::api::DOCID_T docID, const lemur::langmod::UnigramLM &collectLM, 
00104                        int dl = 1, 
00105                        const double *prMass = NULL,
00106                        SimpleKLParameter::SmoothStrategy strat = SimpleKLParameter::INTERPOLATE) : 
00107         lemur::api::DocumentRep(docID, dl), 
00108         refLM(collectLM), docPrMass(prMass), strategy(strat) {
00109       };
00110   
            // Empty: nothing is released here; refLM and docPrMass are not
            // freed by this class.
00111       ~SimpleKLDocModel() {};
00112 
            // Returns seenProb(tf, termID) / (unseenCoeff() * refLM.prob(termID)),
            // where tf is info->termCount().
            // NOTE(review): there is no guard against a zero denominator
            // (unseenCoeff() == 0 or refLM.prob(termID) == 0).
00114       virtual double termWeight(lemur::api::TERMID_T termID, const lemur::api::DocInfo *info) const {
00115         double sp = seenProb(info->termCount(), termID);
00116         double usp = unseenCoeff();
00117         double ref = refLM.prob(termID);
00118         double score = sp/(usp*ref);
00119         /*
00120           cerr << "TW:" << termID << " sp:" << sp << " usp:" << usp << " ref:" << ref << " s:" << score << endl;
00121         */
00122         //    return (seenProb(info->termCount(), termID)/(unseenCoeff()* refLM.prob(termID)));
00123         return score;
00124       }
00125 
            // Returns unseenCoeff() unchanged.
00127       virtual double scoreConstant() const {
00128         return unseenCoeff();
00129       }
00130 
            // Coefficient applied to the reference model; implemented by each
            // subclass for its smoothing method and strategy.
00132       virtual double unseenCoeff() const =0; // a(d)
            // Smoothed probability of a term that occurs termFreq times in the
            // document; implemented by each subclass.
00134       virtual double seenProb(double termFreq, lemur::api::TERMID_T termID) const =0;
00135 
00136     protected:
            // Collection (reference) language model, held by reference.
00137       const lemur::langmod::UnigramLM &refLM;
            // Array indexed by `id` in the subclasses' BACKOFF branches; not
            // owned by this class. May be NULL.
00138       const double *docPrMass;
            // Selects the INTERPOLATE or BACKOFF formulas in the subclasses.
00139       SimpleKLParameter::SmoothStrategy strategy;
00140     };
00141 
00142 
00143 
00145 
00153     class JelinekMercerDocModel : public SimpleKLDocModel {
            // Document model with a fixed weight `lambda` on the collection
            // language model (Jelinek-Mercer smoothing, per the class name).
            // `id` and `docLength` used below are not declared in this header;
            // presumably they are members inherited from DocumentRep -- confirm.
00154     public:
            // Forwards docID, collectLM, dl, docProbMass and smthStrategy to
            // SimpleKLDocModel and stores collectLMWeight as lambda.
00155       JelinekMercerDocModel(lemur::api::DOCID_T docID, 
00156                             int dl,
00157                             const lemur::langmod::UnigramLM &collectLM,
00158                             const double *docProbMass,
00159                             double collectLMWeight, 
00160                             SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00161         SimpleKLDocModel(docID, collectLM, dl, docProbMass, smthStrategy),
00162         lambda(collectLMWeight) {
00163       };
00164 
00165       virtual ~JelinekMercerDocModel() {};
00166   
            // INTERPOLATE: lambda.
            // BACKOFF:     lambda / (1 - docPrMass[id]); docPrMass must be
            //              non-NULL here.
            // Any other strategy value throws lemur::api::Exception.
00167       virtual double unseenCoeff() const {
00168         if (strategy == SimpleKLParameter::INTERPOLATE) {
00169           return lambda;
00170         } else if (strategy==SimpleKLParameter::BACKOFF) {
00171           return lambda/(1-docPrMass[id]);
00172         } else {
00173           throw lemur::api::Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00174         }
00175       }
            // INTERPOLATE: (1-lambda)*termFreq/docLength + lambda*refLM.prob(termID).
            // BACKOFF:     (1-lambda)*termFreq/docLength (termID is not used).
            // Any other strategy value throws lemur::api::Exception.
            // No guard against docLength == 0.
00176       virtual double seenProb(double termFreq, lemur::api::TERMID_T termID) const {
00177         if (strategy == SimpleKLParameter::INTERPOLATE) {
00178           return ((1-lambda)*termFreq/(double)docLength +
00179                   lambda*refLM.prob(termID));
00180         } else if (strategy == SimpleKLParameter::BACKOFF) {
00181           return ((1-lambda)*termFreq/(double)docLength);
00182         } else {
00183           throw lemur::api::Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00184         }
00185       }
00186     private:
            // Weight given to the collection language model.
00187       double lambda;
00188     };
00189 
00191 
00196     class DirichletPriorDocModel : public SimpleKLDocModel {
            // Document model parameterized by a prior word count `mu`
            // (Dirichlet prior smoothing, per the class name).
            // `id` and `docLength` used below are not declared in this header;
            // presumably they are members inherited from DocumentRep -- confirm.
00197     public:
            // Forwards docID, collectLM, dl, docProbMass and smthStrategy to
            // SimpleKLDocModel and stores priorWordCount as mu.
00198       DirichletPriorDocModel(lemur::api::DOCID_T docID,
00199                              int dl,
00200                              const lemur::langmod::UnigramLM &collectLM,
00201                              const double *docProbMass,
00202                              double priorWordCount,
00203                              SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00204         SimpleKLDocModel(docID, collectLM, dl, docProbMass, smthStrategy),
00205         mu(priorWordCount) {
00206       };
00207 
00208       virtual ~DirichletPriorDocModel() {};
00209 
            // INTERPOLATE: mu / (mu + docLength).
            // BACKOFF:     mu / ((mu + docLength) * (1 - docPrMass[id]));
            //              docPrMass must be non-NULL here.
            // Any other strategy value throws lemur::api::Exception.
00210       virtual double unseenCoeff() const {
00211 
00212         if (strategy == SimpleKLParameter::INTERPOLATE) {
00213           return mu/(mu+docLength);
00214         } else if (strategy==SimpleKLParameter::BACKOFF) {
00215           return (mu/((mu+docLength)*(1-docPrMass[id])));
00216         } else {
00217           throw lemur::api::Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00218         }
00219       }
00220 
            // INTERPOLATE: (termFreq + mu*refLM.prob(termID)) / (docLength + mu).
            // BACKOFF:     termFreq / (docLength + mu) (termID is not used).
            // Any other strategy value throws lemur::api::Exception.
00221       virtual double seenProb(double termFreq, lemur::api::TERMID_T termID) const {
00222         if (strategy == SimpleKLParameter::INTERPOLATE) {
00223           return (termFreq+mu*refLM.prob(termID))/
00224             (double)(docLength+mu);
00225         } else if (strategy == SimpleKLParameter::BACKOFF) {
00226           return (termFreq/(double)(docLength+mu));
00227         } else {      
00228           throw lemur::api::Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00229         }
00230       }
00231     private:
            // Prior word count added to the document length in the formulas above.
00232       double mu;
00233     };
00234 
00236 
00243     class AbsoluteDiscountDocModel : public SimpleKLDocModel {
            // Document model that subtracts a constant `delta` from each term
            // frequency and scales the collection model by
            // delta * uniqDocLen[id] / docLength (absolute discounting, per the
            // class name).
            // `id` and `docLength` used below are not declared in this header;
            // presumably they are members inherited from DocumentRep -- confirm.
00244     public:
            // Forwards docID, collectLM, dl, docProbMass and smthStrategy to
            // SimpleKLDocModel; stores uniqueTermCount as uniqDocLen and
            // discount as delta. uniqueTermCount is indexed by `id` and is not
            // freed by this class.
00245       AbsoluteDiscountDocModel(lemur::api::DOCID_T docID,
00246                                int dl,
00247                                const lemur::langmod::UnigramLM &collectLM,
00248                                const double *docProbMass,
00249                                lemur::api::COUNT_T *uniqueTermCount,
00250                                double discount,
00251                                SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00252         SimpleKLDocModel(docID, collectLM, dl, docProbMass, smthStrategy),
00253         uniqDocLen(uniqueTermCount),
00254         delta(discount) {
00255       };
00256 
00257       virtual ~AbsoluteDiscountDocModel() {};
00258   
            // INTERPOLATE: delta * uniqDocLen[id] / docLength.
            // BACKOFF:     delta * uniqDocLen[id] / (docLength * (1 - docPrMass[id]));
            //              docPrMass must be non-NULL here.
            // Any other strategy value throws lemur::api::Exception.
00259       virtual double unseenCoeff() const {
00260 
00261         if (strategy == SimpleKLParameter::INTERPOLATE) {
00262           return (delta*uniqDocLen[id]/(double)docLength);
00263         } else if (strategy==SimpleKLParameter::BACKOFF) {
00264           return (delta*uniqDocLen[id]/(docLength*(1-docPrMass[id])));
00265         } else {
00266           throw lemur::api::Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00267         }
00268       }
            // INTERPOLATE: (termFreq - delta)/docLength
            //              + delta * uniqDocLen[id] * refLM.prob(termID) / docLength.
            // BACKOFF:     (termFreq - delta)/docLength (termID is not used).
            // Any other strategy value throws lemur::api::Exception.
            // The result is negative when termFreq < delta in the BACKOFF branch;
            // no clamping is applied.
00269       virtual double seenProb(double termFreq, lemur::api::TERMID_T termID) const {
00270         if (strategy == SimpleKLParameter::INTERPOLATE) {
00271           return ((termFreq-delta)/(double)docLength+
00272                   delta*uniqDocLen[id]*refLM.prob(termID)/(double)docLength);
00273         } else if (strategy == SimpleKLParameter::BACKOFF) {
00274           return ((termFreq-delta)/(double)docLength);
00275         } else {
00276           throw lemur::api::Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00277         }
00278       }
00279     private:
            // NOTE(review): collectPr is never initialized or read anywhere in
            // this class as shown -- looks like a leftover; candidate for removal.
00280       double *collectPr;
            // Array of unique-term counts indexed by `id`; not owned.
00281       lemur::api::COUNT_T *uniqDocLen;
            // Discount subtracted from each term frequency.
00282       double delta;
00283     };
00284 
00285 
00287     // alpha = (mu+lambda*dLength)/(dLength+mu)
00288     // pseen(w) = [(1-lambda)*c(w;d)+ (mu+lambda*dLength)*Pc(w)]/(dLength + mu)
00289     class TwoStageDocModel : public SimpleKLDocModel {
            // Document model combining a prior word count `mu` with a mixing
            // weight `lambda` on the collection model (two-stage smoothing, per
            // the class name).
            // `id` and `docLength` used below are not declared in this header;
            // presumably they are members inherited from DocumentRep -- confirm.
00290     public:
            // Forwards docID, collectLM, dl, docProbMass and smthStrategy to
            // SimpleKLDocModel; stores firstStageMu as mu and secondStageLambda
            // as lambda.
00291       TwoStageDocModel(lemur::api::DOCID_T docID,
00292                        int dl,
00293                        const lemur::langmod::UnigramLM &collectLM,
00294                        const double *docProbMass,
00295                        double firstStageMu, 
00296                        double secondStageLambda, 
00297                        SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE): 
00298         SimpleKLDocModel(docID, collectLM, dl, docProbMass, smthStrategy),
00299         mu(firstStageMu),
00300         lambda(secondStageLambda) {
00301       };
00302 
00303       virtual ~TwoStageDocModel() {};
00304 
            // INTERPOLATE: (mu + lambda*docLength) / (mu + docLength).
            // BACKOFF:     (mu + lambda*docLength)
            //              / ((mu + docLength) * (1 - docPrMass[id]));
            //              docPrMass must be non-NULL here.
            // Any other strategy value throws lemur::api::Exception.
00305       virtual double unseenCoeff() const {
00306 
00307         if (strategy == SimpleKLParameter::INTERPOLATE) {
00308           return (mu+lambda*docLength)/(mu+docLength);
00309         } else if (strategy == SimpleKLParameter::BACKOFF) {
00310           return ((mu+lambda*docLength)/((mu+docLength)*(1-docPrMass[id])));
00311         } else {
00312           throw lemur::api::Exception("TwoStageDocModel", "Unknown smoothing strategy");
00313         }
00314       }
00315 
            // INTERPOLATE: (1-lambda)*(termFreq + mu*refLM.prob(termID))/(docLength + mu)
            //              + lambda*refLM.prob(termID).
            //              Expanding this gives
            //              [(1-lambda)*termFreq + (mu + lambda*docLength)*refLM.prob(termID)]
            //              / (docLength + mu).
            // BACKOFF:     termFreq*(1-lambda)/(docLength + mu) (termID is not used).
            // Any other strategy value throws lemur::api::Exception.
00316       virtual double seenProb(double termFreq, lemur::api::TERMID_T termID) const {
00317         if (strategy == SimpleKLParameter::INTERPOLATE) {      
00318           return ((1-lambda)*(termFreq+mu*refLM.prob(termID))/
00319                   (double)(docLength+mu) + lambda*refLM.prob(termID));
00320         } else if (strategy == SimpleKLParameter::BACKOFF) {
00321           return (termFreq*(1-lambda)/(double)(docLength+mu));
00322         } else {
00323           throw lemur::api::Exception("TwoStageDocModel", "Unknown smoothing strategy");
00324         }
00325       }
00326     private:
            // First-stage prior word count.
00327       double mu;
            // Second-stage weight on the collection language model.
00328       double lambda;
00329     };
00330   }
00331 }
00332 
00333 #endif /* _SIMPLEKLDOCMODEL_HPP */

Generated on Tue Jun 15 11:02:55 2010 for Lemur by doxygen 1.3.4