00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #ifndef _SIMPLEKLDOCMODEL_HPP
00012 #define _SIMPLEKLDOCMODEL_HPP
00013
00014 #include "DocumentRep.hpp"
00015 #include "Index.hpp"
00016 #include "UnigramLM.hpp"
00017
00018
/// Parameter types and default values shared by the simple KL-divergence
/// retrieval code.
namespace SimpleKLParameter {

  /// Selects which document-model smoothing method is used.
  enum SmoothMethod {
    JELINEKMERCER = 0,
    DIRICHLETPRIOR = 1,
    ABSOLUTEDISCOUNT = 2,
    TWOSTAGE = 3
  };

  /// Selects how the smoothed estimate is combined with the reference model.
  enum SmoothStrategy {
    INTERPOLATE = 0,
    BACKOFF = 1
  };

  /// Selects the query-model (feedback) update method.
  enum QueryUpdateMethod {
    MIXTURE = 0,
    DIVMIN = 1,
    MARKOVCHAIN = 2,
    RM1 = 3,
    RM2 = 4
  };

  /// Selects how the final score is adjusted. Note the values start at 1.
  enum adjustedScoreMethods {
    QUERYLIKELIHOOD = 1,
    CROSSENTROPY = 2,
    NEGATIVEKLD = 3
  };

  /// Bundle of document smoothing settings.
  struct DocSmoothParam {
    /// smoothing method
    SmoothMethod smthMethod;
    /// smoothing strategy
    SmoothStrategy smthStrategy;
    /// presumably the discount (delta) for ABSOLUTEDISCOUNT -- confirm with users
    double ADDelta;
    /// presumably the collection weight (lambda) for JELINEKMERCER -- confirm with users
    double JMLambda;
    /// presumably the prior (mu) for DIRICHLETPRIOR -- confirm with users
    double DirPrior;
  };

  // Default document smoothing settings. These are non-const statics defined
  // in a header, so every translation unit that includes it has its own copy.
  static SmoothMethod defaultSmoothMethod = DIRICHLETPRIOR;
  static SmoothStrategy defaultSmoothStrategy = INTERPOLATE;
  static double defaultADDelta = 0.7;
  static double defaultJMLambda = 0.5;
  static double defaultDirPrior = 1000;

  /// Bundle of query-model and feedback settings.
  struct QueryModelParam {
    /// score adjustment method
    adjustedScoreMethods adjScoreMethod;
    /// query noise value (exact use is defined by the consumer of this struct)
    double qryNoise;

    /// feedback / query update method
    QueryUpdateMethod fbMethod;
    /// feedback coefficient
    double fbCoeff;
    /// feedback term count
    int fbTermCount;
    /// feedback probability threshold
    double fbPrTh;
    /// feedback probability-sum threshold
    double fbPrSumTh;
    /// noise value for the mixture feedback method
    double fbMixtureNoise;
    /// number of EM iterations
    int emIterations;
  };

  // Default query-model / feedback settings (per translation unit, see above).
  static QueryUpdateMethod defaultFBMethod = MIXTURE;
  static double defaultFBCoeff = 0.5;
  static int defaultFBTermCount = 50;
  static double defaultFBPrTh = 0.001;
  static double defaultFBPrSumTh = 1;
  static double defaultFBMixNoise = 0.5;
  static int defaultEMIterations = 50;
  static double defaultQryNoise = 0;
}
00080
00081 namespace lemur
00082 {
00083 namespace retrieval
00084 {
00085
00086
00088
00101 class SimpleKLDocModel : public lemur::api::DocumentRep {
00102 public:
00103 SimpleKLDocModel(lemur::api::DOCID_T docID, const lemur::langmod::UnigramLM &collectLM,
00104 int dl = 1,
00105 const double *prMass = NULL,
00106 SimpleKLParameter::SmoothStrategy strat = SimpleKLParameter::INTERPOLATE) :
00107 lemur::api::DocumentRep(docID, dl),
00108 refLM(collectLM), docPrMass(prMass), strategy(strat) {
00109 };
00110
00111 ~SimpleKLDocModel() {};
00112
00114 virtual double termWeight(lemur::api::TERMID_T termID, const lemur::api::DocInfo *info) const {
00115 double sp = seenProb(info->termCount(), termID);
00116 double usp = unseenCoeff();
00117 double ref = refLM.prob(termID);
00118 double score = sp/(usp*ref);
00119
00120
00121
00122
00123 return score;
00124 }
00125
00127 virtual double scoreConstant() const {
00128 return unseenCoeff();
00129 }
00130
00132 virtual double unseenCoeff() const =0;
00134 virtual double seenProb(double termFreq, lemur::api::TERMID_T termID) const =0;
00135
00136 protected:
00137 const lemur::langmod::UnigramLM &refLM;
00138 const double *docPrMass;
00139 SimpleKLParameter::SmoothStrategy strategy;
00140 };
00141
00142
00143
00145
00153 class JelinekMercerDocModel : public SimpleKLDocModel {
00154 public:
00155 JelinekMercerDocModel(lemur::api::DOCID_T docID,
00156 int dl,
00157 const lemur::langmod::UnigramLM &collectLM,
00158 const double *docProbMass,
00159 double collectLMWeight,
00160 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00161 SimpleKLDocModel(docID, collectLM, dl, docProbMass, smthStrategy),
00162 lambda(collectLMWeight) {
00163 };
00164
00165 virtual ~JelinekMercerDocModel() {};
00166
00167 virtual double unseenCoeff() const {
00168 if (strategy == SimpleKLParameter::INTERPOLATE) {
00169 return lambda;
00170 } else if (strategy==SimpleKLParameter::BACKOFF) {
00171 return lambda/(1-docPrMass[id]);
00172 } else {
00173 throw lemur::api::Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00174 }
00175 }
00176 virtual double seenProb(double termFreq, lemur::api::TERMID_T termID) const {
00177 if (strategy == SimpleKLParameter::INTERPOLATE) {
00178 return ((1-lambda)*termFreq/(double)docLength +
00179 lambda*refLM.prob(termID));
00180 } else if (strategy == SimpleKLParameter::BACKOFF) {
00181 return ((1-lambda)*termFreq/(double)docLength);
00182 } else {
00183 throw lemur::api::Exception("JelinekMercerDocModel", "Unknown smoothing strategy");
00184 }
00185 }
00186 private:
00187 double lambda;
00188 };
00189
00191
00196 class DirichletPriorDocModel : public SimpleKLDocModel {
00197 public:
00198 DirichletPriorDocModel(lemur::api::DOCID_T docID,
00199 int dl,
00200 const lemur::langmod::UnigramLM &collectLM,
00201 const double *docProbMass,
00202 double priorWordCount,
00203 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00204 SimpleKLDocModel(docID, collectLM, dl, docProbMass, smthStrategy),
00205 mu(priorWordCount) {
00206 };
00207
00208 virtual ~DirichletPriorDocModel() {};
00209
00210 virtual double unseenCoeff() const {
00211
00212 if (strategy == SimpleKLParameter::INTERPOLATE) {
00213 return mu/(mu+docLength);
00214 } else if (strategy==SimpleKLParameter::BACKOFF) {
00215 return (mu/((mu+docLength)*(1-docPrMass[id])));
00216 } else {
00217 throw lemur::api::Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00218 }
00219 }
00220
00221 virtual double seenProb(double termFreq, lemur::api::TERMID_T termID) const {
00222 if (strategy == SimpleKLParameter::INTERPOLATE) {
00223 return (termFreq+mu*refLM.prob(termID))/
00224 (double)(docLength+mu);
00225 } else if (strategy == SimpleKLParameter::BACKOFF) {
00226 return (termFreq/(double)(docLength+mu));
00227 } else {
00228 throw lemur::api::Exception("DirichletPriorDocModel", "Unknown smoothing strategy");
00229 }
00230 }
00231 private:
00232 double mu;
00233 };
00234
00236
00243 class AbsoluteDiscountDocModel : public SimpleKLDocModel {
00244 public:
00245 AbsoluteDiscountDocModel(lemur::api::DOCID_T docID,
00246 int dl,
00247 const lemur::langmod::UnigramLM &collectLM,
00248 const double *docProbMass,
00249 lemur::api::COUNT_T *uniqueTermCount,
00250 double discount,
00251 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00252 SimpleKLDocModel(docID, collectLM, dl, docProbMass, smthStrategy),
00253 uniqDocLen(uniqueTermCount),
00254 delta(discount) {
00255 };
00256
00257 virtual ~AbsoluteDiscountDocModel() {};
00258
00259 virtual double unseenCoeff() const {
00260
00261 if (strategy == SimpleKLParameter::INTERPOLATE) {
00262 return (delta*uniqDocLen[id]/(double)docLength);
00263 } else if (strategy==SimpleKLParameter::BACKOFF) {
00264 return (delta*uniqDocLen[id]/(docLength*(1-docPrMass[id])));
00265 } else {
00266 throw lemur::api::Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00267 }
00268 }
00269 virtual double seenProb(double termFreq, lemur::api::TERMID_T termID) const {
00270 if (strategy == SimpleKLParameter::INTERPOLATE) {
00271 return ((termFreq-delta)/(double)docLength+
00272 delta*uniqDocLen[id]*refLM.prob(termID)/(double)docLength);
00273 } else if (strategy == SimpleKLParameter::BACKOFF) {
00274 return ((termFreq-delta)/(double)docLength);
00275 } else {
00276 throw lemur::api::Exception("AbsoluteDiscountDocModel", "Unknown smoothing strategy");
00277 }
00278 }
00279 private:
00280 double *collectPr;
00281 lemur::api::COUNT_T *uniqDocLen;
00282 double delta;
00283 };
00284
00285
00287
00288
00289 class TwoStageDocModel : public SimpleKLDocModel {
00290 public:
00291 TwoStageDocModel(lemur::api::DOCID_T docID,
00292 int dl,
00293 const lemur::langmod::UnigramLM &collectLM,
00294 const double *docProbMass,
00295 double firstStageMu,
00296 double secondStageLambda,
00297 SimpleKLParameter::SmoothStrategy smthStrategy=SimpleKLParameter::INTERPOLATE):
00298 SimpleKLDocModel(docID, collectLM, dl, docProbMass, smthStrategy),
00299 mu(firstStageMu),
00300 lambda(secondStageLambda) {
00301 };
00302
00303 virtual ~TwoStageDocModel() {};
00304
00305 virtual double unseenCoeff() const {
00306
00307 if (strategy == SimpleKLParameter::INTERPOLATE) {
00308 return (mu+lambda*docLength)/(mu+docLength);
00309 } else if (strategy == SimpleKLParameter::BACKOFF) {
00310 return ((mu+lambda*docLength)/((mu+docLength)*(1-docPrMass[id])));
00311 } else {
00312 throw lemur::api::Exception("TwoStageDocModel", "Unknown smoothing strategy");
00313 }
00314 }
00315
00316 virtual double seenProb(double termFreq, lemur::api::TERMID_T termID) const {
00317 if (strategy == SimpleKLParameter::INTERPOLATE) {
00318 return ((1-lambda)*(termFreq+mu*refLM.prob(termID))/
00319 (double)(docLength+mu) + lambda*refLM.prob(termID));
00320 } else if (strategy == SimpleKLParameter::BACKOFF) {
00321 return (termFreq*(1-lambda)/(double)(docLength+mu));
00322 } else {
00323 throw lemur::api::Exception("TwoStageDocModel", "Unknown smoothing strategy");
00324 }
00325 }
00326 private:
00327 double mu;
00328 double lambda;
00329 };
00330 }
00331 }
00332
00333 #endif