00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #ifndef _TFIDFRETMETHOD_HPP
00015 #define _TFIDFRETMETHOD_HPP
00016
00017 #include "TextQueryRetMethod.hpp"
00018
/// Parameter types and default values used by the TF-IDF retrieval classes.
namespace TFIDFParameter {

  /// Identifies which TF weighting scheme is selected.
  enum TFMethod {
    RAWTF = 0,
    LOGTF = 1,
    BM25  = 2
  };

  /// TF weighting configuration: the selected method plus two BM25 constants.
  struct WeightParam {
    TFMethod tf;      ///< selected TF weighting scheme
    double   bm25K1;  ///< BM25 k1 constant (how it is consumed is outside this header)
    double   bm25B;   ///< BM25 b constant (how it is consumed is outside this header)
  };

  /// Feedback configuration.
  struct FeedbackParam {
    int    howManyTerms;  ///< term count used for feedback (exact use is outside this header)
    double posCoeff;      ///< positive coefficient used for feedback (exact use is outside this header)
  };

  // Default settings.
  // NOTE(review): these are non-const objects with internal linkage, so each
  // translation unit that includes this header gets its own private copy.
  static double defaultDocK1        = 1;
  static double defaultDocB         = 0.5;
  static double defaultQryK1        = 1;
  static double defaultQryB         = 0;
  static int    defaultHowManyTerms = 50;
  static double defaultPosCoeff     = 0.5;

}
00039 namespace lemur
00040 {
00041 namespace retrieval
00042 {
00043
00045 class TFIDFQueryRep : public ArrayQueryRep {
00046 public:
00047 TFIDFQueryRep(const lemur::api::TermQuery &qry, const lemur::api::Index &dbIndex, double *idfValue, TFIDFParameter::WeightParam ¶m);
00048
00049 virtual ~TFIDFQueryRep() {}
00050
00051 double queryTFWeight(const double rawTF) const;
00052 protected:
00053 TFIDFParameter::WeightParam &prm;
00054 double *idf;
00055 const lemur::api::Index &ind;
00056 };
00057
00059 class TFIDFDocRep : public lemur::api::DocumentRep {
00060 public:
00061 TFIDFDocRep(lemur::api::DOCID_T docID, const lemur::api::Index &dbIndex, double *idfValue,
00062 TFIDFParameter::WeightParam ¶m) :
00063 lemur::api::DocumentRep(docID, dbIndex.docLength(docID)), ind(dbIndex), prm(param), idf(idfValue) {
00064 }
00065 virtual ~TFIDFDocRep() { }
00066 virtual double termWeight(lemur::api::TERMID_T termID, const lemur::api::DocInfo *info) const{
00067 return (idf[termID]*docTFWeight(info->termCount()));
00068 }
00069 virtual double scoreConstant() const { return 0;}
00070
00071 double docTFWeight(const double rawTF) const;
00072 private:
00073
00074 const lemur::api::Index & ind;
00075 TFIDFParameter::WeightParam &prm;
00076 double *idf;
00077 };
00078
00079
00081
00082 class TFIDFRetMethod : public lemur::api::TextQueryRetMethod {
00083 public:
00084
00085 TFIDFRetMethod(const lemur::api::Index &dbIndex, lemur::api::ScoreAccumulator &accumulator);
00086 virtual ~TFIDFRetMethod() {delete [] idfV; delete scFunc;}
00087
00088 virtual lemur::api::TextQueryRep *computeTextQueryRep(const lemur::api::TermQuery &qry) {
00089 return (new TFIDFQueryRep(qry, ind, idfV, qryTFParam));
00090 }
00091
00092 virtual lemur::api::DocumentRep *computeDocRep(lemur::api::DOCID_T docID) {
00093 return (new TFIDFDocRep(docID, ind, idfV, docTFParam));
00094 }
00095 virtual lemur::api::ScoreFunction *scoreFunc() {
00096 return (scFunc);
00097 }
00098
00099
00100 virtual void updateTextQuery(lemur::api::TextQueryRep &qryRep,
00101 const lemur::api::DocIDSet &relDocs);
00102
00103 void setDocTFParam(TFIDFParameter::WeightParam &docTFWeightParam);
00104
00105 void setQueryTFParam(TFIDFParameter::WeightParam &queryTFWeightParam);
00106
00107 void setFeedbackParam(TFIDFParameter::FeedbackParam &feedbackParam);
00108
00109 static double BM25TF(const double rawTF, const double k1, const double b,
00110 const double docLen, const double avgDocLen);
00111
00112 protected:
00113 double *idfV;
00114 lemur::api::ScoreFunction *scFunc;
00115
00117
00118
00119 TFIDFParameter::WeightParam qryTFParam;
00120 TFIDFParameter::WeightParam docTFParam;
00121 TFIDFParameter::FeedbackParam fbParam;
00122
00124
00125 };
00126
00127
00128 inline void TFIDFRetMethod::setDocTFParam(TFIDFParameter::WeightParam &docTFWeightParam)
00129 {
00130 docTFParam = docTFWeightParam;
00131 }
00132
00133
00134
00135 inline void TFIDFRetMethod::setQueryTFParam(TFIDFParameter::WeightParam &queryTFWeightParam)
00136 {
00137 qryTFParam = queryTFWeightParam;
00138 }
00139
00140
00141 inline void TFIDFRetMethod::setFeedbackParam(TFIDFParameter::FeedbackParam &feedbackParam)
00142 {
00143 fbParam = feedbackParam;
00144 }
00145
00146
00147
00148 inline double TFIDFRetMethod ::BM25TF(const double rawTF, const double k1, const double b,
00149 const double docLen, const double avgDocLen)
00150 {
00151 double x= rawTF+k1*(1-b+b*docLen/avgDocLen);
00152 return (k1*rawTF/x);
00153 }
00154
00155 }
00156 }
00157
00158
00159 #endif