00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _XLINGRETMETHOD_HPP
00013 #define _XLINGRETMETHOD_HPP
00014
00015 #include "common_headers.hpp"
00016 #include <cmath>
00017 #include <vector>
00018 #include <algorithm>
00019 #include "IndexTypes.hpp"
00020 #include "FreqVector.hpp"
00021 #include "UnigramLM.hpp"
00022 #include "ScoreFunction.hpp"
00023 #include "XLingDocModel.hpp"
00024 #include "TextQueryRep.hpp"
00025 #include "TextQueryRetMethod.hpp"
00026 #include "Counter.hpp"
00027 #include "DocUnigramCounter.hpp"
00028 #include "PDict.hpp"
00029 #include "TextHandlerManager.hpp"
00030
00031 namespace lemur
00032 {
00033 namespace retrieval
00034 {
00036 class XLQueryTerm : public lemur::api::QueryTerm {
00037 public:
00038 XLQueryTerm(lemur::api::TERMID_T tid, double wt, const char *term, double pge,
00039 lemur::dictionary::PDict &dic, lemur::api::Stemmer *stm = NULL) :
00040 lemur::api::QueryTerm(tid, wt), source(term), p_s_ge(pge), dict(dic),
00041 stemmer(stm) {
00042 }
00043
00044 XLQueryTerm(const char *term, lemur::dictionary::PDict &dic, lemur::api::Stemmer *stm = NULL) :
00045 lemur::api::QueryTerm(0, 0), source(term), p_s_ge(0), dict(dic), stemmer(stm) {
00046 }
00047
00048 XLQueryTerm(const XLQueryTerm &other) : lemur::api::QueryTerm(0,0), dict(other.dict) {
00049 ti = other.ti;
00050 w = other.w;
00051 p_s_ge = other.p_s_ge;
00052 source = other.source;
00053 stemmer = other.stemmer;
00054 }
00055
00056
00057 virtual ~XLQueryTerm() { }
00058
00060 const string &getSource() const {return source;}
00061
00063 const double getP_s_GE() const {
00064 return p_s_ge;
00065 }
00066
00068 void setWeight(double wt) {
00069 w = wt;
00070 }
00071
00073 void incWeight(double wt) {
00074 w += wt;
00075 }
00077 virtual bool operator==(const XLQueryTerm& other) const {
00078 return (other.source == source);
00079 }
00080
00082 virtual XLQueryTerm& operator=(const XLQueryTerm& other) {
00083 ti = other.ti;
00084 w = other.w;
00085 p_s_ge = other.p_s_ge;
00086 source = other.source;
00087 dict = other.dict;
00088 stemmer = other.stemmer;
00089 return (*this);
00090 }
00095 lemur::dictionary::DictEntryVector *getTranslations() const {
00096 lemur::dictionary::DictEntryVector *xlates = dict.getTranslations(source);
00097
00098 if (xlates == NULL && stemmer != NULL) {
00099
00100 char tmpTerm[512];
00101 strcpy(tmpTerm, source.c_str());
00102 string stem = stemmer->stemWord(tmpTerm);
00103 cerr << "getTranslations: stemming " << source << " to " << stem
00104 << endl;
00105 xlates = dict.getTranslations(stem);
00106 }
00107 return xlates;
00108 }
00109 private:
00110 string source;
00111 double p_s_ge;
00112 lemur::dictionary::PDict &dict;
00113 lemur::api::Stemmer *stemmer;
00114 };
00115
00116
00118 class XLingQueryModel : public lemur::api::QueryRep {
00119 public:
00128 XLingQueryModel(const lemur::api::TermQuery &qry,
00129 const lemur::api::Index &source,
00130 bool dbS, double numSource,
00131 lemur::dictionary::PDict &dict,
00132 const lemur::api::Stopper *stp = NULL,
00133 lemur::api::Stemmer *stm = NULL) {
00134
00135
00136 double pge;
00137 numTerms = 0;
00138
00139 qry.startTermIteration();
00140 while (qry.hasMore()) {
00141 const lemur::api::Term *t = qry.nextTerm();
00142
00143 if (stp == NULL || !(stp->stopWord(t->spelling()))) {
00144 numTerms++;
00145 XLQueryTerm st(t->spelling(), dict, stm);
00146 iter = find(qTerms.begin(), qTerms.end(), st);
00147 if (iter != qTerms.end()) {
00148
00149 (*iter).incWeight(1);
00150 } else {
00151
00152 lemur::api::TERMID_T ti = source.term(t->spelling());
00153 if (ti>0) {
00154
00155 if (dbS) {
00156 pge = source.docCount(ti)/numSource;
00157 } else {
00158 pge = (source.termCount(ti)/numSource);
00159 }
00160 } else {
00161
00162
00163
00164 pge = (0.000001*0.000001);
00165 }
00166 XLQueryTerm newTerm(ti, 1, t->spelling(), pge, dict, stm);
00167 qTerms.push_back(newTerm);
00168 }
00169 } else {
00170 cerr << "XLingQueryModel: " << t->spelling()
00171 << " on stoplist, ignoring" << endl;
00172 }
00173
00174 }
00175 }
00176
00177 virtual ~XLingQueryModel() {
00178 }
00179
00181 virtual void startIteration() const {
00182 iter = qTerms.begin();
00183 }
00185 virtual bool hasMore() const {
00186 return (iter != qTerms.end());
00187 }
00189
00190 virtual XLQueryTerm &nextTerm() const {
00191 return (*iter++);
00192 }
00193 virtual int getNumTerms() const {return numTerms;}
00194
00195 private:
00196 mutable vector<XLQueryTerm> qTerms;
00197 mutable vector<XLQueryTerm>::iterator iter;
00198 int numTerms;
00199 };
00200
00201
00202
00203
00211
00212 class XLingRetMethod : public lemur::api::RetrievalMethod {
00213 public:
00214
00227 XLingRetMethod(const lemur::api::Index &dbIndex,
00228 const lemur::api::Index &background,
00229 lemur::dictionary::PDict &dict,
00230 lemur::api::ScoreAccumulator &accumulator,
00231 double l, double b, bool cacheDR,
00232 string &sBM, string &tBM,
00233 const lemur::api::Stopper *stp = NULL,
00234 lemur::api::Stemmer *stm = NULL);
00236 virtual ~XLingRetMethod();
00237
00241 virtual lemur::api::DocumentRep *computeDocRep(lemur::api::DOCID_T docID);
00242
00249 virtual double matchedTermWeight(lemur::api::TERMID_T id, double weight,
00250 const lemur::api::DocInfo *info,
00251 const lemur::api::DocumentRep *dRep) const {
00252 double d = dRep->termWeight(id,info);
00253 double score = d * weight;
00254 return score;
00255 }
00256
00261 virtual double adjustedScore(double origScore, double pge) const {
00262 return (log((lambda * origScore) + ((1 - lambda) * pge)));
00263 }
00264
00265 virtual void scoreCollection(const lemur::api::QueryRep &qry,
00266 lemur::api::IndexedRealVector &results){
00267 scoreInvertedIndex(qry, results);
00268 }
00269
00270 virtual void scoreInvertedIndex(const lemur::api::QueryRep &qryRep,
00271 lemur::api::IndexedRealVector &scores,
00272 bool scoreAll = false);
00273
00274 virtual lemur::api::QueryRep *computeQueryRep(const lemur::api::Query &qry) {
00275 if (const lemur::api::TermQuery *q = dynamic_cast<const lemur::api::TermQuery *>(&qry))
00276 return (new XLingQueryModel(*q, source, docBasedSourceSmooth, numSource,
00277 dictionary, stopper, stemmer));
00278 else LEMUR_THROW(LEMUR_RUNTIME_ERROR, "XLingRetMethod expects query of type TermQuery");
00279 }
00280
00281 virtual lemur::api::QueryRep *computeTargetKLRep(const lemur::api::QueryRep *qry);
00282
00284 virtual double scoreDoc(const lemur::api::QueryRep &qry, lemur::api::DOCID_T docID);
00285
00287 virtual void updateQuery(lemur::api::QueryRep &qryRep, const lemur::api::DocIDSet &relDocs) {}
00288
00289 protected:
00290 virtual double scoreDocVector(const XLingQueryModel &qRep, lemur::api::DOCID_T docID,
00291 lemur::utility::FreqVector &docVector);
00292
00293 double lambda;
00294 double beta;
00295 double numSource;
00296 double numTarget;
00297 bool docBasedSourceSmooth;
00298 bool docBasedTargetSmooth;
00299 lemur::api::ScoreAccumulator &scAcc;
00300 lemur::dictionary::PDict &dictionary;
00301 lemur::api::Stemmer *stemmer;
00302 const lemur::api::Stopper *stopper;
00303 const lemur::api::Index &source;
00305 lemur::api::DocumentRep **docReps;
00307 bool cacheDocReps;
00309 int docRepsSize;
00310 lemur::api::ScoreAccumulator *termScores;
00311 };
00312 }
00313 }
00314 #endif