Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

XLingRetMethod.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  *
00003  *  Original source copyright (c) 2001, Carnegie Mellon University.
00004  *  See copyright.cmu for details.
00005  *  Modifications copyright (c) 2002, University of Massachusetts.
00006  *  See copyright.umass for details.
00007  *
00008  *==========================================================================
00009  */
00010 
00011 
00012 #ifndef _XLINGRETMETHOD_HPP
00013 #define _XLINGRETMETHOD_HPP
00014 
00015 #include "common_headers.hpp"
00016 #include <cmath>
00017 #include <vector>
00018 #include <algorithm>
00019 #include "IndexTypes.hpp"
00020 #include "FreqVector.hpp"
00021 #include "UnigramLM.hpp"
00022 #include "ScoreFunction.hpp"
00023 #include "XLingDocModel.hpp"
00024 #include "TextQueryRep.hpp"
00025 #include "TextQueryRetMethod.hpp"
00026 #include "Counter.hpp"
00027 #include "DocUnigramCounter.hpp"
00028 #include "PDict.hpp"
00029 #include "TextHandlerManager.hpp"
00030 
00031 namespace lemur 
00032 {
00033   namespace retrieval 
00034   {
00036     class XLQueryTerm : public lemur::api::QueryTerm {
00037     public:
00038       XLQueryTerm(lemur::api::TERMID_T tid, double  wt, const char *term, double pge,
00039                   lemur::dictionary::PDict &dic, lemur::api::Stemmer *stm = NULL) :
00040         lemur::api::QueryTerm(tid, wt), source(term), p_s_ge(pge), dict(dic),
00041         stemmer(stm) {
00042       }
00043 
00044       XLQueryTerm(const char *term, lemur::dictionary::PDict &dic, lemur::api::Stemmer *stm = NULL) : 
00045         lemur::api::QueryTerm(0, 0), source(term), p_s_ge(0), dict(dic), stemmer(stm) {
00046       }
00047 
00048       XLQueryTerm(const XLQueryTerm &other) : lemur::api::QueryTerm(0,0), dict(other.dict) {
00049         ti = other.ti;
00050         w = other.w;
00051         p_s_ge = other.p_s_ge;
00052         source = other.source;
00053         stemmer = other.stemmer;
00054       }
00055 
00056 
00057       virtual ~XLQueryTerm() { }
00058 
00060       const string &getSource() const {return source;}
00061 
00063       const double getP_s_GE() const {
00064         return p_s_ge;
00065       }
00066   
00068       void setWeight(double wt) {
00069         w = wt;
00070       }
00071 
00073       void incWeight(double wt) {
00074         w += wt;
00075       }
00077       virtual bool operator==(const XLQueryTerm& other) const {
00078         return (other.source == source);
00079       }
00080 
00082       virtual XLQueryTerm& operator=(const XLQueryTerm& other)  {
00083         ti = other.ti;
00084         w = other.w;
00085         p_s_ge = other.p_s_ge;
00086         source = other.source;
00087         dict = other.dict;
00088         stemmer = other.stemmer;
00089         return (*this);
00090       }
00095       lemur::dictionary::DictEntryVector *getTranslations() const {
00096         lemur::dictionary::DictEntryVector *xlates = dict.getTranslations(source);
00097         // If no xlates, Leah's version stems the term and tries again.
00098         if (xlates == NULL && stemmer != NULL) {
00099           // porter stemmer is destructive
00100           char tmpTerm[512];
00101           strcpy(tmpTerm, source.c_str());
00102           string stem = stemmer->stemWord(tmpTerm);
00103           cerr << "getTranslations: stemming " << source << " to " << stem 
00104                << endl;
00105           xlates = dict.getTranslations(stem);
00106         }
00107         return xlates;
00108       }
00109     private:
00110       string source;
00111       double p_s_ge;
00112       lemur::dictionary::PDict &dict;  
00113       lemur::api::Stemmer *stemmer;
00114     };
00115 
00116 
00118     class XLingQueryModel : public lemur::api::QueryRep {
00119     public:
00128       XLingQueryModel(const lemur::api::TermQuery &qry, 
00129                       const lemur::api::Index &source, 
00130                       bool dbS, double numSource,
00131                       lemur::dictionary::PDict &dict, 
00132                       const lemur::api::Stopper *stp = NULL, 
00133                       lemur::api::Stemmer *stm = NULL) {
00134         // fill in weighted terms
00135         // P(e|GE)
00136         double pge;
00137         numTerms = 0;
00138     
00139         qry.startTermIteration();
00140         while (qry.hasMore()) {
00141           const lemur::api::Term *t = qry.nextTerm();
00142           // if Stopper is not NULL, test for stopwords.
00143           if (stp == NULL || !(stp->stopWord(t->spelling()))) {
00144             numTerms++;
00145             XLQueryTerm st(t->spelling(), dict, stm);
00146             iter = find(qTerms.begin(), qTerms.end(), st);
00147             if (iter != qTerms.end()) {
00148               // found it, bump count
00149               (*iter).incWeight(1);
00150             } else {
00151               // new term
00152               lemur::api::TERMID_T ti = source.term(t->spelling());
00153               if (ti>0) {
00154                 // pge
00155                 if (dbS) {
00156                   pge = source.docCount(ti)/numSource;
00157                 } else {
00158                   pge = (source.termCount(ti)/numSource);      
00159                 }
00160               } else {
00161                 // OOV, use default pge
00162                 // perhaps this would be better estimated with:
00163                 //        pge = 1/(numSource + 1);
00164                 pge = (0.000001*0.000001);
00165               }
00166               XLQueryTerm newTerm(ti, 1, t->spelling(), pge, dict, stm);
00167               qTerms.push_back(newTerm);
00168             }
00169           } else {
00170             cerr << "XLingQueryModel: " << t->spelling() 
00171                  << " on stoplist, ignoring" << endl;
00172           }
00173       
00174         }
00175       }
00176   
00177       virtual ~XLingQueryModel() {
00178       }
00179 
00181       virtual void startIteration() const {
00182         iter = qTerms.begin();
00183       }
00185       virtual bool hasMore() const {
00186         return (iter != qTerms.end());
00187       }
00189       //  virtual XLQueryTerm &nextTerm() {
00190       virtual XLQueryTerm &nextTerm() const {
00191         return (*iter++);
00192       }
00193       virtual int getNumTerms() const {return numTerms;}
00194   
00195     private:
00196       mutable vector<XLQueryTerm> qTerms;
00197       mutable vector<XLQueryTerm>::iterator iter;
00198       int numTerms;
00199     };
00200 
00201     // Should not really be a TextQueryRetMethod, as it does not score
00202     // in a like fashion, but it does take advantage of the cached doc reps.
00203     //
00211     //class XLingRetMethod : public TextQueryRetMethod {
00212     class XLingRetMethod : public lemur::api::RetrievalMethod {
00213     public:
00214 
00227       XLingRetMethod(const lemur::api::Index &dbIndex, 
00228                      const lemur::api::Index &background, 
00229                      lemur::dictionary::PDict &dict, 
00230                      lemur::api::ScoreAccumulator &accumulator, 
00231                      double l, double b, bool cacheDR,
00232                      string &sBM, string &tBM, 
00233                      const lemur::api::Stopper *stp = NULL, 
00234                      lemur::api::Stemmer *stm = NULL);
00236       virtual ~XLingRetMethod();
00237   
00241       virtual lemur::api::DocumentRep *computeDocRep(lemur::api::DOCID_T docID);
00242 
00249       virtual double matchedTermWeight(lemur::api::TERMID_T id, double weight,
00250                                        const lemur::api::DocInfo *info, 
00251                                        const lemur::api::DocumentRep *dRep) const { 
00252         double d = dRep->termWeight(id,info); //P(a|D)
00253         double score = d * weight; //P(a|D) * P(e|a)
00254         return score;
00255       }
00256 
00261       virtual double adjustedScore(double origScore, double pge) const {
00262         return (log((lambda * origScore) + ((1 - lambda) * pge)));
00263       }
00264 
00265       virtual void scoreCollection(const lemur::api::QueryRep &qry, 
00266                                    lemur::api::IndexedRealVector &results){
00267         scoreInvertedIndex(qry, results);
00268       }
00269       // Override (have to do individual doc ones too.
00270       virtual void scoreInvertedIndex(const lemur::api::QueryRep &qryRep, 
00271                                       lemur::api::IndexedRealVector &scores, 
00272                                       bool scoreAll = false);
00273 
00274       virtual lemur::api::QueryRep *computeQueryRep(const lemur::api::Query &qry) {
00275         if (const lemur::api::TermQuery *q = dynamic_cast<const lemur::api::TermQuery *>(&qry))
00276           return (new XLingQueryModel(*q, source, docBasedSourceSmooth, numSource,
00277                                       dictionary, stopper, stemmer));
00278         else LEMUR_THROW(LEMUR_RUNTIME_ERROR, "XLingRetMethod expects query of type TermQuery");
00279       } 
00280 
00281       virtual lemur::api::QueryRep *computeTargetKLRep(const lemur::api::QueryRep *qry);
00282 
00284       virtual double scoreDoc(const lemur::api::QueryRep &qry, lemur::api::DOCID_T docID);
00285 
00287       virtual void updateQuery(lemur::api::QueryRep &qryRep, const lemur::api::DocIDSet &relDocs) {}
00288 
00289     protected:
00290       virtual double scoreDocVector(const XLingQueryModel &qRep, lemur::api::DOCID_T docID, 
00291                                     lemur::utility::FreqVector &docVector);
00292 
00293       double lambda;
00294       double beta;
00295       double numSource;
00296       double numTarget;
00297       bool docBasedSourceSmooth;
00298       bool docBasedTargetSmooth;
00299       lemur::api::ScoreAccumulator &scAcc; // this does not need to be passed in. Bleah.
00300       lemur::dictionary::PDict &dictionary;
00301       lemur::api::Stemmer *stemmer; // source language
00302       const lemur::api::Stopper *stopper; // source language
00303       const lemur::api::Index &source;
00305       lemur::api::DocumentRep **docReps;
00307       bool cacheDocReps;
00309       int docRepsSize;
00310       lemur::api::ScoreAccumulator *termScores;
00311     };
00312   }
00313 }
00314 #endif /* _XLINGRETMETHOD_HPP */

Generated on Tue Jun 15 11:02:56 2010 for Lemur by doxygen 1.3.4