00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #ifndef _BASICSUMM_HPP
00012 #define _BASICSUMM_HPP
00013
00014 #include <iomanip>
00015 #include "lemur-compat.hpp"
00016 #include "Summarizer.hpp"
00017 #include "Passage.hpp"
00018 #include "BasicPassage.hpp"
00019 #include "Index.hpp"
00020 #include <algorithm>
00021 #include <vector>
00022 #include <cmath>
00023 using std::vector;
00024
00025 namespace lemur
00026 {
00027 namespace summarization
00028 {
00033 class BasicSumm : public Summarizer {
00034
00035 private:
00036 const lemur::api::Index* idx;
00037 int summLen;
00038 vector<BasicPassage> doc;
00039 mutable int iterCount;
00040
00041 public:
00043 BasicSumm(const lemur::api::Index* inIdx, int inSummLen = 5) :
00044 idx(inIdx), summLen(inSummLen), iterCount(1) {};
00045
00046 virtual void summDocument(const string &docID, const int optLen,
00047 const string &qInfo);
00048
00049 virtual void scorePassages(const string &qInfo);
00050
00051 virtual void markPassages(int optLen, const string &qInfo);
00052
00053 virtual void addPassage(Passage &psg);
00054
00055 virtual void clear(void);
00056
00057 virtual int fetchPassages(Passage* psgs, int optLen) const;
00058
00059 virtual int nextPassage(Passage* psg) const;
00060
00061 virtual void iterClear(void) const ;
00062
00063 virtual void outputSumm(void) const ;
00064
00066 int isEOS(const string &check) {
00067 return (check == EOS);
00068 }
00069
00071 int hasEOS(const lemur::api::Index* idx,
00072 const lemur::api::TermInfoList* tList) {
00073 tList->startIteration();
00074 lemur::api::TermInfo* tEntry;
00075 while (tList->hasMore()) {
00076 tEntry = tList->nextEntry();
00077 if ( isEOS(idx->term(tEntry->termID())) ) return true;
00078 }
00079 return false;
00080 }
00081
00083 double scorePassage(BasicPassage &psg, const string &qInfo) {
00084 const string &docID = psg.docID;
00085 passageVec psgV= *psg.getAsVector();
00086 double psgLen = psgV.size();
00087 double P = 1;
00088 double M = 1.5;
00089 double endScore, Tf, tf, idf, docLen, avgDocLen;
00090 endScore = 0.0;
00091 for (int i=0; i < psgLen; i++) {
00092 docLen = idx->docLength(idx->document(docID));
00093 avgDocLen = idx->docLengthAvg();
00094 tf = psgV[i].tf;
00095 Tf = tf / (tf + 0.5 + 1.5 * (docLen/avgDocLen) );
00096 idf = lemur_compat::min(M,
00097 log((double)idx->docCount()/
00098 (double)idx->docCount(psgV[i].termID)));
00099 endScore += (Tf * idf * P);
00100 }
00101 endScore = endScore / 1+psgLen;
00102 psg.score = endScore;
00103 return endScore;
00104 }
00105
00107 void findNextPassage(BasicPassage &psg,
00108 const lemur::api::Index* idx,
00109 const lemur::api::TermInfoList* tList, int eos) {
00110 lemur::api::TermInfo* tEntry;
00111 psg.clear();
00112
00113
00114
00115
00116 termCount storage;
00117 if (eos) {
00118 while (tList->hasMore()) {
00119 tEntry = tList->nextEntry();
00120 if ( isEOS(idx->term(tEntry->termID())) ) return;
00121
00122
00123
00124
00125 storage.termID = tEntry->termID();
00126 storage.tf = tEntry->count();
00127 psg.addTerm(storage);
00128 }
00129 } else {
00130 for(int i=0; i < PSG_LEN; i++) {
00131 if (tList->hasMore()) {
00132 tEntry = tList->nextEntry();
00133
00134
00135
00136
00137 storage.termID = tEntry->termID();
00138 storage.tf = tEntry->count();
00139 psg.addTerm(storage);
00140 } else {
00141 return;
00142 }
00143 }
00144 }
00145 return;
00146 }
00147
00149 void showPassage(const passageVec* psg,
00150 const lemur::api::Index* idx) const {
00151 for (int i=0; i < psg->size(); i++) {
00152 cout << idx->term((*psg)[i].termID) << " ";
00153 }
00154 }
00155
00157 void showMarkedPassages() const {
00158
00159 for (int i=0; i<doc.size(); i++) {
00160 if (doc[i].marked > 0) {
00161 showPassage(doc[i].getAsVector(), idx);
00162 cout << endl;
00163 }
00164 }
00165 }
00166
00167 };
00168 }
00169 }
00170
00171 #endif