00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <string>
00022 #include <vector>
00023 #include "indri/TermFieldStatistics.hpp"
00024 #include "indri/TermScoreFunction.hpp"
00025 #include "indri/TermScoreFunctionFactory.hpp"
00026 #include "indri/HashTable.hpp"
00027 #include "indri/greedy_vector"
00028 #include "indri/QueryEnvironment.hpp"
00029
00030 namespace indri {
00031 namespace query {
00032 class RelevanceModel {
00033 public:
00034 struct Gram {
00035 std::vector<std::string> terms;
00036 double weight;
00037
00038 struct hash {
00039 int operator() ( const Gram* one ) const {
00040 indri::utility::GenericHash<const char*> h;
00041 int accumulator = 0;
00042
00043 for( size_t i=0; i<one->terms.size(); i++ ) {
00044 accumulator *= 7;
00045 accumulator += h( one->terms[i].c_str() );
00046 }
00047
00048 return accumulator;
00049 }
00050 };
00051
00052 struct weight_greater {
00053 bool operator() ( const Gram* o, const Gram* t ) const {
00054 return t->weight < o->weight;
00055 }
00056 };
00057
00058 struct string_comparator {
00059 int operator() ( const Gram* o, const Gram* t ) const {
00060 const Gram& one = *o;
00061 const Gram& two = *t;
00062
00063 if( one.terms.size() != two.terms.size() ) {
00064 if( one.terms.size() < two.terms.size() ) {
00065 return 1;
00066 } else {
00067 return -1;
00068 }
00069 }
00070
00071 for( size_t i=0; i<one.terms.size(); i++ ) {
00072 const std::string& oneString = one.terms[i];
00073 const std::string& twoString = two.terms[i];
00074
00075 if( oneString != twoString ) {
00076 if( oneString < twoString )
00077 return -1;
00078 else
00079 return 1;
00080 }
00081 }
00082
00083 return 0;
00084 }
00085 };
00086 };
00087
00088 private:
00089 struct GramCounts {
00090 Gram gram;
00091 indri::utility::greedy_vector< std::pair< int, int > > counts;
00092 };
00093
00094 indri::api::QueryEnvironment& _environment;
00095 int _maxGrams;
00096 std::string _smoothing;
00097 int _documents;
00098
00099 typedef indri::utility::HashTable< Gram*, GramCounts*, Gram::hash, Gram::string_comparator > HGram;
00100 HGram _gramTable;
00101
00102 std::vector<indri::api::ScoredExtentResult> _results;
00103 std::vector<lemur::api::DOCID_T> _documentIDs;
00104 std::vector<Gram*> _grams;
00105 std::vector<indri::api::DocumentVector*> _vectors;
00106
00107 void _countGrams();
00108 void _scoreGrams();
00109 void _sortGrams();
00110 void _extractDocuments();
00111
00112 public:
00113 RelevanceModel( indri::api::QueryEnvironment& environment,
00114 const std::string& smoothing,\
00115 int maxGrams,
00116 int documents );
00117 ~RelevanceModel();
00118
00119 void generate( const std::string& query );
00120
00121 void generate( const std::string &query , const std::vector<indri::api::ScoredExtentResult>& results );
00122 const std::vector<indri::api::ScoredExtentResult>& getQueryResults() const;
00123 const std::vector<Gram*>& getGrams() const;
00124 };
00125 }
00126 }