00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef _LEMUR_KEYFILE_INCINDEX_HPP
00014 #define _LEMUR_KEYFILE_INCINDEX_HPP
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "common_headers.hpp"
00025 #include "Index.hpp"
00026 #include "InvFPDocList.hpp"
00027 #include "InvFPTermList.hpp"
00028 #include "InvFPTypes.hpp"
00029 #include "BasicCollectionProps.hpp"
00030 #include "Param.hpp"
00031 #include "PushIndex.hpp"
00032 #include "MemCache.hpp"
00033 #include "Keyfile.hpp"
00034 #include "KeyfileDocMgr.hpp"
00035 #include "ReadBuffer.hpp"
00036 #include "WriteBuffer.hpp"
00037 #include "TermCache.hpp"
00038 #include <cstring>
00039 #include <queue>
00040 namespace lemur
00041 {
00042 namespace index
00043 {
00044
00045
// Slot numbers used to index the KeyfileIncIndex::counts array.
// Being preprocessor macros, these names are not scoped by the enclosing
// lemur::index namespace.
// Slot read by termCountUnique().
00046 #define UNIQUE_TERMS 0
// Slot read by termCount() (the no-argument overload).
00047 #define TOTAL_TERMS 1
// Slot read by docCount() (the no-argument overload).
00048 #define DOCS 2
// NOTE(review): not referenced in this header; presumably further counts
// slots for the number of doc-term and inverted-list files -- confirm in
// the implementation file.
00049 #define DT_FILES 3
00050 #define INV_FILES 4
00051
// Size, in chars, of the KeyfileIncIndex::docKey scratch buffer.
00052 #define MAX_DOCID_LENGTH 512
// Size, in chars, of the KeyfileIncIndex::termKey scratch buffer.
00053 #define MAX_TERM_LENGTH 512
00054
// Number of SegmentOffset entries held by each KeyfileIncIndex::TermData.
00055 #define KEYFILE_MAX_SEGMENTS (16)
00056
00057
00058
/// Index class that derives from both PushIndex and lemur::api::Index.
/// Only the three no-argument count accessors are defined inline here; every
/// other member function is merely declared, so its behaviour cannot be read
/// from this header.
/// NOTE(review): descriptions below marked "presumably" are inferred from
/// names and signatures only -- confirm against the implementation file.
00071 class KeyfileIncIndex : public PushIndex, public lemur::api::Index {
00072 public:
/// Aggregate of one file offset and three integers; this is the type
/// returned by fetchDocumentRecord().
00074 class record {
00075 public:
/// File offset (presumably where the document's entry starts -- TODO confirm).
00077 lemur::file::File::offset_type offset;
/// Presumably the document length -- TODO confirm how it differs from totalLen.
00079 int len;
/// Presumably the total document length -- TODO confirm.
00081 int totalLen;
/// Presumably the document manager id for the document -- TODO confirm.
00083 int num;
00084 };
/// Describes one stored piece of data by segment number, length and offset.
00086 struct SegmentOffset {
/// Segment number (presumably an index into _segments -- TODO confirm).
00088 unsigned int segment;
/// Length of the data (units not visible here -- TODO confirm).
00090 unsigned int length;
/// File offset of the data.
00092 lemur::file::File::offset_type offset;
00093 };
/// Per-term statistics plus a fixed-size table of segment locations.
00095 struct TermData {
/// Presumably the total number of occurrences of the term -- TODO confirm.
00097 lemur::api::COUNT_T totalCount;
/// Presumably the number of documents containing the term -- TODO confirm.
00099 lemur::api::COUNT_T documentCount;
/// Fixed-capacity array of KEYFILE_MAX_SEGMENTS segment locations.
00101 SegmentOffset segments[ KEYFILE_MAX_SEGMENTS ];
00102 };
/// Constructs an index from a name prefix.
/// @param prefix     name prefix (presumably for the index files).
/// @param cachesize  defaults to 128000000 (presumably bytes -- TODO confirm).
/// @param startdocid defaults to 1 (presumably the first document id assigned).
00105 KeyfileIncIndex(const string &prefix, int cachesize=128000000,
00106 lemur::api::DOCID_T startdocid=1);
/// Default constructor.
00108 KeyfileIncIndex();
/// Destructor.
00110 ~KeyfileIncIndex();
00111
/// Sets the name prefix (presumably stored in the name member).
00113 void setName(const string &prefix);
00114
/// Presumably starts indexing a new document described by dp; the meaning of
/// the bool result is not visible here.
00116 bool beginDoc(const lemur::parse::DocumentProps* dp);
00117
/// Presumably adds term t to the current document; the meaning of the bool
/// result is not visible here.
00119 bool addTerm(const lemur::api::Term& t);
00120
/// Presumably finishes the current document.
00122 void endDoc(const lemur::parse::DocumentProps* dp);
00123
/// Virtual overload of endDoc that additionally takes a string mgr
/// (presumably a document manager identifier).
00125 virtual void endDoc(const lemur::parse::DocumentProps* dp, const string &mgr);
00126
/// Presumably finishes the collection and finalises the index.
00128 void endCollection(const lemur::parse::CollectionProps* cp);
00129
/// Presumably selects the document manager, by id string, for later documents.
00131 void setDocManager(const string &mgrID);
00132
00133 protected:
/// Presumably attempts to open an existing index; the meaning of the bool
/// result is not visible here.
00135 bool tryOpen();
/// Presumably writes the table-of-contents file.
00137 void writeTOC(const lemur::parse::CollectionProps* cp);
/// Presumably flushes cached data; lastRun defaults to false.
00139 void writeCache( bool lastRun = false );
/// Presumably performs the final cache write.
00141 void lastWriteCache();
00142
/// Presumably merges the segment files into one.
00144 void mergeCacheSegments();
/// Presumably writes the cache out as a new segment.
00146 void writeCacheSegment();
/// Presumably writes out the document manager ids.
00148 void writeDocMgrIDs();
/// Presumably returns the internal integer id for document manager mgr.
00151 int docMgrID(const string &mgr);
/// Virtual hook taking an integer manager id (presumably the shared
/// implementation behind both endDoc overloads -- TODO confirm).
00153 virtual void doendDoc(const lemur::parse::DocumentProps* dp, int mgrid);
/// NOTE(review): purpose not evident from this header.
00155 int listlengths;
00156
00157 public:
00159
00160
/// Presumably opens a previously built index by name; the meaning of the
/// bool result is not visible here.
00162 bool open(const string &indexName);
00164
00166
00167
/// Maps a term spelling to a term id (presumably via the tSTRs keyfile).
00169 lemur::api::TERMID_T term(const lemur::api::TERM_T &word) const;
00170
/// Maps a term id to a term spelling; returned by value.
00172 const lemur::api::TERM_T term(lemur::api::TERMID_T termID) const;
00173
/// Maps an external document id string to an internal document id.
00175 lemur::api::DOCID_T document(const lemur::api::EXDOCID_T &docIDStr) const;
00176
/// Maps an internal document id to an external document id; returned by value.
00178 const lemur::api::EXDOCID_T document(lemur::api::DOCID_T docID) const;
00179
/// Returns a pointer to the document manager for docID (ownership of the
/// pointee is not visible here).
00181 const lemur::api::DocumentManager *docManager(lemur::api::DOCID_T docID) const;
00182
/// Returns a pointer to collection properties (presumably cprops).
00183 const lemur::parse::CollectionProps *collectionProps() const;
00185
00187
00188
/// Returns counts[DOCS].
00190 lemur::api::COUNT_T docCount() const { return counts[DOCS]; };
00191
/// Returns counts[UNIQUE_TERMS].
00193 lemur::api::COUNT_T termCountUnique() const { return counts[UNIQUE_TERMS]; };
00194
/// Presumably the number of occurrences of termID in the collection.
00196 lemur::api::COUNT_T termCount(lemur::api::TERMID_T termID) const;
00197
/// Returns counts[TOTAL_TERMS].
00199 lemur::api::COUNT_T termCount() const { return counts[TOTAL_TERMS]; };
00200
/// Presumably the average document length (see aveDocLen).
00202 float docLengthAvg() const;
00203
/// Presumably the number of documents that contain termID.
00205 lemur::api::COUNT_T docCount(lemur::api::TERMID_T termID) const;
00206
/// Presumably the length of document docID.
00208 lemur::api::COUNT_T docLength(lemur::api::DOCID_T docID) const;
00209
/// Virtual; presumably the total length of document docID -- TODO confirm how
/// it differs from docLength().
00211 virtual lemur::api::COUNT_T totaldocLength (lemur::api::DOCID_T docID) const;
00212
/// NOTE(review): exact meaning of "counted" is not visible here.
00214 lemur::api::COUNT_T docLengthCounted(lemur::api::DOCID_T docID) const;
00215
00217
00219
00220
/// Returns a DocInfoList pointer for termID (ownership of the result is not
/// visible here -- TODO confirm whether the caller must delete it).
00221 lemur::api::DocInfoList* docInfoList(lemur::api::TERMID_T termID) const;
00222
/// Returns a TermInfoList pointer for docID (ownership not visible here).
00224 lemur::api::TermInfoList* termInfoList(lemur::api::DOCID_T docID) const;
/// Like termInfoList(); presumably keeps terms in sequence order -- TODO confirm.
00226 lemur::api::TermInfoList* termInfoListSeq(lemur::api::DOCID_T docID) const;
00227
00229
/// Presumably sets the stream used for messages (see msgstream).
00231 void setMesgStream(ostream * lemStream);
/// Presumably records an occurrence, at position, of an already-known term.
00233 void addKnownTerm( lemur::api::TERMID_T termID, lemur::api::LOC_T position );
/// Presumably registers a term not seen before and returns its new id.
00235 lemur::api::TERMID_T addUnknownTerm( const InvFPTerm* term );
/// Presumably handles a term not found in _cache and returns its id.
00237 lemur::api::TERMID_T addUncachedTerm( const InvFPTerm* term );
00238
00239 protected:
/// Presumably opens the existing keyfiles and data files.
00241 void openDBs();
/// Presumably opens the files tracked in _segments.
00243 void openSegments();
/// Presumably creates fresh keyfiles and data files.
00245 void createDBs();
00246
/// Presumably reads the table-of-contents file.
00248 void fullToc();
/// Presumably loads the document manager ids; the meaning of the bool result
/// is not visible here.
00250 bool docMgrIDs();
/// Returns the record for document id key, by value.
00252 record fetchDocumentRecord( lemur::api::DOCID_T key ) const;
/// Presumably stores the id <-> name mapping for a document.
00254 void addDocumentLookup( lemur::api::DOCID_T documentKey, const char* documentName );
/// Presumably stores the id <-> spelling mapping for a term.
00256 void addTermLookup( lemur::api::TERMID_T termKey, const char* termSpelling );
/// Takes a number-to-name keyfile, a name-to-number keyfile, a number and a
/// name (presumably the shared helper behind the two lookups above).
00258 void addGeneralLookup( lemur::file::Keyfile& numberNameIndex,
00259 lemur::file::Keyfile& nameNumberIndex,
00260 lemur::api::TERMID_T number, const char* name );
/// Returns an InvFPDocList pointer for termID (ownership not visible here).
00262 InvFPDocList* internalDocInfoList(lemur::api::TERMID_T termID) const;
/// NOTE(review): internal helper taking a doc list and a position; its effect
/// is not visible here.
00264 void _updateTermlist( InvFPDocList* curlist, lemur::api::LOC_T position );
/// Presumably the current size of the cached data.
00266 int _cacheSize();
/// Presumably derives memory limits from memorySize.
00268 void _computeMemoryBounds( int memorySize );
/// Presumably resets _estimatePoint.
00270 void _resetEstimatePoint();
/// Array of collection counts, indexed by the UNIQUE_TERMS / TOTAL_TERMS /
/// DOCS macros in the inline accessors above. Raw pointer; allocation and
/// release are not visible in this header.
00272 lemur::api::COUNT_T* counts;
/// NOTE(review): presumably file names -- TODO confirm.
00274 std::vector<std::string> names;
/// Presumably the cached average document length.
00276 float aveDocLen;
/// Presumably the document manager id strings.
00278 vector<std::string> docmgrs;
/// Output stream pointer (presumably for messages; see setMesgStream()).
00280 ostream* msgstream;
00281
00282
00283
/// Keyfile member; mutable, so const member functions may modify it.
/// Presumably the inverted-list lookup table.
00285 mutable lemur::file::Keyfile invlookup;
00286
00287
/// Mutable keyfiles. Presumably dIDs/dSTRs hold the document id <-> name
/// mappings and tIDs/tSTRs the term id <-> spelling mappings -- TODO confirm
/// which direction each one stores.
00289 mutable lemur::file::Keyfile dIDs;
00291 mutable lemur::file::Keyfile dSTRs;
00293 mutable lemur::file::Keyfile tIDs;
00295 mutable lemur::file::Keyfile tSTRs;
/// Mutable file (presumably the per-document lookup data).
00297 mutable lemur::file::File dtlookup;
/// Read buffer pointer (presumably wrapping dtlookup); ownership not visible here.
00299 lemur::file::ReadBuffer* dtlookupReadBuffer;
/// Mutable file (presumably where term lists are written).
00302 mutable lemur::file::File writetlist;
00303
/// Mutable scratch buffer of MAX_TERM_LENGTH chars.
00305 mutable char termKey[MAX_TERM_LENGTH];
/// Mutable scratch buffer of MAX_DOCID_LENGTH chars.
00307 mutable char docKey[MAX_DOCID_LENGTH];
/// Presumably the size of the in-memory lists.
00309 int _listsSize;
/// Presumably the memory budget.
00311 int _memorySize;
/// Presumably the index name prefix (see setName()).
00313 std::string name;
/// Pointers to doc lists (presumably the in-memory cache, indexed by term id).
00315 vector<InvFPDocList*> invertlists;
/// Presumably the located terms of the document currently being indexed.
00317 vector<LocatedTerm> termlist;
/// Presumably the id of the current document manager.
00319 int curdocmgr;
/// Document manager pointers; ownership not visible here.
00321 vector<lemur::api::DocumentManager*> docMgrs;
/// Term cache (presumably maps term spellings to ids).
00323 lemur::utility::TermCache _cache;
/// Mutable collection-properties pointer (presumably what collectionProps()
/// returns).
00325 mutable lemur::parse::BasicCollectionProps* cprops;
00326
/// File pointers for the segments; ownership not visible here.
00328 std::vector<lemur::file::File*> _segments;
/// Presumably the largest term id already written to a segment.
00330 lemur::api::TERMID_T _largestFlushedTermID;
/// NOTE(review): see _resetEstimatePoint(); exact meaning not visible here.
00332 int _estimatePoint;
/// Presumably set when the current document should be skipped.
00334 bool ignoreDoc;
/// Presumably true when the index was opened for reading only.
00336 bool _readOnly;
00337 };
00338 }
00339 }
00340
00341
00342 #endif //_LEMUR_KEYFILE_INCINDEX_HPP