00001 /*========================================================================== 00002 * 00003 * Original source copyright (c) 2001, Carnegie Mellon University. 00004 * See copyright.cmu for details. 00005 * Modifications copyright (c) 2002, University of Massachusetts. 00006 * See copyright.umass for details. 00007 * 00008 *========================================================================== 00009 */ 00010 00011 #ifndef _LEMUR_KEYFILE_DOCMGR_HPP 00012 #define _LEMUR_KEYFILE_DOCMGR_HPP 00013 00014 #include "common_headers.hpp" 00015 #include "DocumentManager.hpp" 00016 #include "RVLCompress.hpp" 00017 #include "TextHandlerManager.hpp" 00018 #include "Match.hpp" 00019 #include "Keyfile.hpp" 00020 namespace lemur 00021 { 00022 namespace parse 00023 { 00024 00025 // array of byte offsets, indexed by token for each doc. 00026 #define BT_POSITIONS ".btp" 00027 // source file start, length 00028 #define BT_LOOKUP ".btl" 00029 // TOC 00030 #define BT_TOC ".bdm" 00031 // source files. 00032 #define BT_FID ".bfi" 00033 00034 00043 class KeyfileDocMgr : public lemur::api::DocumentManager, public lemur::api::TextHandler { 00044 public: 00046 KeyfileDocMgr() { myDoc = NULL; numdocs = 0; ignoreDoc = false; _readOnly = true;} 00047 00050 KeyfileDocMgr(const string &name, bool readOnly = true); 00051 00056 KeyfileDocMgr(string name, string mode, string source); 00057 00058 virtual ~KeyfileDocMgr(); 00059 00061 char* getDoc(const string &docID) const; 00063 virtual char* handleDoc(char * docno); 00065 virtual void handleEndDoc(); 00067 virtual char *handleWord(char * word) { 00068 if (!ignoreDoc && word != NULL) { 00069 int end = myparser->fileTell() - 1; 00070 int start = (end - strlen(word)) + 1; 00071 Match m; 00072 m.start = start - docEntry.offset; 00073 m.end = end - docEntry.offset; 00074 offsets.push_back(m); 00075 } 00076 return word; 00077 } 00079 virtual void setParser(lemur::api::Parser *p) { 00080 myparser = p; 00081 } 00082 00083 virtual lemur::api::Parser* getParser() const { 00084 return (lemur::api::TextHandlerManager::createParser(pm)); 00085 } 00086 00089 virtual void buildMgr(); 00091 virtual const string &getMyID() const{ 00092 return IDnameext; 00093 } 00094 00098 vector<Match> getOffsets(const string &docID) const; 00099 00101 virtual bool open(const string &manname) { 00102 IDnameext = manname; 00103 IDname = manname.substr(0, manname.length() - 4); 00104 return loadTOC(); 00105 } 00106 00107 protected: 00108 struct btl { 00109 int fid; 00110 long offset; 00111 long bytes; 00112 }; 00113 00114 lemur::api::Parser *myparser; 00115 virtual void writeTOC(); 00116 virtual bool loadTOC(); 00117 bool loadFTFiles(const string &fn, int num); 00118 // the return object 00119 mutable vector <Match> offsets; 00120 int numdocs; // how many docs we have 00121 string pm; // parse mode 00122 00123 mutable lemur::file::Keyfile poslookup; // btree for lookup to positions list. 00124 mutable lemur::file::Keyfile doclookup; // btree for lookup to doc start. 00125 int dbcache; 00126 00127 btl docEntry; 00128 char *myDoc; 00129 int doclen; 00130 string IDname; // my name 00131 string IDnameext; // my name w/ extension 00132 vector<string> sources; // list of all source files 00134 int numOldSources; 00135 int fileid; // fileid of current/last file being processed 00137 bool ignoreDoc; 00139 bool _readOnly; 00140 }; 00141 } 00142 } 00143 00144 #endif // _LEMUR_KEYFILE_DOCMGR_HPP