Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

KeyfileDocMgr.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  *
00003  *  Original source copyright (c) 2001, Carnegie Mellon University.
00004  *  See copyright.cmu for details.
00005  *  Modifications copyright (c) 2002, University of Massachusetts.
00006  *  See copyright.umass for details.
00007  *
00008  *==========================================================================
00009  */
00010 
00011 #ifndef _LEMUR_KEYFILE_DOCMGR_HPP
00012 #define _LEMUR_KEYFILE_DOCMGR_HPP
00013 
00014 #include "common_headers.hpp"
00015 #include "DocumentManager.hpp"
00016 #include "RVLCompress.hpp"
00017 #include "TextHandlerManager.hpp"
00018 #include "Match.hpp"
00019 #include "Keyfile.hpp"
00020 namespace lemur 
00021 {
00022   namespace parse 
00023   {
00024     
00025     // File-name suffix for the per-document array of byte offsets, indexed by token.
00026 #define BT_POSITIONS ".btp"
00027     // File-name suffix for the per-document lookup data: source file, start, and length.
00028 #define BT_LOOKUP ".btl"
00029     // File-name suffix for the table of contents (TOC); see writeTOC()/loadTOC().
00030 #define BT_TOC ".bdm"
00031     // File-name suffix for the list of source files.
00032 #define BT_FID ".bfi"
00033 
00034 
00043     class KeyfileDocMgr : public lemur::api::DocumentManager, public lemur::api::TextHandler {
          // Document manager that is also a TextHandler: while a parser runs, the
          // handleWord() callback records the byte offsets of each token relative to
          // the start of the current document (docEntry.offset). Lookup data is kept
          // in two Keyfile b-trees (poslookup, doclookup).
          // Most member functions are only declared here; their definitions are not
          // visible in this header.
00044     public:
            // Default constructor. Sets myDoc to NULL, numdocs to 0, ignoreDoc to
            // false and _readOnly to true.
            // NOTE(review): myparser, dbcache, doclen, numOldSources and fileid are
            // not assigned here and stay uninitialized until something else sets them.
00046       KeyfileDocMgr() {  myDoc = NULL;  numdocs = 0; ignoreDoc = false; _readOnly = true;}
00047 
            // Construct from a manager name; readOnly defaults to true.
            // NOTE(review): presumably opens an existing manager -- defined outside
            // this header, confirm against the implementation.
00050       KeyfileDocMgr(const string &name, bool readOnly = true);
00051 
            // Construct from a name, a mode and a source.
            // NOTE(review): presumably mode is the parse mode (cf. pm) and source
            // names the list of source files -- confirm against the implementation.
00056       KeyfileDocMgr(string name, string mode, string source);  
00057 
00058       virtual ~KeyfileDocMgr();
00059 
            // Return a character buffer for the document identified by docID.
            // NOTE(review): ownership of the returned buffer is not visible here.
00061       char* getDoc(const string &docID) const;
            // Callback for the start of a document, given its document number string.
            // Defined outside this header.
00063       virtual char* handleDoc(char * docno);
            // Callback for the end of a document. Defined outside this header.
00065       virtual void handleEndDoc();
            // Callback for one token. Unless ignoreDoc is set or word is NULL, appends
            // a Match to offsets whose start/end are the token's first and last byte
            // positions relative to docEntry.offset. Always returns word unchanged.
            // NOTE(review): dereferences myparser without a NULL check, so
            // setParser() must have been called first.
00067       virtual char *handleWord(char * word) {
00068         if (!ignoreDoc && word != NULL) {
                // Position of the token's last byte.
                // Presumably fileTell() is one past the token just read -- TODO confirm.
00069           int end = myparser->fileTell() - 1;
                // Position of the token's first byte. strlen() is unsigned, so this
                // arithmetic is done in size_t and converted back to int.
00070           int start = (end - strlen(word)) + 1;
00071           Match m;
                // Store positions relative to the start of the current document.
00072           m.start = start - docEntry.offset;
00073           m.end = end - docEntry.offset;    
00074           offsets.push_back(m);
00075         }
00076         return word;
00077       }
            // Remember the parser that handleWord() queries for the file position.
00079       virtual void setParser(lemur::api::Parser *p) {
00080         myparser = p;
00081       }
00082 
            // Return the result of TextHandlerManager::createParser() for the parse
            // mode pm. This is not the parser stored by setParser().
            // NOTE(review): presumably a newly created parser owned by the caller -- confirm.
00083       virtual lemur::api::Parser* getParser() const {
00084         return (lemur::api::TextHandlerManager::createParser(pm));
00085       }
00086 
            // Build the manager. Defined outside this header.
00089       virtual void buildMgr();
            // Return this manager's name including its extension (IDnameext).
00091       virtual const string &getMyID() const{
00092         return IDnameext;
00093       }
00094 
            // Return, by value, the list of token offsets (Match entries) for docID.
00098       vector<Match> getOffsets(const string &docID) const;
00099 
            // Open the manager named manname. Stores manname as IDnameext, stores
            // manname minus its last four characters as IDname, and returns the
            // result of loadTOC().
00101       virtual bool open(const string &manname) {
00102         IDnameext = manname;
              // Drop the trailing four characters (presumably an extension such as BT_TOC).
              // If manname is shorter than four characters the unsigned subtraction
              // wraps and substr() yields the whole string.
00103         IDname = manname.substr(0, manname.length() - 4);
00104         return loadTOC();
00105       }
00106 
00107     protected:
            // Lookup record for one document.
            // NOTE(review): presumably fid is the source file id, offset the byte
            // position where the document starts in that file, and bytes its length
            // (cf. the BT_LOOKUP comment "source file start, length") -- confirm.
00108       struct btl {
00109         int fid;
00110         long offset;
00111         long bytes;
00112       };
00113 
            // Parser set via setParser(); handleWord() asks it for the file position.
00114       lemur::api::Parser *myparser;
            // Write / load the table of contents. Defined outside this header;
            // open() returns loadTOC()'s result.
00115       virtual void writeTOC();
00116       virtual bool loadTOC();
            // Defined outside this header.
            // NOTE(review): presumably loads num source-file entries from fn -- confirm.
00117       bool loadFTFiles(const string &fn, int num);
00118       // the return object
00119       mutable vector <Match> offsets;
00120       int numdocs;              // how many docs we have
00121       string pm;  // parse mode
00122 
00123       mutable lemur::file::Keyfile poslookup; // btree for lookup to positions list.
00124       mutable lemur::file::Keyfile doclookup; // btree for lookup to doc start.
            // NOTE(review): presumably the cache size used for the Keyfile b-trees -- confirm.
00125       int dbcache;
00126   
            // Lookup entry for the current document; its offset is the base that
            // handleWord() subtracts from file positions.
00127       btl docEntry;
            // Set to NULL by the default constructor.
            // NOTE(review): presumably a buffer for document text, with doclen its length -- confirm.
00128       char *myDoc;
00129       int doclen;
00130       string IDname;            // my name
00131       string IDnameext;                 // my name w/ extension
00132       vector<string> sources;   // list of all source files
            // NOTE(review): presumably the number of entries in sources that existed
            // before the current build -- confirm.
00134       int numOldSources;
00135       int fileid;       // fileid of current/last file being processed
            // When true, handleWord() records no offsets.
00137       bool ignoreDoc;
            // Read-only flag; the default constructor sets it to true.
00139       bool _readOnly;
00140     };
00141   }
00142 }
00143 
00144 #endif // _LEMUR_KEYFILE_DOCMGR_HPP

Generated on Tue Jun 15 11:02:54 2010 for Lemur by doxygen 1.3.4