Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

MemoryIndex.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 //
00013 // MemoryIndex
00014 //
00015 // 15 November 2004 -- tds
00016 //
00017 
00018 #ifndef INDRI_MEMORYINDEX_HPP
00019 #define INDRI_MEMORYINDEX_HPP
00020 
00021 #include "indri/Index.hpp"
00022 #include "indri/Mutex.hpp"
00023 #include "indri/HashTable.hpp"
00024 #include "indri/DocumentData.hpp"
00025 #include "indri/Buffer.hpp"
00026 #include <list>
00027 #include <vector>
00028 
00029 #include "indri/DocListIterator.hpp"
00030 #include "indri/DocListFileIterator.hpp"
00031 #include "indri/TermList.hpp"
00032 #include "indri/TermListFileIterator.hpp"
00033 #include "indri/VocabularyIterator.hpp"
00034 #include "indri/ParsedDocument.hpp"
00035 #include "indri/DocListMemoryBuilder.hpp"
00036 #include "indri/FieldStatistics.hpp"
00037 #include "indri/CorpusStatistics.hpp"
00038 #include "indri/DocExtentListMemoryBuilder.hpp"
00039 #include "indri/ReadersWritersLock.hpp"
00040 #include "indri/ReaderLockable.hpp"
00041 #include "indri/WriterLockable.hpp"
00042 #include "indri/RegionAllocator.hpp"
00043 
00044 namespace indri {
00045   namespace index {
00046     class MemoryIndex : public Index {  // Index subclass; its term, field and document data are kept in the member containers declared below
00047     public:
00048       // vocabulary structure
00049       struct term_entry {  // one vocabulary entry: term string pointer, term ID, TermData pointer, a link/mark pointer and a DocListMemoryBuilder
00050         struct term_less {  // comparison functor over term_entry pointers
00051           bool operator() ( const term_entry* one, const term_entry* two ) const {
00052             return strcmp( one->term, two->term ) < 0;  // true when one's term sorts before two's in strcmp order; NOTE(review): no <cstring>/<string.h> include is visible in this header -- presumably supplied transitively, confirm
00053           }
00054         };
00055         // Passes `allocator` to the doc list builder's constructor and starts with next == 0 (unmarked).
00056         term_entry( indri::utility::RegionAllocator* allocator ) :  // NOTE(review): single-argument and not explicit
00057           list(allocator),  // NOTE(review): listed before `next`, but `next` is declared first, so `next` is initialized first
00058           next(0)
00059         {  // term, termID and termData are NOT initialized by this constructor
00060         }
00061         // Resets `next` to 0, which clears the mark and drops any stored link.
00062         void clearMark() {
00063           next = 0;
00064         }
00065         // True only when `next` holds a real link: neither 0 nor the sentinel value 1 written by mark().
00066         bool hasNext() {
00067           return next != 0 && next != (term_entry*) 1;
00068         }
00069         // Flags this entry by storing the sentinel pointer value 1 in `next` (overwrites any existing link).
00070         void mark() {
00071           next = (term_entry*) 1;
00072         }
00073         // True when `next` is non-zero, i.e. either the sentinel from mark() or a real link.
00074         bool marked() {
00075           return next != 0;
00076         }
00077 
00078         char* term;  // C string compared by term_less; NOTE(review): ownership not visible in this header
00079         lemur::api::TERMID_T termID;
00080         TermData* termData;
00081         term_entry* next;  // doubles as mark flag and link: 0 = unmarked, (term_entry*)1 = marked without link, anything else = link
00082         indri::index::DocListMemoryBuilder list;  // constructed from the allocator given to the constructor
00083       };
00084       
00085     private:
00086       indri::utility::RegionAllocator _allocator;  // presumably the allocator handed to term_entry instances -- TODO confirm in the implementation file
00087 
00088       indri::thread::ReadersWritersLock _lock;
00089       indri::thread::ReaderLockable _readLock;  // NOTE(review): _readLock/_writeLock presumably wrap _lock and back iteratorLock()/statisticsLock() -- confirm
00090       indri::thread::WriterLockable _writeLock;
00091 
00092       CorpusStatistics _corpusStatistics;
00093       lemur::api::DOCID_T _baseDocumentID;  // presumably the value reported by documentBase() -- TODO confirm
00094       
00095       // document buffers
00096       indri::index::TermList _termList;
00097       indri::utility::greedy_vector<term_entry*> _seenTerms;
00098 
00099       // term lookups
00100       indri::utility::HashTable<const char*, term_entry*> _stringToTerm;  // keyed by C string, maps to the vocabulary entry
00101       std::vector<term_entry*> _idToTerm;  // NOTE(review): presumably indexed by term ID (possibly offset) -- confirm
00102 
00103       // field statistics
00104       indri::utility::HashTable<const char*, int> _fieldLookup;  // keyed by C string, maps to an int (presumably the field ID -- confirm)
00105       std::vector<FieldStatistics> _fieldData;
00106       std::vector<indri::index::DocExtentListMemoryBuilder*> _fieldLists;  // raw pointers; NOTE(review): ownership/cleanup not visible here
00107       
00108       // document statistics
00109       std::vector<indri::index::DocumentData> _documentData;
00110       
00111       // document vector buffers
00112       std::list<indri::utility::Buffer*> _termLists;  // raw pointers; NOTE(review): ownership/cleanup not visible here
00113       UINT64 _termListsBaseOffset;
00114       // private helpers; only their signatures are visible in this header
00115       void _addOpenTags( indri::utility::greedy_vector<indri::parse::TagExtent *>& indexedTags,
00116                          indri::utility::greedy_vector<indri::parse::TagExtent *>& openTags,
00117                          indri::utility::greedy_vector<indri::parse::TagExtent *>& extents,
00118                          unsigned int& extentIndex, 
00119                          unsigned int position );  // extentIndex is taken by non-const reference, so it can be updated for the caller
00120       void _removeClosedTags( indri::utility::greedy_vector<indri::parse::TagExtent *>& tags, unsigned int position );
00121       void _writeFieldExtents( lemur::api::DOCID_T documentID, indri::utility::greedy_vector<indri::parse::TagExtent *>& indexedTags );
00122       void _writeDocumentTermList( UINT64& offset, int& byteLength, lemur::api::DOCID_T documentID, int documentLength, indri::index::TermList& locatedTerms );  // offset and byteLength are non-const references (presumably outputs -- confirm)
00123       void _writeDocumentStatistics( UINT64 offset, int byteLength, int indexedLength, int totalLength, int uniqueTerms );
00124       term_entry* _lookupTerm( const char* term );
00125       void _destroyTerms();
00126 
00127       int _fieldID( const std::string& fieldName );  // field name -> int, std::string overload
00128       int _fieldID( const char* fieldName );  // field name -> int, C-string overload
00129 
00130     public:
00131       MemoryIndex();
00132       MemoryIndex( lemur::api::DOCID_T docBase );  // NOTE(review): single-argument and not explicit, so a DOCID_T converts implicitly to MemoryIndex
00133       MemoryIndex( lemur::api::DOCID_T docBase, const std::vector<Index::FieldDescription>& fields );
00134       ~MemoryIndex();
00135 
00136       void close();
00137       // presumably the lowest and highest document IDs covered by this index -- TODO confirm exact bounds
00138       lemur::api::DOCID_T documentBase();
00139       lemur::api::DOCID_T documentMaximum();
00140       // term lookups: string -> term ID (two overloads) and term ID -> string
00141       lemur::api::TERMID_T term( const std::string& t );
00142       lemur::api::TERMID_T term( const char* t );
00143       std::string term( lemur::api::TERMID_T termID );
00144       // field lookups: name -> field ID (two overloads) and field ID -> name
00145       int field( const char* fieldName );
00146       int field( const std::string& fieldName );
00147       std::string field( int fieldID );
00148       // length and count queries; overloads taking `term` and/or `field` are the per-term / per-field variants
00149       int documentLength( lemur::api::DOCID_T documentID );
00150       UINT64 documentCount();
00151       UINT64 documentCount( const std::string& term );
00152       UINT64 uniqueTermCount();
00153       
00154       UINT64 termCount( const std::string& term );
00155       UINT64 termCount();
00156       
00157       UINT64 fieldTermCount( const std::string& field );
00158       UINT64 fieldTermCount( const std::string& field, const std::string& term );
00159       
00160       UINT64 fieldDocumentCount( const std::string& field );
00161       UINT64 fieldDocumentCount( const std::string& field, const std::string& term );
00162       // iterator and list accessors; all return raw pointers -- NOTE(review): ownership of the returned objects is not visible here, confirm who deletes them
00163       DocListIterator* docListIterator( lemur::api::TERMID_T termID );
00164       DocListIterator* docListIterator( const std::string& term );
00165       DocListFileIterator* docListFileIterator();
00166       DocExtentListIterator* fieldListIterator( int fieldID );
00167       DocExtentListIterator* fieldListIterator( const std::string& field );
00168       const TermList* termList( lemur::api::DOCID_T documentID );
00169       TermListFileIterator* termListFileIterator();
00170 
00171       VocabularyIterator* vocabularyIterator();
00172       VocabularyIterator* frequentVocabularyIterator();
00173       VocabularyIterator* infrequentVocabularyIterator();
00174 
00175       DocumentDataIterator* documentDataIterator();
00176       // lock accessors; NOTE(review): which internal lock object each one returns is not visible in this header
00177       indri::thread::Lockable* iteratorLock();
00178       indri::thread::Lockable* statisticsLock();
00179       // addDocument takes the parsed document by non-const reference and returns a DOCID_T (presumably the ID assigned to it -- confirm)
00180       lemur::api::DOCID_T addDocument( indri::api::ParsedDocument& document );
00181       size_t memorySize();  // presumably an estimate of bytes held by this index -- TODO confirm
00182     };
00183   }
00184 }
00185 
00186 #endif // INDRI_MEMORYINDEX_HPP

Generated on Tue Jun 15 11:02:54 2010 for Lemur by doxygen 1.3.4