00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #ifndef INDRI_MEMORYINDEX_HPP
00019 #define INDRI_MEMORYINDEX_HPP
00020
00021 #include "indri/Index.hpp"
00022 #include "indri/Mutex.hpp"
00023 #include "indri/HashTable.hpp"
00024 #include "indri/DocumentData.hpp"
00025 #include "indri/Buffer.hpp"
00026 #include <list>
00027 #include <vector>
00028
00029 #include "indri/DocListIterator.hpp"
00030 #include "indri/DocListFileIterator.hpp"
00031 #include "indri/TermList.hpp"
00032 #include "indri/TermListFileIterator.hpp"
00033 #include "indri/VocabularyIterator.hpp"
00034 #include "indri/ParsedDocument.hpp"
00035 #include "indri/DocListMemoryBuilder.hpp"
00036 #include "indri/FieldStatistics.hpp"
00037 #include "indri/CorpusStatistics.hpp"
00038 #include "indri/DocExtentListMemoryBuilder.hpp"
00039 #include "indri/ReadersWritersLock.hpp"
00040 #include "indri/ReaderLockable.hpp"
00041 #include "indri/WriterLockable.hpp"
00042 #include "indri/RegionAllocator.hpp"
00043
00044 namespace indri {
00045 namespace index {
00046 class MemoryIndex : public Index {
00047 public:
00048
00049 struct term_entry {
00050 struct term_less {
00051 bool operator() ( const term_entry* one, const term_entry* two ) const {
00052 return strcmp( one->term, two->term ) < 0;
00053 }
00054 };
00055
00056 term_entry( indri::utility::RegionAllocator* allocator ) :
00057 list(allocator),
00058 next(0)
00059 {
00060 }
00061
00062 void clearMark() {
00063 next = 0;
00064 }
00065
00066 bool hasNext() {
00067 return next != 0 && next != (term_entry*) 1;
00068 }
00069
00070 void mark() {
00071 next = (term_entry*) 1;
00072 }
00073
00074 bool marked() {
00075 return next != 0;
00076 }
00077
00078 char* term;
00079 lemur::api::TERMID_T termID;
00080 TermData* termData;
00081 term_entry* next;
00082 indri::index::DocListMemoryBuilder list;
00083 };
00084
00085 private:
00086 indri::utility::RegionAllocator _allocator;
00087
00088 indri::thread::ReadersWritersLock _lock;
00089 indri::thread::ReaderLockable _readLock;
00090 indri::thread::WriterLockable _writeLock;
00091
00092 CorpusStatistics _corpusStatistics;
00093 lemur::api::DOCID_T _baseDocumentID;
00094
00095
00096 indri::index::TermList _termList;
00097 indri::utility::greedy_vector<term_entry*> _seenTerms;
00098
00099
00100 indri::utility::HashTable<const char*, term_entry*> _stringToTerm;
00101 std::vector<term_entry*> _idToTerm;
00102
00103
00104 indri::utility::HashTable<const char*, int> _fieldLookup;
00105 std::vector<FieldStatistics> _fieldData;
00106 std::vector<indri::index::DocExtentListMemoryBuilder*> _fieldLists;
00107
00108
00109 std::vector<indri::index::DocumentData> _documentData;
00110
00111
00112 std::list<indri::utility::Buffer*> _termLists;
00113 UINT64 _termListsBaseOffset;
00114
00115 void _addOpenTags( indri::utility::greedy_vector<indri::parse::TagExtent *>& indexedTags,
00116 indri::utility::greedy_vector<indri::parse::TagExtent *>& openTags,
00117 indri::utility::greedy_vector<indri::parse::TagExtent *>& extents,
00118 unsigned int& extentIndex,
00119 unsigned int position );
00120 void _removeClosedTags( indri::utility::greedy_vector<indri::parse::TagExtent *>& tags, unsigned int position );
00121 void _writeFieldExtents( lemur::api::DOCID_T documentID, indri::utility::greedy_vector<indri::parse::TagExtent *>& indexedTags );
00122 void _writeDocumentTermList( UINT64& offset, int& byteLength, lemur::api::DOCID_T documentID, int documentLength, indri::index::TermList& locatedTerms );
00123 void _writeDocumentStatistics( UINT64 offset, int byteLength, int indexedLength, int totalLength, int uniqueTerms );
00124 term_entry* _lookupTerm( const char* term );
00125 void _destroyTerms();
00126
00127 int _fieldID( const std::string& fieldName );
00128 int _fieldID( const char* fieldName );
00129
00130 public:
00131 MemoryIndex();
00132 MemoryIndex( lemur::api::DOCID_T docBase );
00133 MemoryIndex( lemur::api::DOCID_T docBase, const std::vector<Index::FieldDescription>& fields );
00134 ~MemoryIndex();
00135
00136 void close();
00137
00138 lemur::api::DOCID_T documentBase();
00139 lemur::api::DOCID_T documentMaximum();
00140
00141 lemur::api::TERMID_T term( const std::string& t );
00142 lemur::api::TERMID_T term( const char* t );
00143 std::string term( lemur::api::TERMID_T termID );
00144
00145 int field( const char* fieldName );
00146 int field( const std::string& fieldName );
00147 std::string field( int fieldID );
00148
00149 int documentLength( lemur::api::DOCID_T documentID );
00150 UINT64 documentCount();
00151 UINT64 documentCount( const std::string& term );
00152 UINT64 uniqueTermCount();
00153
00154 UINT64 termCount( const std::string& term );
00155 UINT64 termCount();
00156
00157 UINT64 fieldTermCount( const std::string& field );
00158 UINT64 fieldTermCount( const std::string& field, const std::string& term );
00159
00160 UINT64 fieldDocumentCount( const std::string& field );
00161 UINT64 fieldDocumentCount( const std::string& field, const std::string& term );
00162
00163 DocListIterator* docListIterator( lemur::api::TERMID_T termID );
00164 DocListIterator* docListIterator( const std::string& term );
00165 DocListFileIterator* docListFileIterator();
00166 DocExtentListIterator* fieldListIterator( int fieldID );
00167 DocExtentListIterator* fieldListIterator( const std::string& field );
00168 const TermList* termList( lemur::api::DOCID_T documentID );
00169 TermListFileIterator* termListFileIterator();
00170
00171 VocabularyIterator* vocabularyIterator();
00172 VocabularyIterator* frequentVocabularyIterator();
00173 VocabularyIterator* infrequentVocabularyIterator();
00174
00175 DocumentDataIterator* documentDataIterator();
00176
00177 indri::thread::Lockable* iteratorLock();
00178 indri::thread::Lockable* statisticsLock();
00179
00180 lemur::api::DOCID_T addDocument( indri::api::ParsedDocument& document );
00181 size_t memorySize();
00182 };
00183 }
00184 }
00185
00186 #endif // INDRI_MEMORYINDEX_HPP