00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #ifndef INDRI_INDEXWRITER_HPP
00019 #define INDRI_INDEXWRITER_HPP
00020
00021 #include <vector>
00022 #include <utility>
00023 #include <queue>
00024
00025 #include "lemur-compat.hpp"
00026 #include "indri/indri-platform.h"
00027 #include "indri/greedy_vector"
00028 #include "indri/TermData.hpp"
00029 #include "Keyfile.hpp"
00030 #include "indri/Index.hpp"
00031 #include "indri/DocListFileIterator.hpp"
00032 #include "indri/File.hpp"
00033 #include "indri/SequentialWriteBuffer.hpp"
00034 #include "indri/CorpusStatistics.hpp"
00035 #include "indri/FieldStatistics.hpp"
00036 #include "indri/TermBitmap.hpp"
00037 #include "indri/TermRecorder.hpp"
00038 #include "indri/TermTranslator.hpp"
00039 #include "indri/DeletedDocumentList.hpp"
00040 #include "indri/BulkTree.hpp"
00041
00042 namespace indri {
00043 namespace index {
00044
00045 struct WriterIndexContext {
00046 struct greater {
00047 private:
00048 indri::index::DocListFileIterator::iterator_greater _iterator_greater;
00049
00050 int _compareTerms( const WriterIndexContext* const& one, const WriterIndexContext* const& two ) const {
00051 const char* oneTerm = one->iterator->currentEntry()->termData->term;
00052 const char* twoTerm = two->iterator->currentEntry()->termData->term;
00053
00054 return strcmp( oneTerm, twoTerm );
00055 }
00056
00057 int _compareDocuments( const WriterIndexContext* const& one, const WriterIndexContext* const& two ) const {
00058 const indri::index::DocListIterator::DocumentData* oneData = one->iterator->currentEntry()->iterator->currentEntry();
00059 const indri::index::DocListIterator::DocumentData* twoData = two->iterator->currentEntry()->iterator->currentEntry();
00060
00061 lemur::api::DOCID_T oneDocument = oneData ? oneData->document + one->documentOffset : 0;
00062 lemur::api::DOCID_T twoDocument = twoData ? twoData->document + two->documentOffset : 0;
00063
00064 return oneDocument > twoDocument;
00065 }
00066
00067 public:
00068 bool operator () ( const WriterIndexContext* const& one, const WriterIndexContext* const& two ) const {
00069 assert( !one->iterator->finished() && !two->iterator->finished() );
00070
00071 int result = _compareTerms( one, two );
00072
00073
00074 if( result != 0 )
00075 return result > 0;
00076
00077
00078 return _compareDocuments( one, two ) > 0;
00079 }
00080 };
00081
00082 WriterIndexContext( indri::index::Index* _index, indri::index::DeletedDocumentList* _deletedList, lemur::api::DOCID_T _documentOffset ) {
00083 deletedList = _deletedList;
00084 documentOffset = _documentOffset;
00085
00086 bitmap = new indri::index::TermBitmap;
00087 index = _index;
00088 wasInfrequentCount = 0;
00089 wasFrequentCount = 0;
00090
00091 if( index->iteratorLock() )
00092 index->iteratorLock()->lock();
00093
00094 iterator = index->docListFileIterator();
00095 iterator->startIteration();
00096
00097 newlyFrequent = new indri::index::TermRecorder;
00098 oldFrequent = new indri::index::TermRecorder;
00099 oldInfrequent = new indri::utility::HashTable<lemur::api::TERMID_T, lemur::api::TERMID_T>;
00100
00101
00102 sequenceCount = 0;
00103 }
00104
00105 ~WriterIndexContext() {
00106 delete iterator;
00107
00108 if( index->iteratorLock() )
00109 index->iteratorLock()->unlock();
00110
00111 delete oldFrequent;
00112 delete newlyFrequent;
00113 delete oldInfrequent;
00114 delete bitmap;
00115 }
00116
00117 indri::index::DocListFileIterator* iterator;
00118 indri::index::TermBitmap* bitmap;
00119 indri::index::Index* index;
00120
00121 int wasFrequentCount;
00122 int wasInfrequentCount;
00123 int sequenceCount;
00124 indri::index::TermRecorder* newlyFrequent;
00125 indri::index::TermRecorder* oldFrequent;
00126 indri::utility::HashTable<lemur::api::TERMID_T, lemur::api::TERMID_T>* oldInfrequent;
00127
00128 indri::index::DeletedDocumentList* deletedList;
00129 lemur::api::DOCID_T documentOffset;
00130 };
00131
00132 typedef std::priority_queue<WriterIndexContext*,
00133 std::vector<WriterIndexContext*>,
00134 WriterIndexContext::greater> invertedlist_pqueue;
00135
00136 class IndexWriter {
00137 private:
00138 struct disktermdata_count_greater {
00139 bool operator () ( const DiskTermData* one, const DiskTermData* two ) const {
00140 return one->termData->corpus.totalCount > two->termData->corpus.totalCount;
00141 }
00142 };
00143
00144 struct disktermdata_alpha_less {
00145 bool operator () ( const DiskTermData* one, const DiskTermData* two ) const {
00146 return strcmp( one->termData->term, two->termData->term ) < 0;
00147 }
00148 };
00149
00150 struct keyfile_pair {
00151 indri::file::BulkTreeWriter* stringMap;
00152 indri::file::BulkTreeWriter* idMap;
00153 };
00154
00155 keyfile_pair _infrequentTerms;
00156 keyfile_pair _frequentTerms;
00157 indri::file::File _frequentTermsData;
00158
00159 indri::file::BulkTreeReader _infrequentTermsReader;
00160 indri::file::BulkTreeReader _frequentTermsReader;
00161
00162 indri::file::File _documentStatistics;
00163 indri::file::File _documentLengths;
00164
00165 indri::file::File _invertedFile;
00166 indri::file::File _directFile;
00167 indri::file::File _fieldsFile;
00168
00169 indri::file::SequentialWriteBuffer* _invertedOutput;
00170
00171 indri::utility::greedy_vector<indri::index::DiskTermData*> _topTerms;
00172 int _topTermsCount;
00173 indri::utility::Buffer _termDataBuffer;
00174
00175 int _isFrequentCount;
00176 lemur::api::DOCID_T _documentBase;
00177 indri::index::CorpusStatistics _corpus;
00178 std::vector<indri::index::Index::FieldDescription> _fields;
00179 std::vector<indri::index::FieldStatistics> _fieldData;
00180
00181 void _writeManifest( const std::string& path );
00182 void _writeSkip( indri::file::SequentialWriteBuffer* buffer, lemur::api::DOCID_T document, int length );
00183 void _writeBatch( indri::file::SequentialWriteBuffer* buffer, lemur::api::DOCID_T document, int length, indri::utility::Buffer& data );
00184
00185 void _writeFieldLists( std::vector<WriterIndexContext*>& contexts, const std::string& path );
00186 void _writeFieldList( indri::file::SequentialWriteBuffer& output, int fieldIndex, std::vector<indri::index::DocExtentListIterator*>& iterators, std::vector<WriterIndexContext*>& contexts );
00187
00188 void _pushInvertedLists( indri::utility::greedy_vector<WriterIndexContext*>& lists, invertedlist_pqueue& queue );
00189 void _fetchMatchingInvertedLists( indri::utility::greedy_vector<WriterIndexContext*>& lists, invertedlist_pqueue& queue );
00190 void _writeStatistics( indri::utility::greedy_vector<WriterIndexContext*>& lists, indri::index::TermData* termData, UINT64& startOffset );
00191 void _writeInvertedLists( std::vector<WriterIndexContext*>& contexts );
00192
00193 void _storeIdEntry( IndexWriter::keyfile_pair& pair, indri::index::DiskTermData* diskTermData );
00194 void _storeStringEntry( IndexWriter::keyfile_pair& pair, indri::index::DiskTermData* diskTermData );
00195
00196 void _storeTermEntry( IndexWriter::keyfile_pair& pair, indri::index::DiskTermData* diskTermData );
00197 void _storeFrequentTerms();
00198 void _addInvertedListData( indri::utility::greedy_vector<WriterIndexContext*>& lists, indri::index::TermData* termData, indri::utility::Buffer& listBuffer, UINT64& endOffset );
00199 void _storeMatchInformation( indri::utility::greedy_vector<WriterIndexContext*>& lists, int sequence, indri::index::TermData* termData, UINT64 startOffset, UINT64 endOffset );
00200
00201 lemur::api::TERMID_T _lookupTermID( indri::file::BulkTreeReader& keyfile, const char* term );
00202
00203 void _buildIndexContexts( std::vector<WriterIndexContext*>& contexts, std::vector<indri::index::Index*>& indexes, indri::index::DeletedDocumentList& deletedList );
00204 void _buildIndexContexts( std::vector<WriterIndexContext*>& contexts, std::vector<indri::index::Index*>& indexes, std::vector<indri::index::DeletedDocumentList*>& deletedLists, const std::vector<lemur::api::DOCID_T>& documentOffsets );
00205
00206 void _writeDirectLists( std::vector<WriterIndexContext*>& contexts );
00207 void _writeDirectLists( WriterIndexContext* context,
00208 indri::file::SequentialWriteBuffer* directOutput,
00209 indri::file::SequentialWriteBuffer* lengthsOutput,
00210 indri::file::SequentialWriteBuffer* dataOutput );
00211
00212 void _constructFiles( const std::string& path );
00213 void _closeFiles( const std::string& path );
00214 void _openTermsReaders( const std::string& path );
00215
00216 indri::index::TermTranslator* _buildTermTranslator( indri::file::BulkTreeReader& newInfrequentTerms,
00217 indri::file::BulkTreeReader& newFrequentTerms,
00218 indri::index::TermRecorder& oldFrequentTermsRecorder,
00219 indri::utility::HashTable<lemur::api::TERMID_T, lemur::api::TERMID_T>* oldInfrequent,
00220 indri::index::TermRecorder& newFrequentTermsRecorder,
00221 indri::index::Index* index,
00222 indri::index::TermBitmap* bitmap );
00223
00224
00225 char *_compressedData;
00226 char *_uncompressedData;
00227 int _dataSize;
00228
00229 enum {
00230 TOPDOCS_DOCUMENT_COUNT = 1000,
00231 FREQUENT_TERM_COUNT = 1000
00232 };
00233
00234 public:
00235 IndexWriter();
00236 void write( indri::index::Index& index,
00237 std::vector<indri::index::Index::FieldDescription>& fields,
00238 indri::index::DeletedDocumentList& deletedList,
00239 const std::string& fileName );
00240 void write( std::vector<indri::index::Index*>& indexes,
00241 std::vector<indri::index::Index::FieldDescription>& fields,
00242 indri::index::DeletedDocumentList& deletedList,
00243 const std::string& fileName );
00244 void write( std::vector<indri::index::Index*>& indexes,
00245 std::vector<indri::index::Index::FieldDescription>& fields,
00246 std::vector<indri::index::DeletedDocumentList*>& deletedLists,
00247 const std::vector<lemur::api::DOCID_T>& documentMaximums,
00248 const std::string& path );
00249 };
00250 }
00251 }
00252
00253 #endif // INDRI_INDEXWRITER_HPP