00001 /*========================================================================== 00002 * Copyright (c) 2004 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 // 00013 // DiskIndex 00014 // 00015 // 8 December 2004 -- tds 00016 // 00017 00018 #ifndef INDRI_DISKINDEX_HPP 00019 #define INDRI_DISKINDEX_HPP 00020 00021 #include "indri/Index.hpp" 00022 #include "indri/File.hpp" 00023 #include "Keyfile.hpp" 00024 #include "indri/TermData.hpp" 00025 #include "indri/FieldStatistics.hpp" 00026 #include "indri/CorpusStatistics.hpp" 00027 #include "indri/DiskTermData.hpp" 00028 #include <vector> 00029 #include <string> 00030 #include "indri/BulkTree.hpp" 00031 #include "indri/SequentialReadBuffer.hpp" 00032 00033 namespace indri { 00034 namespace index { 00035 class DiskIndex : public Index { 00036 private: 00037 indri::thread::Mutex _lock; 00038 00039 std::string _path; 00040 00041 indri::file::BulkTreeReader _frequentStringToTerm; 00042 indri::file::BulkTreeReader _infrequentStringToTerm; 00043 00044 indri::file::BulkTreeReader _frequentIdToTerm; 00045 indri::file::BulkTreeReader _infrequentIdToTerm; 00046 00047 indri::file::File _frequentTermsData; 00048 00049 indri::file::File _documentLengths; 00050 indri::file::File _documentStatistics; 00051 00052 indri::file::File _invertedFile; 00053 indri::file::File _directFile; 00054 indri::file::File _fieldsFile; 00055 00056 indri::file::SequentialReadBuffer _lengthsBuffer; 00057 00058 std::vector<FieldStatistics> _fieldData; 00059 lemur::api::DOCID_T _documentBase; 00060 int _infrequentTermBase; 00061 00062 indri::index::DiskTermData* _fetchTermData( lemur::api::TERMID_T termID ); 00063 indri::index::DiskTermData* _fetchTermData( const char* termString ); 00064 00065 CorpusStatistics _corpusStatistics; 00066 void _readManifest( const std::string& manifestPath ); 00067 00068 public: 00069 DiskIndex() : _lengthsBuffer(_documentLengths) {} 00070 00071 void open( const std::string& base, const std::string& relative ); 00072 void close(); 00073 00074 const std::string& path(); 00075 lemur::api::DOCID_T documentBase(); 00076 00077 int field( const char* fieldName ); 00078 int field( const std::string& fieldName ); 00079 std::string field( int fieldID ); 00080 00081 lemur::api::TERMID_T term( const char* term ); 00082 lemur::api::TERMID_T term( const std::string& term ); 00083 std::string term( lemur::api::TERMID_T termID ); 00084 00085 int documentLength( lemur::api::DOCID_T documentID ); 00086 UINT64 documentCount(); 00087 UINT64 documentCount( const std::string& term ); 00088 lemur::api::DOCID_T documentMaximum(); 00089 UINT64 uniqueTermCount(); 00090 00091 UINT64 termCount( const std::string& term ); 00092 UINT64 termCount(); 00093 00094 UINT64 fieldTermCount( const std::string& field ); 00095 UINT64 fieldTermCount( const std::string& field, const std::string& term ); 00096 00097 UINT64 fieldDocumentCount( const std::string& field ); 00098 UINT64 fieldDocumentCount( const std::string& field, const std::string& term ); 00099 00100 // 00101 // Lists 00102 // 00103 00104 DocListIterator* docListIterator( lemur::api::TERMID_T termID ); 00105 DocListIterator* docListIterator( const std::string& term ); 00106 DocListFileIterator* docListFileIterator(); 00107 DocExtentListIterator* fieldListIterator( int fieldID ); 00108 DocExtentListIterator* fieldListIterator( const std::string& field ); 00109 const TermList* termList( lemur::api::DOCID_T documentID ); 00110 TermListFileIterator* termListFileIterator(); 00111 00112 VocabularyIterator* vocabularyIterator(); 00113 VocabularyIterator* frequentVocabularyIterator(); 00114 VocabularyIterator* infrequentVocabularyIterator(); 00115 00116 DocumentDataIterator* documentDataIterator(); 00117 00118 indri::thread::Lockable* iteratorLock(); 00119 indri::thread::Lockable* statisticsLock(); 00120 // cache limit 00121 enum { 00123 // 250,000 documents/megabyte. 00124 MAX_DOCLENGTHS_CACHE = 20*1024*1024 00125 }; 00126 }; 00127 } 00128 } 00129 00130 #endif // INDRI_DISKINDEX_HPP