00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef INDRI_REPOSITORY_HPP
00020 #define INDRI_REPOSITORY_HPP
00021
00022 #include "indri/Parameters.hpp"
00023 #include "indri/Transformation.hpp"
00024 #include "indri/MemoryIndex.hpp"
00025 #include "indri/DiskIndex.hpp"
00026 #include "indri/ref_ptr.hpp"
00027 #include "indri/DeletedDocumentList.hpp"
00028 #include "indri/PriorListIterator.hpp"
00029 #include <string>
00030
00031 #define MERGE_FILE_LIMIT 768
00032 namespace indri
00033 {
00035 namespace collection
00036 {
00037
00041 class Repository {
00042 public:
00043 struct Load {
00044 float one;
00045 float five;
00046 float fifteen;
00047 };
00048
00049 struct Field {
00050 std::string name;
00051 std::string parserName;
00052 bool numeric;
00053 bool ordinal;
00054 bool parental;
00055 };
00056
00057 typedef std::vector<indri::index::Index*> index_vector;
00058 typedef indri::atomic::ref_ptr<index_vector> index_state;
00059
00060 private:
00061 friend class RepositoryMaintenanceThread;
00062 friend class RepositoryLoadThread;
00063
00064 class RepositoryMaintenanceThread* _maintenanceThread;
00065 class RepositoryLoadThread* _loadThread;
00066
00067 indri::thread::Mutex _stateLock;
00068 std::vector<index_state> _states;
00069 index_state _active;
00070 int _indexCount;
00071
00072
00073 volatile bool _maintenanceRunning;
00074 volatile bool _loadThreadRunning;
00075
00076 indri::thread::Mutex _addLock;
00077
00078 class CompressedCollection* _collection;
00079 indri::index::DeletedDocumentList _deletedList;
00080
00081 indri::api::Parameters _parameters;
00082 std::vector<indri::parse::Transformation*> _transformations;
00083 std::vector<Field> _fields;
00084 std::vector<indri::index::Index::FieldDescription> _indexFields;
00085 std::map<std::string, indri::file::File*> _priorFiles;
00086
00087 std::string _path;
00088 bool _readOnly;
00089
00090 INT64 _memory;
00091
00092 UINT64 _lastThrashTime;
00093 volatile bool _thrashing;
00094
00095 enum { LOAD_MINUTES = 15, LOAD_MINUTE_FRACTION = 12 };
00096
00097 indri::atomic::value_type _queryLoad[ LOAD_MINUTES * LOAD_MINUTE_FRACTION ];
00098 indri::atomic::value_type _documentLoad[ LOAD_MINUTES * LOAD_MINUTE_FRACTION ];
00099
00100 static std::vector<std::string> _fieldNames( indri::api::Parameters& parameters );
00101 static std::string _stemmerName( indri::api::Parameters& parameters );
00102
00103 static void _mergeClosedIndexes( const std::string& outputPath,
00104 const std::vector<std::string>& repositories,
00105 const std::vector<indri::collection::Repository::Field>& indexFields,
00106 const std::vector<lemur::api::DOCID_T>& documentMaximums );
00107 static void _writeMergedManifest( const std::string& path, indri::api::Parameters& firstManifest );
00108 static void _mergeBitmaps( const std::string& outputPath, const std::vector<std::string>& repositories, const std::vector<lemur::api::DOCID_T>& documentCounts );
00109 static void _mergeCompressedCollections( const std::string& outputPath,
00110 const std::vector<std::string>& repositories,
00111 const std::vector<lemur::api::DOCID_T>& documentMaximums );
00112 static void _cleanAndCreateDirectory( const std::string& path );
00113
00114 void _writeParameters( const std::string& path );
00115 void _checkpoint();
00116 void _incrementLoad();
00117 void _countDocumentAdd();
00118 Load _computeLoad( indri::atomic::value_type* loadArray );
00119
00120 void _openPriors( const std::string& path );
00121 void _closePriors();
00122
00123 void _buildFields();
00124 void _buildChain( indri::api::Parameters& parameters,
00125 indri::api::Parameters *options );
00126
00127 void _copyParameters( indri::api::Parameters& options );
00128
00129 void _removeStates( std::vector<index_state>& toRemove );
00130 void _remove( const std::string& path );
00131
00132 void _openIndexes( indri::api::Parameters& params, const std::string& parentPath );
00133 std::vector<index_state> _statesContaining( std::vector<indri::index::Index*>& indexes );
00134 bool _stateContains( index_state& state, std::vector<indri::index::Index*>& indexes );
00135 void _swapState( std::vector<indri::index::Index*>& oldIndexes, indri::index::Index* newIndex );
00136 void _closeIndexes();
00137 static std::vector<indri::index::Index::FieldDescription> _fieldsForIndex( const std::vector<Repository::Field>& _fields );
00138 void _merge( index_state& state );
00139 indri::index::Index* _mergeStage( index_state& state );
00140 UINT64 _mergeMemory( const std::vector<indri::index::Index*>& indexes );
00141 unsigned int _mergeFiles( const std::vector<indri::index::Index*>& indexes );
00142
00143
00145 void _merge();
00147 void _write();
00149 void _trim();
00150
00151 void _startThreads();
00152 void _stopThreads();
00153
00154 void _setThrashing( bool flag );
00155 UINT64 _timeSinceThrashing();
00156 void _addMemoryIndex();
00157
00158 public:
00159 Repository() {
00160 _collection = 0;
00161 _readOnly = false;
00162 _lastThrashTime = 0;
00163 _thrashing = false;
00164 memset( (void*) _documentLoad, 0, sizeof(indri::atomic::value_type)*LOAD_MINUTES*LOAD_MINUTE_FRACTION );
00165 memset( (void*) _queryLoad, 0, sizeof(indri::atomic::value_type)*LOAD_MINUTES*LOAD_MINUTE_FRACTION );
00166 }
00167
00168 ~Repository() {
00169 close();
00170 }
00174 int addDocument( indri::api::ParsedDocument* document, bool inCollection = true );
00177 void deleteDocument( int documentID );
00179 const std::vector<Field>& fields() const;
00181 std::vector<std::string> tags() const;
00183 std::vector<std::string> priors() const;
00187 std::string processTerm( const std::string& term );
00189 class CompressedCollection* collection();
00193 void create( const std::string& path, indri::api::Parameters* options = 0 );
00197 void open( const std::string& path, indri::api::Parameters* options = 0 );
00201 void openRead( const std::string& path, indri::api::Parameters* options = 0 );
00205 static bool exists( const std::string& path );
00207 void close();
00208
00211 void compact();
00212
00214 index_state indexes();
00215
00217 indri::collection::PriorListIterator* priorListIterator( const std::string& priorName );
00218
00220 void countQuery();
00221
00223 void write();
00224
00226 void merge();
00227
00229 static void makeEmpty( const std::string& path );
00230
00232 static void merge( const std::string& outputIndex, const std::vector<std::string>& inputIndexes );
00233
00235 indri::index::DeletedDocumentList& deletedList();
00236
00238 Load queryLoad();
00239
00241 Load documentLoad();
00242 };
00243 }
00244 }
00245
00246 #endif // INDRI_REPOSITORY_HPP
00247