00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 #ifndef INDRI_REPOSITORY_HPP
00020 #define INDRI_REPOSITORY_HPP
00021 
00022 #include "indri/Parameters.hpp"
00023 #include "indri/Transformation.hpp"
00024 #include "indri/MemoryIndex.hpp"
00025 #include "indri/DiskIndex.hpp"
00026 #include "indri/ref_ptr.hpp"
00027 #include "indri/DeletedDocumentList.hpp"
00028 #include "indri/PriorListIterator.hpp"
00029 #include <string>
00030 
// NOTE(review): this macro is not referenced anywhere in this header.
// Presumably it caps the number of files an index merge may involve
// (compare Repository::_mergeFiles()) -- confirm at its point of use.
00031 #define MERGE_FILE_LIMIT 768 
00032 namespace indri
00033 {
00035   namespace collection
00036   {
00037     
00041     class Repository {
00042     public:
00043       struct Load {
00044         float one;
00045         float five;
00046         float fifteen;
00047       };
00048 
00049       struct Field {
00050         std::string name;
00051         std::string parserName;
00052         bool numeric;
00053         bool ordinal;
00054         bool parental;
00055       };
00056 
00057       typedef std::vector<indri::index::Index*> index_vector;
00058       typedef indri::atomic::ref_ptr<index_vector> index_state;
00059 
00060     private:
00061       friend class RepositoryMaintenanceThread;
00062       friend class RepositoryLoadThread;
00063 
00064       class RepositoryMaintenanceThread* _maintenanceThread;
00065       class RepositoryLoadThread* _loadThread;
00066 
00067       indri::thread::Mutex _stateLock; 
00068       std::vector<index_state> _states;
00069       index_state _active;
00070       int _indexCount;
00071 
00072       
00073       volatile bool _maintenanceRunning;
00074       volatile bool _loadThreadRunning;
00075 
00076       indri::thread::Mutex _addLock; 
00077 
00078       class CompressedCollection* _collection;
00079       indri::index::DeletedDocumentList _deletedList;
00080 
00081       indri::api::Parameters _parameters;
00082       std::vector<indri::parse::Transformation*> _transformations;
00083       std::vector<Field> _fields;
00084       std::vector<indri::index::Index::FieldDescription> _indexFields;
00085       std::map<std::string, indri::file::File*> _priorFiles;
00086 
00087       std::string _path;
00088       bool _readOnly;
00089 
00090       INT64 _memory;
00091 
00092       UINT64 _lastThrashTime;
00093       volatile bool _thrashing;
00094 
00095       enum { LOAD_MINUTES = 15, LOAD_MINUTE_FRACTION = 12 };
00096 
00097       indri::atomic::value_type _queryLoad[ LOAD_MINUTES * LOAD_MINUTE_FRACTION ];
00098       indri::atomic::value_type _documentLoad[ LOAD_MINUTES * LOAD_MINUTE_FRACTION ];
00099 
00100       static std::vector<std::string> _fieldNames( indri::api::Parameters& parameters );
00101       static std::string _stemmerName( indri::api::Parameters& parameters );
00102 
00103       static void _mergeClosedIndexes( const std::string& outputPath,
00104                                        const std::vector<std::string>& repositories,
00105                                        const std::vector<indri::collection::Repository::Field>& indexFields,
00106                                        const std::vector<lemur::api::DOCID_T>& documentMaximums );
00107       static void _writeMergedManifest( const std::string& path, indri::api::Parameters& firstManifest );
00108       static void _mergeBitmaps( const std::string& outputPath, const std::vector<std::string>& repositories, const std::vector<lemur::api::DOCID_T>& documentCounts );
00109       static void _mergeCompressedCollections( const std::string& outputPath,
00110                                                                  const std::vector<std::string>& repositories,
00111                                                                  const std::vector<lemur::api::DOCID_T>& documentMaximums );
00112       static void _cleanAndCreateDirectory( const std::string& path );
00113 
00114       void _writeParameters( const std::string& path );
00115       void _checkpoint();
00116       void _incrementLoad();
00117       void _countDocumentAdd();
00118       Load _computeLoad( indri::atomic::value_type* loadArray );
00119       
00120       void _openPriors( const std::string& path );
00121       void _closePriors();
00122 
00123       void _buildFields();
00124       void _buildChain( indri::api::Parameters& parameters,
00125                         indri::api::Parameters *options );
00126 
00127       void _copyParameters( indri::api::Parameters& options );
00128 
00129       void _removeStates( std::vector<index_state>& toRemove );
00130       void _remove( const std::string& path );
00131 
00132       void _openIndexes( indri::api::Parameters& params, const std::string& parentPath );
00133       std::vector<index_state> _statesContaining( std::vector<indri::index::Index*>& indexes );
00134       bool _stateContains( index_state& state, std::vector<indri::index::Index*>& indexes );
00135       void _swapState( std::vector<indri::index::Index*>& oldIndexes, indri::index::Index* newIndex );
00136       void _closeIndexes();
00137       static std::vector<indri::index::Index::FieldDescription> _fieldsForIndex( const std::vector<Repository::Field>& _fields );
00138       void _merge( index_state& state );
00139       indri::index::Index* _mergeStage( index_state& state );
00140       UINT64 _mergeMemory( const std::vector<indri::index::Index*>& indexes );
00141       unsigned int _mergeFiles( const std::vector<indri::index::Index*>& indexes );
00142 
00143       
00145       void _merge(); 
00147       void _write();
00149       void _trim();
00150 
00151       void _startThreads();
00152       void _stopThreads();
00153 
00154       void _setThrashing( bool flag );
00155       UINT64 _timeSinceThrashing();
00156       void _addMemoryIndex();
00157 
00158     public:
00159       Repository() {
00160         _collection = 0;
00161         _readOnly = false;
00162         _lastThrashTime = 0;
00163         _thrashing = false;
00164         memset( (void*) _documentLoad, 0, sizeof(indri::atomic::value_type)*LOAD_MINUTES*LOAD_MINUTE_FRACTION );
00165         memset( (void*) _queryLoad, 0, sizeof(indri::atomic::value_type)*LOAD_MINUTES*LOAD_MINUTE_FRACTION );
00166       }
00167 
00168       ~Repository() {
00169         close();
00170       }
00174       int addDocument( indri::api::ParsedDocument* document, bool inCollection  = true );
00177       void deleteDocument( int documentID );
00179       const std::vector<Field>& fields() const;
00181       std::vector<std::string> tags() const;
00183       std::vector<std::string> priors() const;
00187       std::string processTerm( const std::string& term );
00189       class CompressedCollection* collection();
00193       void create( const std::string& path, indri::api::Parameters* options = 0 );
00197       void open( const std::string& path, indri::api::Parameters* options = 0 );
00201       void openRead( const std::string& path, indri::api::Parameters* options = 0 );
00205       static bool exists( const std::string& path );
00207       void close();
00208 
00211       void compact();
00212 
00214       index_state indexes();
00215       
00217       indri::collection::PriorListIterator* priorListIterator( const std::string& priorName );
00218 
00220       void countQuery();
00221 
00223       void write();
00224 
00226       void merge(); 
00227       
00229       static void makeEmpty( const std::string& path );
00230 
00232       static void merge( const std::string& outputIndex, const std::vector<std::string>& inputIndexes );
00233 
00235       indri::index::DeletedDocumentList& deletedList();
00236 
00238       Load queryLoad();
00239 
00241       Load documentLoad();
00242     };
00243   }
00244 }
00245 
00246 #endif // INDRI_REPOSITORY_HPP
00247