00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef INDRI_COMPRESSEDCOLLECTION_HPP
00020 #define INDRI_COMPRESSEDCOLLECTION_HPP
00021
00022 #include "indri/Collection.hpp"
00023 #include "string-set.h"
00024 #include <string>
00025 #include <vector>
00026 #include "Keyfile.hpp"
00027 #include "indri/Buffer.hpp"
00028 #include "indri/SequentialWriteBuffer.hpp"
00029 #include "indri/SequentialReadBuffer.hpp"
00030 #include "indri/HashTable.hpp"
00031 #include "indri/File.hpp"
00032 #include "indri/Mutex.hpp"
00033 #include "IndexTypes.hpp"
00034 #include "indri/DeletedDocumentList.hpp"
00035
// Pointer alias to struct z_stream_s. The elaborated type specifier lets this
// header name the pointer type without itself providing a definition of the
// struct; only code that dereferences the pointer needs the full definition.
// NOTE(review): presumably z_stream_s is the zlib stream state struct -- confirm
// against the implementation file.
00036 typedef struct z_stream_s* z_stream_p;
00037
00038 namespace indri
00039 {
00040 namespace collection
00041 {
00042
// CompressedCollection is a Collection subclass; this header only declares it,
// the member function definitions live elsewhere.
// NOTE(review): the class name and the z_stream_p member suggest that document
// data is kept compressed in _storage and located through the _lookup keyfile,
// with extra per-field keyfiles for metadata lookups -- confirm against the
// implementation file.
//
// The class holds raw pointers (_output, _stream, _strings and the Keyfile*
// values of both hash tables) and declares a destructor, but declares no copy
// constructor or copy assignment operator. Ownership of those pointers is not
// visible from this header -- TODO confirm before copying instances.
//
// None of the member functions below is const-qualified, so even the retrieval
// methods cannot be called through a const reference or pointer.
00043 class CompressedCollection : public Collection {
00044 private:
// Mutex member. NOTE(review): presumably serializes access to the state below;
// which methods take it is decided in the implementation.
00045 indri::thread::Mutex _lock;
00046
// NOTE(review): presumably the path passed to create/open -- confirm.
00047 std::string _basePath;
// Keyfile held by value. NOTE(review): presumably maps document ids to
// locations inside _storage -- confirm.
00048 lemur::file::Keyfile _lookup;
// File object held by value. NOTE(review): presumably the document data file.
00049 indri::file::File _storage;
// Write buffer, held by raw pointer. NOTE(review): presumably wraps _storage;
// allocation and release are not visible here.
00050 indri::file::SequentialWriteBuffer* _output;
// Scratch buffer. NOTE(review): presumably used while encoding term positions.
00051 indri::utility::Buffer _positionsBuffer;
// Pointer to a z_stream_s; see the z_stream_p typedef above the namespace.
00052 z_stream_p _stream;
00053
// Two tables mapping a C string key to a Keyfile pointer.
// NOTE(review): the keys are presumably metadata field names, with one keyfile
// per indexed field; the forward/reverse direction (id to value versus value
// to id) is inferred from the names only -- confirm.
00054 indri::utility::HashTable<const char*, lemur::file::Keyfile*> _reverseLookups;
00055 indri::utility::HashTable<const char*, lemur::file::Keyfile*> _forwardLookups;
// NOTE(review): presumably the string pool that owns the const char* keys used
// by the two hash tables above -- confirm.
00056 String_set* _strings;
00057
// Write helpers. Each takes the document being written plus keyLength and
// valueLength by non-const reference, so the callee is able to modify both.
// NOTE(review): presumably each helper emits one key/value record for the
// document and reports the sizes written -- confirm.
// _writeMetadataItem additionally takes an index i; NOTE(review): presumably
// the position of the metadata entry inside the document.
00058 void _writePositions( indri::api::ParsedDocument* document, int& keyLength, int& valueLength );
00059 void _writeMetadataItem( indri::api::ParsedDocument* document, int i, int& keyLength, int& valueLength );
00060 void _writeText( indri::api::ParsedDocument* document, int& keyLength, int& valueLength );
00061 void _writeContent( indri::api::ParsedDocument* document, int& keyLength, int& valueLength );
00062 void _writeContentLength( indri::api::ParsedDocument* document, int& keyLength, int& valueLength );
00063
// Takes a read-only byte range (positionData, positionDataLength) and a
// document to fill in. NOTE(review): presumably the inverse of _writePositions.
00064 void _readPositions( indri::api::ParsedDocument* document, const void* positionData, int positionDataLength );
00065
// Both take a deleted-document list and the keyfile to operate on.
// NOTE(review): presumably they drop the entries of deleted documents from that
// keyfile -- confirm.
00066 void _removeForwardLookups( indri::index::DeletedDocumentList& deletedList, lemur::file::Keyfile& keyfile );
00067 void _removeReverseLookups( indri::index::DeletedDocumentList& deletedList, lemur::file::Keyfile& keyfile );
00068
// Copy helpers taking a field name, a source keyfile (other), a deleted-document
// list and a document id offset.
// NOTE(review): presumably used by append to merge the lookups of another
// collection, skipping deleted documents and shifting ids by documentOffset.
00069 void _copyForwardLookup( const std::string& name,
00070 lemur::file::Keyfile& other,
00071 indri::index::DeletedDocumentList& deletedList,
00072 lemur::api::DOCID_T documentOffset );
00073
00074 void _copyReverseLookup( const std::string& name,
00075 lemur::file::Keyfile& other,
00076 indri::index::DeletedDocumentList& deletedList,
00077 lemur::api::DOCID_T documentOffset );
00078
00079
// Takes an input and an output buffer, an integer key, a 64-bit position and
// length, and a keyfile. NOTE(review): presumably copies one stored record from
// input to output and records its new location in lookup -- confirm.
00080 void _copyStorageEntry( indri::file::SequentialReadBuffer* input,
00081 indri::file::SequentialWriteBuffer* output,
00082 int key,
00083 UINT64 position,
00084 UINT64 length,
00085 lemur::file::Keyfile& lookup );
// Takes the same buffers plus a deleted-document list, an id offset, a source
// and a destination keyfile and the total storage length.
// NOTE(review): presumably walks the source records and copies the surviving
// ones, possibly through _copyStorageEntry -- confirm.
00086 void _copyStorageData( indri::file::SequentialReadBuffer* input,
00087 indri::file::SequentialWriteBuffer* output,
00088 indri::index::DeletedDocumentList& deletedList,
00089 lemur::api::DOCID_T documentOffset,
00090 lemur::file::Keyfile& sourceLookup,
00091 lemur::file::Keyfile& destLookup,
00092 UINT64 storageLength );
// Overload of _copyForwardLookup without the deletedList parameter.
// NOTE(review): whether this overload is defined or still used cannot be told
// from this header -- confirm, and remove the declaration if it is dead.
00093 void _copyForwardLookup( const std::string& name, lemur::file::Keyfile& other, lemur::api::DOCID_T documentOffset );
00094
// NOTE(review): presumably mirrors the storeDocs argument of create -- confirm.
00095 bool _storeDocs;
00096 public:
00097 CompressedCollection();
00098 ~CompressedCollection();
00099
// Three create overloads: file name only; file name plus one list of indexed
// fields; file name plus separate forward and reverse field lists and a
// storeDocs flag that defaults to true.
// NOTE(review): how the shorter overloads map onto the longest one is decided
// in the implementation.
00100 void create( const std::string& fileName );
00101 void create( const std::string& fileName, const std::vector<std::string>& indexedFields );
00102 void create( const std::string& fileName, const std::vector<std::string>& forwardIndexedFields, const std::vector<std::string>& reverseIndexedFields, bool storeDocs = true );
// The behavioral difference between reopen, open and openRead is not visible in
// this header. NOTE(review): openRead presumably opens read-only -- confirm.
00103 void reopen( const std::string& fileName );
00104 void open( const std::string& fileName );
00105 void openRead( const std::string& fileName );
00106 void close();
// Lookup by document id.
00107 bool exists(lemur::api::DOCID_T documentID);
// Returns a raw pointer; who owns and frees the returned ParsedDocument is not
// visible here -- TODO confirm. The same applies to the pointers returned in
// the vector from retrieveByMetadatum.
00108 indri::api::ParsedDocument* retrieve( lemur::api::DOCID_T documentID );
// Returns one metadata value by value for the given document id and attribute.
00109 std::string retrieveMetadatum( lemur::api::DOCID_T documentID, const std::string& attributeName );
// Both take an attribute name and a value and return all matches, either as
// document pointers or as document ids.
00110 std::vector<indri::api::ParsedDocument*> retrieveByMetadatum( const std::string& attributeName, const std::string& value );
00111 std::vector<lemur::api::DOCID_T> retrieveIDByMetadatum( const std::string& attributeName, const std::string& value );
00112
// Takes the id to store the document under and the document itself.
00113 void addDocument( lemur::api::DOCID_T documentID, indri::api::ParsedDocument* document );
// NOTE(review): presumably rewrites the collection without the documents named
// in deletedList -- confirm.
00114 void compact( indri::index::DeletedDocumentList& deletedList );
// Takes another CompressedCollection by non-const reference, a deleted-document
// list and an id offset. NOTE(review): presumably appends the surviving
// documents of other to this collection with ids shifted by documentOffset.
00115 void append( indri::collection::CompressedCollection& other, indri::index::DeletedDocumentList& deletedList, lemur::api::DOCID_T documentOffset );
00116
// Return field name lists by value. NOTE(review): presumably the keys of
// _forwardLookups and _reverseLookups respectively -- confirm.
00117 std::vector<std::string> forwardFields();
00118 std::vector<std::string> reverseFields();
00119 };
00120 }
00121 }
00122
00123 #endif // INDRI_COMPRESSEDCOLLECTION_HPP