00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef INDRI_INDEXENVIRONMENT_HPP
00020 #define INDRI_INDEXENVIRONMENT_HPP
00021
00022 #include <string>
00023 #include "indri/Parameters.hpp"
00024 #include "indri/HTMLParser.hpp"
00025 #include "indri/ConflationPattern.hpp"
00026 #include "indri/Repository.hpp"
00027 #include "indri/IndriParser.hpp"
00028 #include "indri/IndriTokenizer.hpp"
00029 #include "indri/DocumentIterator.hpp"
00030 #include "indri/AnchorTextAnnotator.hpp"
00031 #include "indri/OffsetAnnotationAnnotator.hpp"
00032 #include "indri/OffsetMetadataAnnotator.hpp"
00033 #include "indri/Transformation.hpp"
00034 #include "indri/DocumentIteratorFactory.hpp"
00035 #include "indri/ParserFactory.hpp"
00036 #include "indri/FileClassEnvironmentFactory.hpp"
00037 #include <map>
00038 namespace indri
00039 {
00041 namespace api
00042 {
00043
/// Callback interface for reporting indexing progress.
///
/// A pointer to an IndexStatus is accepted by IndexEnvironment::create() and
/// IndexEnvironment::open(). Derive from this struct and override status()
/// (or operator()) to observe the notifications; the default implementation
/// of status() does nothing.
struct IndexStatus {
  /// Values intended for the `code` argument of the callback.
  /// NOTE(review): the meanings below are inferred from the enumerator
  /// names only -- confirm against the code that raises them.
  enum action_code {
    FileOpen,       ///< presumably: a file is being opened
    FileSkip,       ///< presumably: a file is being skipped
    FileError,      ///< presumably: an error occurred while handling a file
    FileClose,      ///< presumably: a file is being closed
    DocumentCount   ///< presumably: periodic update of the document counters
  };

  /// Virtual destructor: this type is a polymorphic base handled through
  /// IndexStatus*, so derived callbacks must be destructible via that pointer.
  virtual ~IndexStatus() {}

  /// Function-call entry point; forwards every argument unchanged to status().
  /// @param code             action code (see action_code)
  /// @param documentPath     path associated with the notification
  /// @param error            error text, if any
  /// @param documentsIndexed counter value supplied by the caller
  /// @param documentsSeen    counter value supplied by the caller
  virtual void operator () ( int code, const std::string& documentPath, const std::string& error, int documentsIndexed, int documentsSeen ) {
    status( code, documentPath, error, documentsIndexed, documentsSeen );
  }

  /// Notification hook invoked by operator(). The default is a no-op;
  /// override it in a derived class to react to the notification.
  virtual void status( int code, const std::string& documentPath, const std::string& error, int documentsIndexed, int documentsSeen ) {}
};
00059
00066 class IndexEnvironment {
00067 private:
00068 IndexStatus* _callback;
00069 Parameters* _options;
00070
00071 std::string _repositoryPath;
00072 indri::collection::Repository _repository;
00073 int _documents;
00074 std::string _error;
00075
00076 std::string _offsetAnnotationsRoot;
00077 std::string _offsetMetadataRoot;
00078 std::string _anchorTextRoot;
00079 std::string _documentRoot;
00080
00081 Parameters _parameters;
00082 indri::parse::FileClassEnvironmentFactory _fileClassFactory;
00083
00084 indri::parse::AnchorTextAnnotator _annotator;
00085 indri::parse::OffsetAnnotationAnnotator _oa_annotator;
00086 indri::parse::OffsetMetadataAnnotator _om_annotator;
00087
00088 std::map<std::string, indri::parse::FileClassEnvironment*> _environments;
00089
00090 int _documentsIndexed;
00091 int _documentsSeen;
00092
00093 void _getParsingContext( indri::parse::Parser** parser,
00094 indri::parse::Tokenizer** tokenizer,
00095 indri::parse::DocumentIterator** iterator,
00096 indri::parse::Conflater** conflater,
00097 const std::string& extension );
00098
00099 std::vector<indri::parse::Transformation*> _createAnnotators( const std::string& fileName,
00100 const std::string& fileClass,
00101 indri::parse::Conflater** conflater);
00102
00103 ParsedDocument* _applyAnnotators( std::vector<indri::parse::Transformation*>& annotators,
00104 ParsedDocument* parsed );
00105
00106
00107 public:
00108 friend class QueryEnvironment;
00109
00110 IndexEnvironment();
00111 ~IndexEnvironment();
00112
00115 void setOffsetAnnotationsPath( const std::string& offsetAnnotationsRoot );
00116
00119 void setOffsetMetadataPath( const std::string& offsetMetadataRoot );
00120
00123 void setAnchorTextPath( const std::string& anchorTextRoot );
00124
00127 void setDocumentRoot( const std::string& documentRoot );
00128
00143 void addFileClass( const std::string& name,
00144 const std::string& iterator,
00145 const std::string& parser,
00146 const std::string& tokenizer,
00147 const std::string& startDocTag,
00148 const std::string& endDocTag,
00149 const std::string& endMetadataTag,
00150 const std::vector<std::string>& include,
00151 const std::vector<std::string>& exclude,
00152 const std::vector<std::string>& index,
00153 const std::vector<std::string>& metadata,
00154 const std::map<indri::parse::ConflationPattern*,std::string>& conflations );
00155
00158 indri::parse::FileClassEnvironmentFactory::Specification *getFileClassSpec( const std::string& name) {
00159 return _fileClassFactory.getFileClassSpec(name);
00160 }
00161
00164 void addFileClass( const indri::parse::FileClassEnvironmentFactory::Specification &spec ){
00165 _fileClassFactory.addFileClass(spec);
00166 }
00167
00173 void setIndexedFields( const std::vector<std::string>& fieldNames );
00174
00179 void setNumericField( const std::string& fieldName, bool isNumeric,
00180 const std::string &parserName = "");
00181
00185 void setOrdinalField( const std::string& fieldName, bool isOrdinal);
00186
00190 void setParentalField( const std::string& fieldName, bool isParental);
00191
00192
00201 void setMetadataIndexedFields( const std::vector<std::string>& forwardFieldNames, const std::vector<std::string>& backwardFieldNames );
00202
00205 void setStopwords( const std::vector<std::string>& stopwords );
00206
00209 void setStemmer( const std::string& stemmer );
00210
00213 void setMemory( UINT64 memory );
00214
00217 void setNormalization( bool flag );
00218
00221 void setStoreDocs( bool flag );
00222
00225 void setOffsetAnnotationIndexHint(indri::parse::OffsetAnnotationIndexHint hintType);
00226
00230 void create( const std::string& repositoryPath, IndexStatus* callback = 0 );
00231
00235 void open( const std::string& repositoryPath, IndexStatus* callback = 0 );
00236
00238 void close();
00239
00245 void addFile( const std::string& fileName );
00246
00250 void addFile( const std::string& fileName, const std::string& fileClass );
00251
00257 lemur::api::DOCID_T addString( const std::string& documentString,
00258 const std::string& fileClass,
00259 const std::vector<indri::parse::MetadataPair>& metadata );
00260
00271 lemur::api::DOCID_T addString( const std::string& documentString,
00272 const std::string& fileClass,
00273 const std::vector<indri::parse::MetadataPair>& metadata,
00274 const std::vector<indri::parse::TagExtent *> &tags );
00275
00278 lemur::api::DOCID_T addParsedDocument( ParsedDocument* document );
00279
00282 void deleteDocument( lemur::api::DOCID_T documentID );
00283
00285 int documentsIndexed();
00286
00290 int documentsSeen();
00291
00294 void compact();
00295
00302 static void merge( const std::string& outputIndex, const std::vector<std::string>& inputIndexes );
00303 };
00304 }
00305 }
00306
00307 #endif // INDRI_INDEXENVIRONMENT_HPP
00308