00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef INDRI_TAGGEDTEXTPARSER_HPP
00020 #define INDRI_TAGGEDTEXTPARSER_HPP
00021
00022 #include <stdio.h>
00023 #include <ctype.h>
00024 #include <string.h>
00025 #include <string>
00026 #include <vector>
00027 #include <map>
00028 #include "indri/HashTable.hpp"
00029 #include "indri/TagList.hpp"
00030 #include "indri/IndriParser.hpp"
00031 #include "indri/Buffer.hpp"
00032 #include "indri/TokenizedDocument.hpp"
00033 #include "string-set.h"
00034 #include "indri/ConflationPattern.hpp"
00035 #include "indri/Conflater.hpp"
00036
// Size limits exported by this header. Neither macro is referenced in the
// declarations below, so the descriptions are inferred from the names only.

// NOTE(review): presumably the maximum length, in characters, of a document
// number (DOCNO) string -- confirm against the code that uses it.
#define MAX_DOCNO_LENGTH 128

// NOTE(review): presumably the size, in bytes, of a scratch buffer used while
// parsing -- confirm against the code that uses it.
#define PARSER_MAX_BUF_SIZE 1024
00039
00040 namespace indri
00041 {
00042 namespace parse
00043 {
00044
00045 class TaggedTextParser : public Parser {
00046 public:
00047 TaggedTextParser();
00048 ~TaggedTextParser();
00049
00050 void setTags( const std::vector<std::string>& include,
00051 const std::vector<std::string>& exclude,
00052 const std::vector<std::string>& index,
00053 const std::vector<std::string>& metadata,
00054 const std::map<indri::parse::ConflationPattern*,std::string>& conflations );
00055
00056 indri::api::ParsedDocument* parse( TokenizedDocument* document );
00057
00058 void handle( TokenizedDocument* document );
00059 void setHandler( ObjectHandler<indri::api::ParsedDocument>& h );
00060
00061 protected:
00062 typedef indri::utility::HashTable<std::string, std::string> StrHashTable;
00063
00064
00065 virtual void initialize( TokenizedDocument* document, indri::api::ParsedDocument* parsed );
00066 virtual void cleanup( TokenizedDocument* document, indri::api::ParsedDocument* parsed );
00067
00068 void addTag(const char *s, const char* c, int pos) { tl->addTag(s, c, pos); }
00069 void endTag(const char *s, const char* c, int pos) { tl->endTag(s, c, pos); }
00070
00071 void addMetadataTag(const char* s, const char* c, int pos) { _metaList->addTag(s, c, pos); }
00072 void endMetadataTag(const char* s, const char* c, int pos) { _metaList->endTag(s, c, pos); }
00073
00074 Conflater* _p_conflater;
00075
00076
00077 TagList* tl;
00078 TagList* _metaList;
00079 indri::utility::Buffer _termBuffer;
00080
00081 struct tag_properties {
00082 const char* name;
00083 bool index;
00084 bool exclude;
00085 bool include;
00086 bool metadata;
00087 };
00088
00089 tag_properties* _findTag( std::string name );
00090 tag_properties* _buildTag( std::string name );
00091
00092 indri::utility::HashTable<const char*,tag_properties*> _tagTable;
00093
00094 virtual void handleTag( TagEvent* te );
00095
00096 const tag_properties* _startExcludeRegion;
00097 const tag_properties* _startIncludeRegion;
00098
00099 bool _exclude;
00100 bool _include;
00101 bool _defaultInclude;
00102
00103 unsigned int token_pos;
00104 unsigned int tokens_excluded;
00105
00106 indri::api::ParsedDocument _document;
00107
00108 private:
00109 ObjectHandler<indri::api::ParsedDocument>* _handler;
00110
00111 };
00112
00113 }
00114 }
00115
00116 #endif // INDRI_TAGGEDTEXTPARSER_HPP