Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

TaggedTextParser.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2005 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // TaggedTextParser
00015 //
00016 // 15 September 2005 -- revised by mwb
00017 //
00018 
00019 #ifndef INDRI_TAGGEDTEXTPARSER_HPP
00020 #define INDRI_TAGGEDTEXTPARSER_HPP
00021 
00022 #include <stdio.h>
00023 #include <ctype.h>
00024 #include <string.h>
00025 #include <string>
00026 #include <vector>
00027 #include <map>
00028 #include "indri/HashTable.hpp"
00029 #include "indri/TagList.hpp"
00030 #include "indri/IndriParser.hpp"
00031 #include "indri/Buffer.hpp"
00032 #include "indri/TokenizedDocument.hpp"
00033 #include "string-set.h"
00034 #include "indri/ConflationPattern.hpp"
00035 #include "indri/Conflater.hpp"
00036 // Numeric limits (by name: max docno length, parser buffer size); neither macro is used in this header. NOTE(review): presumably consumed by the parser implementations -- confirm.
00037 #define MAX_DOCNO_LENGTH 128
00038 #define PARSER_MAX_BUF_SIZE 1024
00039 
00040 namespace indri
00041 {
00042   namespace parse
00043   {
00044     // Parser subclass that turns a TokenizedDocument into an indri::api::ParsedDocument (see parse()); tags are tracked through two TagLists (tl and _metaList).
00045     class TaggedTextParser : public Parser {
00046     public:
00047       TaggedTextParser();
00048       ~TaggedTextParser();
00049       // Takes four lists of tag names plus a conflation map; the list names line up with the four flags in tag_properties below.
00050       void setTags( const std::vector<std::string>& include,
00051                     const std::vector<std::string>& exclude,
00052                     const std::vector<std::string>& index,
00053                     const std::vector<std::string>& metadata, 
00054                     const std::map<indri::parse::ConflationPattern*,std::string>& conflations );
00055       // Produces a ParsedDocument from the tokenized input. NOTE(review): ownership of the returned pointer is not visible here (possibly &_document) -- confirm in the .cpp.
00056       indri::api::ParsedDocument* parse( TokenizedDocument* document );
00057       // handle() accepts a tokenized document; setHandler() takes the ObjectHandler that presumably ends up in _handler -- confirm in the .cpp.
00058       void handle( TokenizedDocument* document );
00059       void setHandler( ObjectHandler<indri::api::ParsedDocument>& h );
00060 
00061     protected:
00062       typedef indri::utility::HashTable<std::string, std::string> StrHashTable; // string-to-string table alias; not used elsewhere in this header
00063 
00064       // Virtual hooks receiving both the input document and the parsed output. NOTE(review): presumably run before/after parsing -- confirm.
00065       virtual void initialize( TokenizedDocument* document, indri::api::ParsedDocument* parsed );
00066       virtual void cleanup( TokenizedDocument* document, indri::api::ParsedDocument* parsed );
00067       // Inline forwarders to the main tag list (tl).
00068       void addTag(const char *s, const char* c, int pos) { tl->addTag(s, c, pos); }
00069       void endTag(const char *s, const char* c, int pos) { tl->endTag(s, c, pos); }
00070       // Inline forwarders to the metadata tag list (_metaList).
00071       void addMetadataTag(const char* s, const char* c, int pos) { _metaList->addTag(s, c, pos); }
00072       void endMetadataTag(const char* s, const char* c, int pos) { _metaList->endTag(s, c, pos); }
00073 
00074       Conflater* _p_conflater; // NOTE(review): raw pointer; ownership is not visible in this header
00075 
00076       // tag lists: tl backs addTag/endTag, _metaList backs addMetadataTag/endMetadataTag
00077       TagList* tl;
00078       TagList* _metaList;
00079       indri::utility::Buffer _termBuffer;
00080       // Per-tag record: a name plus one flag per tag category (index / exclude / include / metadata).
00081       struct tag_properties {
00082         const char* name;
00083         bool index;
00084         bool exclude;
00085         bool include;
00086         bool metadata;
00087       };
00088       // Helpers returning tag_properties pointers for a tag name (name is passed by value). NOTE(review): presumably lookup vs. create-on-miss against _tagTable -- confirm.
00089       tag_properties* _findTag( std::string name );
00090       tag_properties* _buildTag( std::string name );
00091       // Maps const char* keys to tag_properties pointers. NOTE(review): whether keys compare by pointer or by string contents depends on HashTable -- confirm.
00092       indri::utility::HashTable<const char*,tag_properties*> _tagTable;
00093 
00094       virtual void handleTag( TagEvent* te );
00095       // NOTE(review): by name, the tags that opened the current exclude/include region -- confirm in the .cpp.
00096       const tag_properties* _startExcludeRegion;
00097       const tag_properties* _startIncludeRegion;
00098   
00099       bool _exclude;
00100       bool _include;
00101       bool _defaultInclude;
00102   
00103       unsigned int token_pos;
00104       unsigned int tokens_excluded;
00105 
00106       indri::api::ParsedDocument _document; // ParsedDocument held by value inside the parser
00107   
00108     private:
00109       ObjectHandler<indri::api::ParsedDocument>* _handler; // handler pointer; same ObjectHandler type that setHandler() accepts by reference
00110 
00111     };
00112 
00113   }
00114 }
00115 
00116 #endif // INDRI_TAGGEDTEXTPARSER_HPP

Generated on Tue Jun 15 11:02:56 2010 for Lemur by doxygen 1.3.4