00001 /*========================================================================== 00002 * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 // 00014 // TextParser 00015 // 00016 // 16 August 2004 -- tds 00017 // 00018 00019 #ifndef INDRI_TEXTPARSER_HPP 00020 #define INDRI_TEXTPARSER_HPP 00021 00022 #include <stdio.h> 00023 #include <ctype.h> 00024 #include <string.h> 00025 #include <string> 00026 #include <vector> 00027 #include "indri/IndriParser.hpp" 00028 #include "indri/Buffer.hpp" 00029 #include "indri/ConflationPattern.hpp" 00030 #include "string-set.h" 00031 namespace indri 00032 { 00033 namespace parse 00034 { 00035 00036 class TextParser : public Parser { 00037 public: 00038 TextParser(); 00039 ~TextParser(); 00040 00041 indri::api::ParsedDocument* parse( TokenizedDocument* document ); 00042 00043 void handle( TokenizedDocument* document ); 00044 void setHandler( ObjectHandler<indri::api::ParsedDocument>& h ); 00045 00046 void setTags( const std::vector<std::string>& include, 00047 const std::vector<std::string>& exclude, 00048 const std::vector<std::string>& index, 00049 const std::vector<std::string>& metadata, 00050 const std::map<ConflationPattern*, std::string>& conflations ); 00051 00052 protected: 00053 void writeToken(char* token); 00054 void writeToken(char *token, int start, int end); 00055 indri::utility::Buffer _termBuffer; 00056 00057 private: 00058 ObjectHandler<indri::api::ParsedDocument>* _handler; 00059 indri::api::ParsedDocument _document; 00060 }; 00061 } 00062 } 00063 00064 #endif // INDRI_TEXTPARSER_HPP 00065