00001 /*========================================================================== 00002 * Copyright (c) 2000-2004 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software (and below), and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 #ifndef _INDRITEXTHANDLER_HPP 00013 #define _INDRITEXTHANDLER_HPP 00014 00015 #include "Parser.hpp" 00016 #include "indri/ParsedDocument.hpp" 00017 #include "indri/IndexEnvironment.hpp" 00018 00019 namespace lemur 00020 { 00021 namespace parse 00022 { 00023 00029 #define DOCIDKEY "docno" 00030 00031 class IndriTextHandler : public lemur::api::TextHandler { 00032 00033 public: 00036 IndriTextHandler(const string &name, int memory, 00037 const lemur::api::Parser* p); 00038 ~IndriTextHandler(); 00039 00041 char * handleDoc(char * docno); 00043 void handleEndDoc(); 00045 char * handleWord(char * word, const char* original, PropertyList* list); 00046 char * handleBeginTag(char* tag, const char* orig, PropertyList* props); 00047 char * handleEndTag(char* tag, const char* orig, PropertyList* props); 00048 00049 protected: 00051 indri::api::IndexEnvironment env; 00053 indri::api::ParsedDocument document; 00055 indri::parse::MetadataPair docid; 00057 char* curdocno; 00059 int docbegin; 00061 const lemur::api::Parser* parser; 00063 }; 00064 } 00065 } 00066 00067 #endif 00068