00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 #ifndef _PARSER_HPP 00013 #define _PARSER_HPP 00014 00015 #include "TextHandler.hpp" 00016 #include "WordSet.hpp" 00017 namespace lemur 00018 { 00019 namespace api 00020 { 00023 00030 class Parser : public TextHandler { 00031 public: 00032 static const string category; 00033 static const string identifier; 00034 00035 Parser(); 00036 virtual ~Parser(); 00037 00040 virtual void parse(const string &filename); 00041 00044 virtual void parseFile(const string &filename) = 0; 00045 00047 virtual void parseBuffer(char * buf, int len) = 0; 00048 00052 virtual void setAcroList(const lemur::utility::WordSet * acronyms); 00053 00055 virtual void setAcroList(string filename); 00056 00058 virtual long fileTell() const = 0; 00059 00061 virtual long getDocBytePos() const { return docpos; } 00062 00064 virtual const string getParseFile() const { return parsefile; } 00065 00066 protected: 00069 bool isAcronym(const char * word); 00071 void clearAcros(); 00072 00073 long docpos; 00074 00075 string parsefile; 00076 private: 00078 lemur::utility::WordSet * myacros; 00079 const lemur::utility::WordSet* borrowedacros; 00080 }; 00081 } 00082 } 00083 00084 #endif