00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <cstring>
00013 #include "common_headers.hpp"
00014
00015 #ifndef NULL
00016 #define NULL 0
00017 #endif
00018
00019 #ifndef _TEXTHANDLER_HPP
00020 #define _TEXTHANDLER_HPP
00021 #include <cstdio>
00022 #include "PropertyList.hpp"
00023 #include "Exception.hpp"
00024
00025 #define MAXWORDSIZE 1024
00026
00027 namespace lemur
00028 {
00029 namespace api
00030 {
00031
00033
00063 class TextHandler {
00064
00065 public:
00066 enum TokenType {BEGINDOC = 1, ENDDOC = 2, WORDTOK = 3,
00067 BEGINTAG = 4, ENDTAG = 5, SYMBOLTOK = 6};
00068 static const string category;
00069 static const string identifier;
00070
00071 TextHandler() {
00072 textHandler = NULL;
00073 prevHandler = NULL;
00074 buffer[MAXWORDSIZE-1] = '\0';
00075 cat = category;
00076 iden = identifier;
00077 }
00078 virtual ~TextHandler() {
00079 if (textHandler)
00080 textHandler->destroyPrevHandler();
00081 if (prevHandler)
00082 prevHandler->destroyTextHandler();
00083 }
00084
00086 virtual void setTextHandler(TextHandler * th) {
00087 textHandler = th;
00088 textHandler->setPrevHandler(this);
00089 }
00090
00092 virtual TextHandler * getTextHandler() {
00093 return textHandler;
00094 }
00095
00097 virtual TextHandler * getPrevHandler() {
00098 return prevHandler;
00099 }
00100
00101 virtual void foundToken(TokenType type,
00102 const char * token = NULL,
00103 const char * orig = NULL,
00104 lemur::parse::PropertyList * properties = NULL) {
00105 char * t = NULL;
00106
00107 if (token != NULL) {
00108 strncpy(buffer, token, MAXWORDSIZE - 1);
00109 t = buffer;
00110 }
00111
00112 switch (type) {
00113
00114 case BEGINDOC:
00115 t = handleBeginDoc(t, orig, properties);
00116 break;
00117 case ENDDOC:
00118 t = handleEndDoc(t, orig, properties);
00119 break;
00120 case WORDTOK:
00121 t = handleWord(t, orig, properties);
00122 break;
00123 case BEGINTAG:
00124 t = handleBeginTag(t, orig, properties);
00125 break;
00126 case ENDTAG:
00127 t = handleEndTag(t, orig, properties);
00128 break;
00129 case SYMBOLTOK:
00130 t = handleSymbol(t, orig, properties);
00131 break;
00132 }
00133
00134 if (textHandler != NULL) {
00135 textHandler->foundToken(type, t, orig, properties);
00136 }
00137 }
00138
00141 virtual char * handleBeginDoc(char * docno, const char * original,
00142 lemur::parse::PropertyList * list) {
00143 return handleDoc(docno);
00144 }
00147 virtual char * handleEndDoc(char * token, const char * original,
00148 lemur::parse::PropertyList * list) {
00149 handleEndDoc();
00150 return token;
00151 }
00154 virtual char * handleWord(char * word, const char * original,
00155 lemur::parse::PropertyList * list) {
00156 return handleWord(word);
00157 }
00159 virtual char * handleBeginTag(char * tag, const char * original,
00160 lemur::parse::PropertyList * list) {
00161 return tag;
00162 }
00164 virtual char * handleEndTag(char * tag, const char * original,
00165 lemur::parse::PropertyList * list) {
00166 return tag;
00167 }
00168
00171 virtual char * handleSymbol(char * symbol, const char * original,
00172 lemur::parse::PropertyList * list) {
00173 return handleSymbol(symbol);
00174 }
00175
00176
00177
00178
00180 virtual void foundDoc(char * docno) {
00181 foundToken(BEGINDOC, docno, docno);
00182 }
00183 virtual void foundDoc(char * docno, const char * original) {
00184 foundToken(BEGINDOC, docno, original);
00185 }
00187 virtual void foundWord(char * word) {
00188 foundToken(WORDTOK, word, word);
00189 }
00190 virtual void foundWord(char * word, const char * original) {
00191 foundToken(WORDTOK, word, original);
00192 }
00194 virtual void foundEndDoc() {
00195 foundToken(ENDDOC);
00196 }
00198 virtual void foundSymbol(const char * sym) {
00199 foundToken(SYMBOLTOK, sym, sym);
00200 }
00201
00203 virtual char * handleDoc(char * docno) { return docno; }
00205 virtual char * handleWord(char * word) { return word; }
00207 virtual void handleEndDoc() { }
00209 virtual char * handleSymbol(char * sym) { return sym; }
00210
00212 virtual string getCategory() const { return cat; }
00214 virtual string getIdentifier() const { return iden; }
00216 virtual void writePropertyList(lemur::parse::PropertyList* list) const{
00217 if (!list) {
00218 LEMUR_THROW(LEMUR_INTERNAL_ERROR, cat + " unable to save properties list");
00219 return;
00220 }
00221 lemur::parse::Property prop(cat);
00222 prop.setValue(iden);
00223 list->setProperty(&prop);
00224 }
00225
00226 protected:
00228 virtual void setPrevHandler(TextHandler * th) {
00229 prevHandler = th;
00230 }
00231
00233 virtual void destroyPrevHandler() {
00234 if (prevHandler)
00235 prevHandler = prevHandler->getPrevHandler();
00236 }
00237
00239 virtual void destroyTextHandler() {
00240 if (textHandler)
00241 textHandler = textHandler->getTextHandler();
00242 }
00243
00245 TextHandler * textHandler;
00247 TextHandler * prevHandler;
00248 string cat;
00249 string iden;
00250
00251 char buffer[MAXWORDSIZE];
00252 };
00253 }
00254 }
00255 #endif