00001 /*========================================================================== 00002 * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 // 00014 // HTMLParser 00015 // 00016 // March 2004 -- metzler 00017 // 00018 #ifndef HTMLPARSER_HPP 00019 #define HTMLPARSER_HPP 00020 #include "indri/TaggedTextParser.hpp" 00021 00022 #ifndef MAX_URL_LENGTH 00023 #define MAX_URL_LENGTH 4096 00024 #endif 00025 namespace indri 00026 { 00027 namespace parse 00028 { 00029 00030 class HTMLParser : public TaggedTextParser { 00031 public: 00032 HTMLParser() { 00033 } 00034 00035 ~HTMLParser() { } 00036 00037 protected: 00038 virtual void initialize( TokenizedDocument* tokenized, indri::api::ParsedDocument* parsed ); 00039 virtual void cleanup( TokenizedDocument* tokenized, indri::api::ParsedDocument* parsed ); 00040 virtual void handleTag(TagEvent *te); 00041 char url[MAX_URL_LENGTH]; 00042 char base_url[MAX_URL_LENGTH]; 00043 void prepURL(char *s); 00044 bool normalizeURL(char *s); 00045 00046 tag_properties* _relativeUrlTag; 00047 tag_properties* _absoluteUrlTag; 00048 tag_properties* _anchorTag; 00049 00050 indri::utility::Buffer _urlBuffer; 00051 }; 00052 } 00053 } 00054 00055 #endif