00001 /*========================================================================== 00002 * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 // 00014 // URLTextAnnotator 00015 // 00016 // 23 September 2006 -- tds 00017 // 00018 // Reads the URL text from the metadata field, parses it, 00019 // and adds it to the parsed document text for indexing. 00020 // 00021 00022 #ifndef INDRI_URLTEXTANNOTATOR_HPP 00023 #define INDRI_URLTEXTANNOTATOR_HPP 00024 00025 #include <algorithm> 00026 #include "indri/Buffer.hpp" 00027 #include "indri/Transformation.hpp" 00028 #include "indri/TagExtent.hpp" 00029 #include "indri/ParsedDocument.hpp" 00030 00032 namespace indri 00033 { 00035 namespace parse 00036 { 00040 class URLTextAnnotator : public Transformation { 00041 indri::utility::Buffer _buffer; 00042 ObjectHandler<indri::api::ParsedDocument>* _handler; 00043 00044 public: 00045 URLTextAnnotator() { 00046 _handler = 0; 00047 } 00048 00049 ~URLTextAnnotator() { 00050 } 00051 00052 indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document ) { 00053 // find the url metadata pair 00054 indri::utility::greedy_vector<indri::parse::MetadataPair>::iterator iter; 00055 00056 iter = std::find_if( document->metadata.begin(), 00057 document->metadata.end(), 00058 indri::parse::MetadataPair::key_equal( "url" ) ); 00059 00060 // no need to stick around if there is no url 00061 if( iter == document->metadata.end() ) 00062 return document; 00063 00064 // need to copy this into the buffer and parse it: 00065 _buffer.clear(); 00066 _buffer.grow( iter->valueLength + 1 ); 00067 char* urlText = _buffer.write( iter->valueLength ); 00068 // pushes the buffer pointer, trash in urlText 00069 // memcpy( _buffer.write( iter->valueLength ), iter->value, iter->valueLength ); 00070 memcpy( urlText, iter->value, iter->valueLength ); 00071 *_buffer.write(1) = '\0'; 00072 00073 // now we're pointing to the copied urlText, so we can start parsing 00074 int urlStart = (int)document->terms.size(); 00075 char* c = urlText; 00076 bool lastSkipped = true; 00077 bool foundSlash = false; 00078 int remainingStart = -1; 00079 00080 // skip the beginning stuff (http://) 00081 for( c = urlText; *c; c++ ) { 00082 if( *c == '/' && c[1] && c[1] == '/' ) { 00083 urlText = c + 2; 00084 } 00085 } 00086 int cnt = 0; 00087 00088 // now, try to find the 00089 for( c = urlText; *c; c++ ) { 00090 if( *c >= 'A' && *c <= 'Z' || 00091 *c >= 'a' && *c <= 'z' || 00092 *c >= '0' && *c <= '9' ) 00093 { 00094 if( lastSkipped ) { 00095 lastSkipped = false; 00096 document->terms.push_back( c ); 00097 cnt++; 00098 } 00099 } else if( *c == '/' && remainingStart < 0 ) { 00100 *c = 0; 00101 lastSkipped = true; 00102 remainingStart = document->terms.size(); 00103 } else { 00104 lastSkipped = true; 00105 *c = 0; 00106 } 00107 } 00108 00109 // put in phony positions entries 00110 int tokEnd = document->positions.size() ? document->positions[document->positions.size()-1].end : 0; 00111 for (size_t n = document->terms.size()-cnt; n < document->terms.size(); n++) { 00112 TermExtent extent; 00113 extent.begin = tokEnd++; // hope this doesn't run off the end 00114 extent.end = tokEnd; 00115 document->positions.push_back( extent ); 00116 } 00117 00118 // the URL text is now parsed and stored in the document 00119 // all we need to do now is put some tags around the text. 00120 TagExtent *url = new TagExtent; 00121 url->begin = urlStart; 00122 url->end = document->terms.size(); 00123 url->name = "url"; 00124 url->number = 0; 00125 document->tags.push_back(url); 00126 00127 TagExtent *domain = new TagExtent; 00128 domain->begin = urlStart; 00129 domain->end = (remainingStart >= 0) ? remainingStart : document->terms.size(); 00130 domain->name = "urldomain"; 00131 domain->number = 0; 00132 document->tags.push_back(domain); 00133 00134 if( remainingStart > 0 ) { 00135 indri::parse::TagExtent *urlpath = new TagExtent; 00136 urlpath->begin = remainingStart; 00137 urlpath->end = document->terms.size(); 00138 urlpath->name = "urlpath"; 00139 urlpath->number = 0; 00140 document->tags.push_back(urlpath); 00141 } 00142 00143 return document; 00144 } 00145 00146 void setHandler( ObjectHandler<indri::api::ParsedDocument>& handler ) { 00147 _handler = &handler; 00148 } 00149 00150 void handle( indri::api::ParsedDocument* document ) { 00151 _handler->handle( transform( document ) ); 00152 } 00153 }; 00154 } 00155 } 00156 00157 #endif // INDRI_URLTEXTANNOTATOR_HPP 00158 00159