Lemur: URLTextAnnotator.hpp Source File

00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // URLTextAnnotator
00015 //
00016 // 23 September 2006 -- tds
00017 // 
00018 // Reads the URL text from the metadata field, parses it,
00019 // and adds it to the parsed document text for indexing.
00020 //
00021 
00022 #ifndef INDRI_URLTEXTANNOTATOR_HPP
00023 #define INDRI_URLTEXTANNOTATOR_HPP
00024 
00025 #include <algorithm>
00026 #include "indri/Buffer.hpp"
00027 #include "indri/Transformation.hpp"
00028 #include "indri/TagExtent.hpp"
00029 #include "indri/ParsedDocument.hpp"
00030 
00032 namespace indri
00033 {
00035   namespace parse
00036   {
00040     class URLTextAnnotator : public Transformation {
00041       indri::utility::Buffer _buffer;
00042       ObjectHandler<indri::api::ParsedDocument>* _handler;
00043 
00044     public:
00045       URLTextAnnotator() {
00046         _handler = 0;
00047       }
00048 
00049       ~URLTextAnnotator() {
00050       }
00051 
00052       indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document ) {
00053         // find the url metadata pair
00054         indri::utility::greedy_vector<indri::parse::MetadataPair>::iterator iter;
00055        
00056         iter = std::find_if( document->metadata.begin(),
00057                              document->metadata.end(),
00058                              indri::parse::MetadataPair::key_equal( "url" ) );   
00059        
00060         // no need to stick around if there is no url                     
00061         if( iter == document->metadata.end() )
00062           return document;                          
00063         
00064         // need to copy this into the buffer and parse it:
00065         _buffer.clear();                                   
00066         _buffer.grow( iter->valueLength + 1 );
00067         char* urlText = _buffer.write( iter->valueLength );                  
00068         // pushes the buffer pointer, trash in urlText
00069         //        memcpy( _buffer.write( iter->valueLength ), iter->value, iter->valueLength );
00070         memcpy( urlText, iter->value, iter->valueLength );
00071         *_buffer.write(1) = '\0';
00072         
00073         // now we're pointing to the copied urlText, so we can start parsing
00074         int urlStart = (int)document->terms.size();
00075         char* c = urlText;    
00076         bool lastSkipped = true; 
00077         bool foundSlash = false;
00078         int remainingStart = -1;
00079         
00080         // skip the beginning stuff (http://)
00081         for( c = urlText; *c; c++ ) {
00082           if( *c == '/' && c[1] && c[1] == '/' ) {
00083             urlText = c + 2;                            
00084           }
00085         }
00086         int cnt = 0;
00087         
00088         // now, try to find the 
00089         for( c = urlText; *c; c++ ) {
00090           if( *c >= 'A' && *c <= 'Z' ||
00091               *c >= 'a' && *c <= 'z' ||
00092               *c >= '0' && *c <= '9' ) 
00093           {
00094             if( lastSkipped ) {
00095               lastSkipped = false;
00096               document->terms.push_back( c );
00097               cnt++;
00098             }
00099           } else if( *c == '/' && remainingStart < 0 ) {
00100             *c = 0;
00101             lastSkipped = true;
00102             remainingStart = document->terms.size();
00103           } else {
00104             lastSkipped = true;
00105             *c = 0;
00106           }            
00107         }
00108 
00109         // put in phony positions entries
00110         int tokEnd = document->positions.size() ? document->positions[document->positions.size()-1].end : 0;
00111         for (size_t n = document->terms.size()-cnt; n < document->terms.size(); n++) {
00112           TermExtent extent;
00113           extent.begin = tokEnd++; // hope this doesn't run off the end
00114           extent.end = tokEnd;
00115           document->positions.push_back( extent );
00116         }
00117 
00118         // the URL text is now parsed and stored in the document
00119         // all we need to do now is put some tags around the text.
00120         TagExtent *url = new TagExtent;
00121         url->begin = urlStart;
00122         url->end = document->terms.size();
00123         url->name = "url";
00124         url->number = 0;
00125         document->tags.push_back(url);
00126                         
00127         TagExtent *domain = new TagExtent;
00128         domain->begin = urlStart;
00129         domain->end = (remainingStart >= 0) ? remainingStart : document->terms.size();
00130         domain->name = "urldomain";      
00131         domain->number = 0;
00132         document->tags.push_back(domain);
00133         
00134         if( remainingStart > 0 ) {
00135           indri::parse::TagExtent *urlpath = new TagExtent;
00136           urlpath->begin = remainingStart;
00137           urlpath->end = document->terms.size();
00138           urlpath->name = "urlpath";
00139           urlpath->number = 0;
00140           document->tags.push_back(urlpath);
00141         }
00142   
00143         return document;
00144       }
00145 
00146       void setHandler( ObjectHandler<indri::api::ParsedDocument>& handler ) {
00147         _handler = &handler;
00148       }
00149 
00150       void handle( indri::api::ParsedDocument* document ) {
00151         _handler->handle( transform( document ) );
00152       }
00153     };
00154   }
00155 }
00156 
00157 #endif // INDRI_URLTEXTANNOTATOR_HPP
00158                                       
00159