Lemur: AnchorTextAnnotator.hpp Source File

00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // AnchorTextAnnotator
00015 //
00016 // 25 May 2004 -- tds
00017 //
00018 // Reads anchor text in from files created by the 
00019 // combiner, and adds the text to the end of the
00020 // parsed document
00021 //
00022 
00023 #ifndef INDRI_ANCHORTEXTANNOTATOR_HPP
00024 #define INDRI_ANCHORTEXTANNOTATOR_HPP
00025 
00026 #include "indri/Buffer.hpp"
00027 #include "indri/Transformation.hpp"
00028 #include <iostream>
00029 #include "indri/TagExtent.hpp"
00030 #include "indri/ParsedDocument.hpp"
00031 #include <fstream>
00032 #include <algorithm>
00033 
00035 namespace indri
00036 {
00038   namespace parse
00039   {
00043     class AnchorTextAnnotator : public Transformation {
00044       std::ifstream _in;
00045       char _docno[256];
00046       int _count;
00047       indri::utility::Buffer _buffer;
00048       ObjectHandler<indri::api::ParsedDocument>* _handler;
00049 
00050       void _readDocumentHeader() {
00051         char line[65536];
00052         _count = 0;
00053 
00054         if( !_in.good() || _in.eof() )
00055           return;
00056 
00057         // DOCNO=
00058         _in.getline( _docno, sizeof _docno-1 );
00059         if( !_in.good() || _in.eof() )
00060           return;
00061 
00062         // DOCURL=
00063         _in.getline( line, sizeof line-1 );
00064         if( !_in.good() || _in.eof() )
00065           return;
00066 
00067         // LINKS=
00068         _in.getline( line, sizeof line-1 );
00069 
00070         _count = atoi( line+6 );
00071       }
00072 
00073       void _fetchText( indri::utility::greedy_vector<TagExtent *>& tags, indri::utility::greedy_vector<char*>& terms ) {
00074         // now, fetch the additional terms
00075         char line[65536];
00076         _buffer.clear();
00077         for( int i=0; i<_count; i++ ) {
00078           // LINK
00079           _in.getline( line, sizeof line-1 );
00080 
00081           // LINKDOCNO 
00082           _in.getline( line, sizeof line-1 );
00083           
00084           // TEXT=
00085           _in.getline( line, sizeof line-1 );
00086           if (!line[0]) continue;
00087           size_t textLen = strlen(line+6);
00088           strcpy( _buffer.write(textLen+1), line+6 );
00089           _buffer.unwrite(1);
00090           
00091           assert( *(_buffer.front()+_buffer.position()-1) == '\"' && "Last character should be a quote" );
00092         }
00093         *(_buffer.write(1)) = 0;
00094 
00095         // now there's a bunch of text in _buffer, space separated, with each
00096         // link separated by a " symbol
00097 
00098         char* beginWord = 0;
00099         int beginIndex = 0;
00100         char* buffer = _buffer.front();
00101 
00102         for( unsigned int i=0; i<_buffer.position(); i++ ) {
00103 #ifndef WIN32
00104           if( isalnum(buffer[i]) && !beginWord ) {
00105 #else
00106           if( (buffer[i] >= 0 && isalnum(buffer[i])) && !beginWord ) {
00107 #endif
00108             beginWord = buffer+i;
00109 
00110             if(!beginIndex)
00111               beginIndex = (int)terms.size();
00112           } else if( isspace(buffer[i]) ) {
00113             buffer[i] = 0;
00114             if( beginWord )
00115               terms.push_back( beginWord );
00116             beginWord = 0;
00117           } else if( buffer[i] == '\"' ) {
00118             buffer[i] = 0;
00119             if( beginWord )
00120               terms.push_back( beginWord );
00121             beginWord = 0;
00122 
00123             if( beginIndex ) {        
00124               TagExtent * extent = new TagExtent;
00125               extent->name = "inlink";
00126               extent->begin = beginIndex;
00127               extent->end = (int)terms.size();
00128               extent->number = 0;
00129               extent->parent = 0;
00130 
00131               assert( extent->begin <= extent->end );
00132 
00133               tags.push_back(extent);
00134               if( terms.size() > 125000 )
00135                 break;
00136             }
00137 
00138 
00139             beginIndex = 0;
00140           }
00141 
00142         }
00143 
00144       }
00145 
00146       bool _matchingDocno( indri::api::ParsedDocument* document ) {
00147         // find DOCNO attribute in document
00148         for( size_t i=0; i<document->metadata.size(); i++ ) {
00149           const char* attributeName = document->metadata[i].key;
00150           const char* attributeValue = (const char*) document->metadata[i].value;
00151 
00152           if( !strcmp( attributeName, "docno" ) ) {
00153             if( !strcmp( attributeValue, _docno+6 ) ) {
00154               return true;
00155             } else {
00156               return false;
00157             }
00158           }
00159         }
00160  
00161         return false;
00162       }
00163 
00164     public:
00165       AnchorTextAnnotator() {
00166         _handler = 0;
00167       }
00168 
00169       ~AnchorTextAnnotator() {
00170         _in.close();
00171       }
00172 
00173       void open( const std::string& anchorFile ) {
00174         _in.close();
00175         _in.clear();
00176         _in.open( anchorFile.c_str() );
00177         _buffer.clear();
00178         _readDocumentHeader();
00179       }
00180 
00181       indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document ) {
00182         _buffer.clear();
00183         _buffer.grow(2*1024*1024);
00184 
00185         // surround current text with a mainbody tag
00186         TagExtent * mainbody = new TagExtent;
00187         mainbody->begin = 0;
00188         mainbody->end = (int)document->terms.size();
00189         mainbody->name = "mainbody";
00190         mainbody->number = 0;
00191         mainbody->parent = 0;
00192         // order no longer matters - the indexer takes care of any sorting needed
00193         document->tags.push_back( mainbody );
00194 
00195         // set old tags' parent to mainbody
00196         indri::utility::greedy_vector<TagExtent *>::iterator oldTag = document->tags.begin();
00197         indri::utility::greedy_vector<TagExtent *>::iterator oldTagsEnd = document->tags.end();
00198         while ( oldTag != oldTagsEnd ) {
00199           if ((*oldTag)->parent == 0) {
00200             (*oldTag)->parent = mainbody;
00201           }
00202           oldTag++;
00203         }
00204 
00205         if( _matchingDocno( document ) ) {
00206           _fetchText( document->tags, document->terms );
00207           _readDocumentHeader();
00208         }
00209         std::sort( document->tags.begin(), document->tags.end(), indri::parse::LessTagExtent() );
00210         return document;
00211       }
00212 
00213       void setHandler( ObjectHandler<indri::api::ParsedDocument>& handler ) {
00214         _handler = &handler;
00215       }
00216 
00217       void handle( indri::api::ParsedDocument* document ) {
00218         _handler->handle( transform( document ) );
00219       }
00220     };
00221   }
00222 }
00223 
00224 #endif // INDRI_ANCHORTEXTANNOTATOR_HPP
00225