00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 #ifndef INDRI_ANCHORTEXTANNOTATOR_HPP
00024 #define INDRI_ANCHORTEXTANNOTATOR_HPP
00025 
#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>

#include "indri/Buffer.hpp"
#include "indri/Transformation.hpp"
#include "indri/TagExtent.hpp"
#include "indri/ParsedDocument.hpp"
00033 
00035 namespace indri
00036 {
00038   namespace parse
00039   {
00043     class AnchorTextAnnotator : public Transformation {
00044       std::ifstream _in;
00045       char _docno[256];
00046       int _count;
00047       indri::utility::Buffer _buffer;
00048       ObjectHandler<indri::api::ParsedDocument>* _handler;
00049 
00050       void _readDocumentHeader() {
00051         char line[65536];
00052         _count = 0;
00053 
00054         if( !_in.good() || _in.eof() )
00055           return;
00056 
00057         
00058         _in.getline( _docno, sizeof _docno-1 );
00059         if( !_in.good() || _in.eof() )
00060           return;
00061 
00062         
00063         _in.getline( line, sizeof line-1 );
00064         if( !_in.good() || _in.eof() )
00065           return;
00066 
00067         
00068         _in.getline( line, sizeof line-1 );
00069 
00070         _count = atoi( line+6 );
00071       }
00072 
00073       void _fetchText( indri::utility::greedy_vector<TagExtent *>& tags, indri::utility::greedy_vector<char*>& terms ) {
00074         
00075         char line[65536];
00076         _buffer.clear();
00077         for( int i=0; i<_count; i++ ) {
00078           
00079           _in.getline( line, sizeof line-1 );
00080 
00081           
00082           _in.getline( line, sizeof line-1 );
00083           
00084           
00085           _in.getline( line, sizeof line-1 );
00086           if (!line[0]) continue;
00087           size_t textLen = strlen(line+6);
00088           strcpy( _buffer.write(textLen+1), line+6 );
00089           _buffer.unwrite(1);
00090           
00091           assert( *(_buffer.front()+_buffer.position()-1) == '\"' && "Last character should be a quote" );
00092         }
00093         *(_buffer.write(1)) = 0;
00094 
00095         
00096         
00097 
00098         char* beginWord = 0;
00099         int beginIndex = 0;
00100         char* buffer = _buffer.front();
00101 
00102         for( unsigned int i=0; i<_buffer.position(); i++ ) {
00103 #ifndef WIN32
00104           if( isalnum(buffer[i]) && !beginWord ) {
00105 #else
00106           if( (buffer[i] >= 0 && isalnum(buffer[i])) && !beginWord ) {
00107 #endif
00108             beginWord = buffer+i;
00109 
00110             if(!beginIndex)
00111               beginIndex = (int)terms.size();
00112           } else if( isspace(buffer[i]) ) {
00113             buffer[i] = 0;
00114             if( beginWord )
00115               terms.push_back( beginWord );
00116             beginWord = 0;
00117           } else if( buffer[i] == '\"' ) {
00118             buffer[i] = 0;
00119             if( beginWord )
00120               terms.push_back( beginWord );
00121             beginWord = 0;
00122 
00123             if( beginIndex ) {        
00124               TagExtent * extent = new TagExtent;
00125               extent->name = "inlink";
00126               extent->begin = beginIndex;
00127               extent->end = (int)terms.size();
00128               extent->number = 0;
00129               extent->parent = 0;
00130 
00131               assert( extent->begin <= extent->end );
00132 
00133               tags.push_back(extent);
00134               if( terms.size() > 125000 )
00135                 break;
00136             }
00137 
00138 
00139             beginIndex = 0;
00140           }
00141 
00142         }
00143 
00144       }
00145 
00146       bool _matchingDocno( indri::api::ParsedDocument* document ) {
00147         
00148         for( size_t i=0; i<document->metadata.size(); i++ ) {
00149           const char* attributeName = document->metadata[i].key;
00150           const char* attributeValue = (const char*) document->metadata[i].value;
00151 
00152           if( !strcmp( attributeName, "docno" ) ) {
00153             if( !strcmp( attributeValue, _docno+6 ) ) {
00154               return true;
00155             } else {
00156               return false;
00157             }
00158           }
00159         }
00160  
00161         return false;
00162       }
00163 
00164     public:
00165       AnchorTextAnnotator() {
00166         _handler = 0;
00167       }
00168 
00169       ~AnchorTextAnnotator() {
00170         _in.close();
00171       }
00172 
00173       void open( const std::string& anchorFile ) {
00174         _in.close();
00175         _in.clear();
00176         _in.open( anchorFile.c_str() );
00177         _buffer.clear();
00178         _readDocumentHeader();
00179       }
00180 
00181       indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document ) {
00182         _buffer.clear();
00183         _buffer.grow(2*1024*1024);
00184 
00185         
00186         TagExtent * mainbody = new TagExtent;
00187         mainbody->begin = 0;
00188         mainbody->end = (int)document->terms.size();
00189         mainbody->name = "mainbody";
00190         mainbody->number = 0;
00191         mainbody->parent = 0;
00192         
00193         document->tags.push_back( mainbody );
00194 
00195         
00196         indri::utility::greedy_vector<TagExtent *>::iterator oldTag = document->tags.begin();
00197         indri::utility::greedy_vector<TagExtent *>::iterator oldTagsEnd = document->tags.end();
00198         while ( oldTag != oldTagsEnd ) {
00199           if ((*oldTag)->parent == 0) {
00200             (*oldTag)->parent = mainbody;
00201           }
00202           oldTag++;
00203         }
00204 
00205         if( _matchingDocno( document ) ) {
00206           _fetchText( document->tags, document->terms );
00207           _readDocumentHeader();
00208         }
00209         std::sort( document->tags.begin(), document->tags.end(), indri::parse::LessTagExtent() );
00210         return document;
00211       }
00212 
00213       void setHandler( ObjectHandler<indri::api::ParsedDocument>& handler ) {
00214         _handler = &handler;
00215       }
00216 
00217       void handle( indri::api::ParsedDocument* document ) {
00218         _handler->handle( transform( document ) );
00219       }
00220     };
00221   }
00222 }
00223 
00224 #endif // INDRI_ANCHORTEXTANNOTATOR_HPP
00225