00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #ifndef INDRI_ANCHORTEXTANNOTATOR_HPP
00024 #define INDRI_ANCHORTEXTANNOTATOR_HPP
00025
00026 #include "indri/Buffer.hpp"
00027 #include "indri/Transformation.hpp"
00028 #include <iostream>
00029 #include "indri/TagExtent.hpp"
00030 #include "indri/ParsedDocument.hpp"
00031 #include <fstream>
00032 #include <algorithm>
00033
00035 namespace indri
00036 {
00038 namespace parse
00039 {
00043 class AnchorTextAnnotator : public Transformation {
00044 std::ifstream _in;
00045 char _docno[256];
00046 int _count;
00047 indri::utility::Buffer _buffer;
00048 ObjectHandler<indri::api::ParsedDocument>* _handler;
00049
00050 void _readDocumentHeader() {
00051 char line[65536];
00052 _count = 0;
00053
00054 if( !_in.good() || _in.eof() )
00055 return;
00056
00057
00058 _in.getline( _docno, sizeof _docno-1 );
00059 if( !_in.good() || _in.eof() )
00060 return;
00061
00062
00063 _in.getline( line, sizeof line-1 );
00064 if( !_in.good() || _in.eof() )
00065 return;
00066
00067
00068 _in.getline( line, sizeof line-1 );
00069
00070 _count = atoi( line+6 );
00071 }
00072
00073 void _fetchText( indri::utility::greedy_vector<TagExtent *>& tags, indri::utility::greedy_vector<char*>& terms ) {
00074
00075 char line[65536];
00076 _buffer.clear();
00077 for( int i=0; i<_count; i++ ) {
00078
00079 _in.getline( line, sizeof line-1 );
00080
00081
00082 _in.getline( line, sizeof line-1 );
00083
00084
00085 _in.getline( line, sizeof line-1 );
00086 if (!line[0]) continue;
00087 size_t textLen = strlen(line+6);
00088 strcpy( _buffer.write(textLen+1), line+6 );
00089 _buffer.unwrite(1);
00090
00091 assert( *(_buffer.front()+_buffer.position()-1) == '\"' && "Last character should be a quote" );
00092 }
00093 *(_buffer.write(1)) = 0;
00094
00095
00096
00097
00098 char* beginWord = 0;
00099 int beginIndex = 0;
00100 char* buffer = _buffer.front();
00101
00102 for( unsigned int i=0; i<_buffer.position(); i++ ) {
00103 #ifndef WIN32
00104 if( isalnum(buffer[i]) && !beginWord ) {
00105 #else
00106 if( (buffer[i] >= 0 && isalnum(buffer[i])) && !beginWord ) {
00107 #endif
00108 beginWord = buffer+i;
00109
00110 if(!beginIndex)
00111 beginIndex = (int)terms.size();
00112 } else if( isspace(buffer[i]) ) {
00113 buffer[i] = 0;
00114 if( beginWord )
00115 terms.push_back( beginWord );
00116 beginWord = 0;
00117 } else if( buffer[i] == '\"' ) {
00118 buffer[i] = 0;
00119 if( beginWord )
00120 terms.push_back( beginWord );
00121 beginWord = 0;
00122
00123 if( beginIndex ) {
00124 TagExtent * extent = new TagExtent;
00125 extent->name = "inlink";
00126 extent->begin = beginIndex;
00127 extent->end = (int)terms.size();
00128 extent->number = 0;
00129 extent->parent = 0;
00130
00131 assert( extent->begin <= extent->end );
00132
00133 tags.push_back(extent);
00134 if( terms.size() > 125000 )
00135 break;
00136 }
00137
00138
00139 beginIndex = 0;
00140 }
00141
00142 }
00143
00144 }
00145
00146 bool _matchingDocno( indri::api::ParsedDocument* document ) {
00147
00148 for( size_t i=0; i<document->metadata.size(); i++ ) {
00149 const char* attributeName = document->metadata[i].key;
00150 const char* attributeValue = (const char*) document->metadata[i].value;
00151
00152 if( !strcmp( attributeName, "docno" ) ) {
00153 if( !strcmp( attributeValue, _docno+6 ) ) {
00154 return true;
00155 } else {
00156 return false;
00157 }
00158 }
00159 }
00160
00161 return false;
00162 }
00163
00164 public:
00165 AnchorTextAnnotator() {
00166 _handler = 0;
00167 }
00168
00169 ~AnchorTextAnnotator() {
00170 _in.close();
00171 }
00172
00173 void open( const std::string& anchorFile ) {
00174 _in.close();
00175 _in.clear();
00176 _in.open( anchorFile.c_str() );
00177 _buffer.clear();
00178 _readDocumentHeader();
00179 }
00180
00181 indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document ) {
00182 _buffer.clear();
00183 _buffer.grow(2*1024*1024);
00184
00185
00186 TagExtent * mainbody = new TagExtent;
00187 mainbody->begin = 0;
00188 mainbody->end = (int)document->terms.size();
00189 mainbody->name = "mainbody";
00190 mainbody->number = 0;
00191 mainbody->parent = 0;
00192
00193 document->tags.push_back( mainbody );
00194
00195
00196 indri::utility::greedy_vector<TagExtent *>::iterator oldTag = document->tags.begin();
00197 indri::utility::greedy_vector<TagExtent *>::iterator oldTagsEnd = document->tags.end();
00198 while ( oldTag != oldTagsEnd ) {
00199 if ((*oldTag)->parent == 0) {
00200 (*oldTag)->parent = mainbody;
00201 }
00202 oldTag++;
00203 }
00204
00205 if( _matchingDocno( document ) ) {
00206 _fetchText( document->tags, document->terms );
00207 _readDocumentHeader();
00208 }
00209 std::sort( document->tags.begin(), document->tags.end(), indri::parse::LessTagExtent() );
00210 return document;
00211 }
00212
00213 void setHandler( ObjectHandler<indri::api::ParsedDocument>& handler ) {
00214 _handler = &handler;
00215 }
00216
00217 void handle( indri::api::ParsedDocument* document ) {
00218 _handler->handle( transform( document ) );
00219 }
00220 };
00221 }
00222 }
00223
00224 #endif // INDRI_ANCHORTEXTANNOTATOR_HPP
00225