00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef INDRI_ANCHORTEXTWRITER_HPP
00020 #define INDRI_ANCHORTEXTWRITER_HPP
00021
00022 #include <iostream>
00023 #include <algorithm>
00024 #include "indri/Path.hpp"
00025 #include "lemur-compat.hpp"
00026 namespace indri
00027 {
00028 namespace parse
00029 {
00031 class AnchorTextWriter : public ObjectHandler<indri::api::ParsedDocument> {
00032 private:
00033 std::ofstream _out;
00034
00035 public:
00036 AnchorTextWriter( const std::string& outputPath ) {
00037 std::string directory = indri::file::Path::directory( outputPath );
00038 indri::file::Path::make( directory );
00039 _out.open( outputPath.c_str(), std::ios::out | std::ios::binary);
00040 }
00041
00042 ~AnchorTextWriter() {
00043 _out.close();
00044 }
00045
00046 void handle( indri::api::ParsedDocument* document ) {
00047 indri::utility::greedy_vector<MetadataPair>::iterator iter;
00048
00049 iter = std::find_if( document->metadata.begin(),
00050 document->metadata.end(),
00051 MetadataPair::key_equal( "docno" ) );
00052
00053 const char* docno = (char*)iter->value;
00054
00055 iter = std::find_if( document->metadata.begin(),
00056 document->metadata.end(),
00057 MetadataPair::key_equal( "url" ) );
00058
00059 const char* page = (char*)iter->value;
00060 const char* url = 0;
00061 int count = 0;
00062 int urlEnd = -1;
00063
00064
00065
00066 const char* slash = 0;
00067 if(page) slash = strchr( page, '/' );
00068 if(slash) slash = strchr( slash+1, '/' );
00069 if(slash) slash = strchr( slash+1, '/' );
00070
00071 size_t domainLength;
00072 if( slash )
00073 domainLength = slash - page;
00074 else
00075 domainLength = strlen(page);
00076
00077
00078 for( unsigned int i=0; i<document->tags.size(); i++ ) {
00079 TagExtent& extent = *(document->tags[i]);
00080
00081
00082 if( !strcmp( extent.name, "absolute-url" ) ||
00083 !strcmp( extent.name, "relative-url" ) ) {
00084 url = document->terms[ extent.begin ];
00085 urlEnd = extent.end;
00086
00087
00088
00089
00090
00091
00092 } else if( !strcmp( extent.name, "a" ) &&
00093 url &&
00094 urlEnd == extent.begin &&
00095 extent.end - extent.begin > 0 )
00096 {
00097 count++;
00098 url = 0;
00099 }
00100 }
00101
00102
00103 _out << "DOCNO=" << docno << std::endl;
00104 _out << "DOCURL=" << page << std::endl;
00105 _out << "LINKS=" << count << std::endl;
00106 url = 0;
00107 urlEnd = -1;
00108
00109 for( unsigned int i=0; i<document->tags.size(); i++ ) {
00110 TagExtent& extent = *(document->tags[i]);
00111
00112 if( !strcmp( extent.name, "absolute-url" ) ||
00113 !strcmp( extent.name, "relative-url" ) ) {
00114 url = document->terms[ extent.begin ];
00115 urlEnd = extent.end;
00116
00117
00118
00119
00120
00121
00122 } else if( !strcmp( extent.name, "a" ) &&
00123 url &&
00124 urlEnd == extent.begin &&
00125 extent.end - extent.begin > 0 )
00126 {
00127 int textLength = 0;
00128
00129 _out << "LINKURL=" << url << std::endl;
00130 _out << "TEXT=\"";
00131 for( size_t j=extent.begin; int(j) < extent.end && textLength < 60000; j++ ) {
00132 if( !document->terms[j] )
00133 continue;
00134
00135 textLength += strlen(document->terms[j])+1;
00136 _out << document->terms[j] << " ";
00137 }
00138 _out << "\"" << std::endl;
00139
00140
00141 url = 0;
00142 }
00143 }
00144 }
00145 };
00146 }
00147 }
00148
00149 #endif // INDRI_ANCHORTEXTWRITER_HPP
00150