00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #ifndef INDRI_COMBINER_HPP
00021 #define INDRI_COMBINER_HPP
00022
#include <cstdio>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
#include "indri/Buffer.hpp"
#include "indri/HashTable.hpp"
00028 namespace indri
00029 {
00030 namespace parse
00031 {
00032
00033 class Combiner {
00034 private:
00035 std::vector< std::ofstream* > _docBucketFiles;
00036 std::vector< std::ofstream* > _linkBucketFiles;
00037 std::vector< std::stringstream* > _docBuckets;
00038 std::vector< std::stringstream* > _linkBuckets;
00039 int _bins;
00040
00041 struct strhash {
00042 public:
00043 int operator() ( const char* k ) const {
00044 int hash = 0;
00045 for( ; *k; k++ ){
00046 hash *= 7;
00047 hash += *k;
00048 }
00049 return hash;
00050 }
00051 };
00052
00053 struct strcompst {
00054 public:
00055 int operator () ( const char* o, const char* t ) const {
00056 return strcmp( o, t );
00057 }
00058 };
00059
00060 struct url_entry {
00061 char* url;
00062 char* corpusPath;
00063 char* docNo;
00064 int linkCount;
00065 indri::utility::Buffer linkinfo;
00066
00067 void addLink( const char* docno,
00068 const char* linkDocUrl,
00069 const char* linkText )
00070 {
00071 if( linkinfo.position() ) {
00072
00073 linkinfo.unwrite(1);
00074 }
00075
00076 int docnoLen = (int)strlen(docno);
00077 int docUrlLen = (int)strlen(linkDocUrl);
00078 int textLen = (int)strlen(linkText);
00079
00080 int total = docnoLen + sizeof "LINKDOCNO=" +
00081 docUrlLen + sizeof "LINKFROM=" +
00082 textLen + sizeof "TEXT=" + 1;
00083
00084 sprintf( linkinfo.write(total),
00085 "LINKDOCNO=%s\nLINKFROM=%s\nTEXT=%s\n",
00086 docno,
00087 linkDocUrl,
00088 linkText );
00089
00090 linkCount++;
00091 }
00092 };
00093
00094 typedef indri::utility::HashTable<char*, url_entry*, strhash, strcompst> UrlEntryTable;
00095 typedef indri::utility::HashTable<char*, std::vector<url_entry*>, strhash, strcompst> UrlEntryVectorTable;
00096
00097 url_entry* _newUrlEntry( const char* url, const char* corpusPath, const char* docNo );
00098 void _deleteUrlEntry( void* buffer );
00099
00100 void _readLinks( UrlEntryTable& urlTable, std::ifstream& linkIn );
00101 void _readRedirects( UrlEntryTable& urlTable, const std::string& redirectPath, int number );
00102 void _writeCorpusTable( UrlEntryVectorTable& corpusTable, const std::string& outputPath );
00103 void _hashToCorpusTable( UrlEntryVectorTable& corpusTable, UrlEntryTable& urlTable );
00104
00105 void _openWriteBuckets( std::vector<std::stringstream*>& buffers, std::vector<std::ofstream*>& buckets, const std::string& path, int bins );
00106 void _flushWriteBuffer( std::vector<std::stringstream*>& buffers, std::vector<std::ofstream*>& buckets, bool force, int i );
00107 void _flushWriteBuffers( std::vector<std::stringstream*>& buffers, std::vector<std::ofstream*>& buckets, bool force );
00108 void _closeWriteBuckets( std::vector<std::stringstream*>& buffers, std::vector<std::ofstream*>& buckets );
00109 void _openReadBuckets( std::vector<std::ifstream*>& buckets, const std::string& path, int bins );
00110 void _readDocBucket( UrlEntryTable& urlTable, std::ifstream& docIn );
00111
00112 int hashString( const char* str );
00113 void hashToBuckets( std::ifstream& in, const std::string& path );
00114 void createBuckets( const std::string& tmpPath );
00115 void closeBuckets();
00116 void combineBucket( const std::string& outputPath, const std::string& tmpPath, int bucket );
00117 void hashToBuckets( const std::string& inputPath );
00118 void combineRedirectDestinationBucket( const std::string& tmpPath, int i, std::vector<std::stringstream*>& outBuffers, std::vector<std::ofstream*>& outputFiles );
00119
00120 public:
00121 Combiner( int bins = 10 ) : _bins(bins) {}
00122
00123 void combineRedirectDestinationBuckets( const std::string& tmpPath );
00124 void combineBuckets( const std::string& outputPath, const std::string& tmpPath );
00125 void hashRedirectTargets( const std::string& bucketPath, const std::string& redirectsPath );
00126 void hashToBuckets( const std::string& bucketPath, const std::string& inputPath );
00127 void sortCorpusFiles( const std::string& outputPath, const std::string& preSortPath, const std::string& inputPath );
00128 };
00129 }
00130 }
00131
00132 #endif // INDRI_COMBINER_HPP
00133