Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

Combiner.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // Combiner
00015 //
00016 // 3 June 2004 -- tds
00017 //
00018 
00019 
00020 #ifndef INDRI_COMBINER_HPP
00021 #define INDRI_COMBINER_HPP
00022 
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include "indri/Buffer.hpp"
#include "indri/HashTable.hpp"
00028 namespace indri
00029 {
00030   namespace parse
00031   {
00032     
00033     class Combiner {
00034     private:
00035       std::vector< std::ofstream* > _docBucketFiles;
00036       std::vector< std::ofstream* > _linkBucketFiles;
00037       std::vector< std::stringstream* > _docBuckets;
00038       std::vector< std::stringstream* > _linkBuckets;
00039       int _bins;
00040 
00041       struct strhash {
00042       public:
00043         int operator() ( const char* k ) const {
00044           int hash = 0;
00045           for( ; *k; k++ ){
00046             hash *= 7;
00047             hash += *k;
00048           }
00049           return hash;
00050         }
00051       };
00052 
00053       struct strcompst {
00054       public:
00055         int operator () ( const char* o, const char* t ) const {
00056           return strcmp( o, t );
00057         }
00058       };
00059 
00060       struct url_entry {
00061         char* url;
00062         char* corpusPath;
00063         char* docNo;
00064         int linkCount;
00065         indri::utility::Buffer linkinfo;
00066 
00067         void addLink( const char* docno,
00068                       const char* linkDocUrl,
00069                       const char* linkText )
00070         {
00071           if( linkinfo.position() ) {
00072             // remove trailing 0
00073             linkinfo.unwrite(1);
00074           }
00075 
00076           int docnoLen = (int)strlen(docno);
00077           int docUrlLen = (int)strlen(linkDocUrl);
00078           int textLen = (int)strlen(linkText);
00079 
00080           int total = docnoLen + sizeof "LINKDOCNO=" + 
00081             docUrlLen + sizeof "LINKFROM=" +
00082             textLen + sizeof "TEXT=" + 1;
00083 
00084           sprintf( linkinfo.write(total),
00085                    "LINKDOCNO=%s\nLINKFROM=%s\nTEXT=%s\n",
00086                    docno,
00087                    linkDocUrl,
00088                    linkText );
00089 
00090           linkCount++;
00091         }
00092       };
00093 
00094       typedef indri::utility::HashTable<char*, url_entry*, strhash, strcompst> UrlEntryTable;
00095       typedef indri::utility::HashTable<char*, std::vector<url_entry*>, strhash, strcompst> UrlEntryVectorTable;
00096 
00097       url_entry* _newUrlEntry( const char* url, const char* corpusPath, const char* docNo );
00098       void _deleteUrlEntry( void* buffer );
00099   
00100       void _readLinks( UrlEntryTable& urlTable, std::ifstream& linkIn );
00101       void _readRedirects( UrlEntryTable& urlTable, const std::string& redirectPath, int number );
00102       void _writeCorpusTable( UrlEntryVectorTable& corpusTable, const std::string& outputPath );
00103       void _hashToCorpusTable( UrlEntryVectorTable& corpusTable, UrlEntryTable& urlTable );
00104       
00105       void _openWriteBuckets( std::vector<std::stringstream*>& buffers, std::vector<std::ofstream*>& buckets, const std::string& path, int bins );
00106       void _flushWriteBuffer( std::vector<std::stringstream*>& buffers, std::vector<std::ofstream*>& buckets, bool force, int i );
00107       void _flushWriteBuffers( std::vector<std::stringstream*>& buffers, std::vector<std::ofstream*>& buckets, bool force );
00108       void _closeWriteBuckets( std::vector<std::stringstream*>& buffers, std::vector<std::ofstream*>& buckets );
00109       void _openReadBuckets( std::vector<std::ifstream*>& buckets, const std::string& path, int bins );
00110       void _readDocBucket( UrlEntryTable& urlTable, std::ifstream& docIn );
00111 
00112       int hashString( const char* str );
00113       void hashToBuckets( std::ifstream& in, const std::string& path );
00114       void createBuckets( const std::string& tmpPath );
00115       void closeBuckets();
00116       void combineBucket( const std::string& outputPath, const std::string& tmpPath, int bucket );
00117       void hashToBuckets( const std::string& inputPath );
00118       void combineRedirectDestinationBucket( const std::string& tmpPath, int i, std::vector<std::stringstream*>& outBuffers, std::vector<std::ofstream*>& outputFiles );
00119 
00120     public:
00121       Combiner( int bins = 10 ) : _bins(bins) {}
00122 
00123       void combineRedirectDestinationBuckets( const std::string& tmpPath );
00124       void combineBuckets( const std::string& outputPath, const std::string& tmpPath );
00125       void hashRedirectTargets( const std::string& bucketPath, const std::string& redirectsPath );
00126       void hashToBuckets( const std::string& bucketPath, const std::string& inputPath );
00127       void sortCorpusFiles( const std::string& outputPath, const std::string& preSortPath, const std::string& inputPath );
00128     };
00129   }
00130 }
00131 
00132 #endif // INDRI_COMBINER_HPP
00133 

Generated on Tue Jun 15 11:02:53 2010 for Lemur by doxygen 1.3.4