00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017 #ifndef INDRI_PAGERANK_HPP
00018 #define INDRI_PAGERANK_HPP
00019
00020 #include <string>
00021 #include <vector>
00022 #include <map>
00023 #include <cmath>
00024 #include "indri/UnparsedDocument.hpp"
00025 #include "indri/Parameters.hpp"
00026 #include "indri/FileTreeIterator.hpp"
00027 #include "indri/TaggedDocumentIterator.hpp"
00028 #include "indri/TaggedTextParser.hpp"
00029 #include "indri/Path.hpp"
00030
00031 #include "indri/Repository.hpp"
00032
00033 namespace indri
00034 {
00035 namespace parse
00036 {
/**
 * Score record for one document identified by a string.
 *
 * Plain data holder: all members are public and no constructor is
 * declared, so val and int_val are indeterminate until assigned.
 */
class pagerank
{
public:
  std::string doc;  // document identifier
  float val;        // floating-point score
  int int_val;      // integer score (presumably a quantized form of val -- confirm against the implementation)

  /**
   * Comparator giving a descending order: a record with a larger val
   * comes first; records with equal val are ordered by larger doc first.
   * int_val does not take part in the comparison.
   */
  struct pagerank_greater
  {
    // const-qualified so the functor can also be invoked through a const
    // comparator object (e.g. one stored inside an ordered container).
    bool operator() (const pagerank &one, const pagerank &two) const
    {
      if (one.val == two.val)
        return one.doc > two.doc;
      return one.val > two.val;
    }
  };
};
00053 class prEntry {
00054 public:
00055 lemur::api::DOCID_T doc;
00056 float val;
00057 int int_val;
00058 struct prEntry_greater {
00059 bool operator() (const prEntry &one, const prEntry &two) {
00060 if (one.val == two.val)
00061 return one.doc > two.doc;
00062 return one.val > two.val;
00063 }
00064 };
00065 };
00066
00067 class PageRank {
00068 private:
00069
00070 static const double _intToProb[11];
00071 float *prTable;
00072
00073 const std::string _corpusPath;
00074 const std::string _linkPath;
00075
00076 double _c;
00077 UINT64 _colLen;
00078
00079 indri::collection::Repository _repository;
00080 typedef std::map< std::string, float > PageRankVector;
00081 typedef std::map< std::string, std::pair< int, std::vector< std::string > > > Links;
00082
00083 inline void _swap( std::string& a, std::string& b ) {
00084 std::string tmp = b;
00085 b = a;
00086 a = tmp;
00087 }
00088
00089 void _computeColLen();
00090
00091 float _readPageRankFromFile( std::ifstream& src, const std::string& sourceDoc );
00092 void _writePageRankToFile( std::ofstream& src, const std::string& destDoc, const float pr );
00093
00094 void _computeOutDegrees( Links& links );
00095 void _doPageRankIter( const int docsPerIter, const std::string& srcFile, const std::string& destFile );
00096 void _updatePageRank( std::ifstream& src, std::ofstream& dest, Links& links );
00097
00098 void _raw2int(std::vector<pagerank> &);
00099 void _ranks2int(std::vector<prEntry> &ranks);
00100
00101 void _loadRanks( const std::string& dest,
00102 std::vector<pagerank> &pageranks);
00103
00104 public:
00105 PageRank( const std::string& corpusPath, const std::string& linkPath,
00106 UINT64 colLen = 0 ) : _corpusPath( corpusPath ),
00107 _linkPath( linkPath ),
00108 _colLen( colLen ), prTable(0) {
00109 if (_colLen == 0 ) _computeColLen();
00110 }
00111 PageRank( const std::string& corpusPath, const std::string& linkPath,
00112 const std::string& indexPath ) : _corpusPath( corpusPath ),
00113 _linkPath( linkPath ), prTable(0) {
00114
00115 _repository.openRead(indexPath);
00116 indri::collection::Repository::index_state indexes = _repository.indexes();
00117 _colLen = 0;
00118 for( int i=0; i<indexes->size(); i++ ) {
00119 _colLen += (*indexes)[i]->documentCount();
00120 }
00121 }
00122 ~PageRank( ) {
00123 delete(prTable);
00124 }
00125
00126 void computePageRank( const std::string& outputFile, const int maxIters = 10, const int docsPerIter = 1000, const double c = 0.7 );
00127 void writeRaw( const std::string& dest, const std::string &fawFile );
00128 void writePriors( const std::string& dest, const std::string &priorFile );
00129 void writeRanks( const std::string& dest, const std::string &ranksFile );
00130
00131 void indexPageRank(const std::string& outputFile, const int maxIters = 100, const double c = 0.85 );
00132 };
00133 }
00134 }
00135
00136 #endif