Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

harvestlinks.cpp File Reference

#include <iostream>
#include <fstream>
#include <vector>
#include <time.h>
#include "indri/Parameters.hpp"
#include "indri/TokenizedDocument.hpp"
#include "indri/ParsedDocument.hpp"
#include "indri/TaggedDocumentIterator.hpp"
#include "indri/WARCDocumentIterator.hpp"
#include "indri/TaggedTextParser.hpp"
#include "indri/HTMLParser.hpp"
#include "indri/TokenizerFactory.hpp"
#include "indri/ConflationPattern.hpp"
#include "indri/AnchorTextHarvester.hpp"
#include "indri/FileTreeIterator.hpp"
#include "indri/Path.hpp"
#include "Exception.hpp"
#include "indri/Combiner.hpp"
#include "Keyfile.hpp"
#include "HarvestSortMerge.hpp"
#include "indri/IndriTimer.hpp"
#include "SHA1.hpp"

Functions

std::string getFinalHarvestPath (const std::string &corpusPath, const std::string &filePath, const std::string &harvestPath)
lemur::file::Keyfile * createRedirectKeyfile (const std::string &redirectKeyfilePath, const std::string &redirectPath)
void harvest_anchor_text_file (const std::string &path, const std::string &linkFilePath, const std::string &docOrderPath, lemur::file::Keyfile *redirectKeyfile, indri::parse::HTMLParser &parser, indri::parse::Tokenizer *tokenizer, lemur::file::Keyfile *keyfile, const std::string &fileClass)
void harvest_anchor_text (const std::string &corpusPath, const std::string &fileClass, const std::string &harvestPath, const std::string &docUrlNoKeyfilePath, const std::string &preSortPath, const std::string &redirectPath)
void collect_harvest_paths (const std::string &corpusPath, const std::string &fileClass, const std::string &harvestPath, const std::string &docUrlNoKeyfilePath, const std::string &preSortPath, const std::string &redirectPath)
void combineOutputFile (const std::string &corpusFile, const std::string &sortedPath, const std::string &outputSortedLinkFile, const std::string &docOrderPath, lemur::file::Keyfile *urlKeyfile, FILE *sortedDestFile, lemur::file::Keyfile *docNoKeyfile)
void combineSortedFiles (const std::string &corpusPath, const std::string &harvestPath, const std::string &outputSortedLinkFile, const std::string &preSortPath, const std::string &sortedPath, lemur::file::Keyfile *docNoKeyfile)
void usage ()
 Prints out useful usage information.

int main (int argc, char *argv[])

Variables

std::vector< std::string > harvestedLinkPaths
indri::utility::IndriTimer g_timer
lemur::utility::SHA1 SHA1Hasher
char _outputBuffer [5 *1024 *1024]

Function Documentation

void collect_harvest_paths (const std::string &  corpusPath,
const std::string &  fileClass,
const std::string &  harvestPath,
const std::string &  docUrlNoKeyfilePath,
const std::string &  preSortPath,
const std::string &  redirectPath )
[static]
 

void combineOutputFile (const std::string &  corpusFile,
const std::string &  sortedPath,
const std::string &  outputSortedLinkFile,
const std::string &  docOrderPath,
lemur::file::Keyfile *  urlKeyfile,
FILE *  sortedDestFile,
lemur::file::Keyfile *  docNoKeyfile )
 

void combineSortedFiles (const std::string &  corpusPath,
const std::string &  harvestPath,
const std::string &  outputSortedLinkFile,
const std::string &  preSortPath,
const std::string &  sortedPath,
lemur::file::Keyfile *  docNoKeyfile )
 

lemur::file::Keyfile* createRedirectKeyfile (const std::string &  redirectKeyfilePath,
const std::string &  redirectPath )
[static]
 

std::string getFinalHarvestPath (const std::string &  corpusPath,
const std::string &  filePath,
const std::string &  harvestPath )
 

void harvest_anchor_text (const std::string &  corpusPath,
const std::string &  fileClass,
const std::string &  harvestPath,
const std::string &  docUrlNoKeyfilePath,
const std::string &  preSortPath,
const std::string &  redirectPath )
[static]
 

void harvest_anchor_text_file (const std::string &  path,
const std::string &  linkFilePath,
const std::string &  docOrderPath,
lemur::file::Keyfile *  redirectKeyfile,
indri::parse::HTMLParser &  parser,
indri::parse::Tokenizer *  tokenizer,
lemur::file::Keyfile *  keyfile,
const std::string &  fileClass )
[static]
 

int main (int  argc,
char *  argv[] )
 

void usage ()
 

Prints out useful usage information.


Variable Documentation

char _outputBuffer[5*1024*1024]
 

indri::utility::IndriTimer g_timer [static]
 

std::vector<std::string> harvestedLinkPaths
 

lemur::utility::SHA1 SHA1Hasher [static]
 


Generated on Tue Jun 15 11:02:56 2010 for Lemur by doxygen 1.3.4