Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

AnchorTextHarvester.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // AnchorTextHarvester
00015 //
00016 // 03 Mar 2008 - mjh
00017 //
00018 
00019 #ifndef INDRI_ANCHORTEXTHARVESTER_HPP
00020 #define INDRI_ANCHORTEXTHARVESTER_HPP
00021 
00022 #include <iostream>
00023 #include <fstream>
00024 #include <algorithm>
00025 #include <map>
00026 #include "indri/ParsedDocument.hpp"
00027 #include "indri/ObjectHandler.hpp"
00028 #include "indri/Path.hpp"
00029 #include "lemur-compat.hpp"
00030 #include "Keyfile.hpp"
00031 #include "SortMergeTextFiles.hpp"
00032 #include "SHA1.hpp"
00033 
00034 namespace indri
00035 {
00036   namespace parse
00037   {
00039     class AnchorTextHarvester : public ObjectHandler<indri::api::ParsedDocument> {
            // An ObjectHandler specialization for indri::api::ParsedDocument.
            // It holds two output file streams (a link file and a docOrder
            // file), two fixed-size character buffers, two Keyfile pointers
            // and a SHA1 helper object.
            // NOTE(review): the implementation is not in this header; how
            // handle() uses these members should be confirmed in the .cpp.
00040     private:
00041       // Link file output stream. Per the original note it holds
            // destinationURL->(array of <sourceURL, anchor text> pairs).
00042       std::ofstream _linkFile;
00043 
00044       // docOrder file output stream (document URL->DOCNO).
00045       std::ofstream _docOrder;
00046 
            // Fixed-size character arrays of 5 MiB and 3 MiB. They are
            // non-static members, so they are part of every instance and
            // sizeof(AnchorTextHarvester) exceeds 8 MiB; an instance with
            // automatic storage would place that much on the stack.
            // NOTE(review): the names suggest these back the two output
            // streams above -- confirm in the constructor.
00047       char linkFileOutBuffer[5*1024*1024];
00048       char docOrderOutBuffer[3*1024*1024];
00049 
            // Keyfile pointers received through the constructor parameters of
            // the same names.
            // NOTE(review): ownership is not visible here -- presumably
            // borrowed from the caller; confirm before deleting or caching.
00050       lemur::file::Keyfile *_docNoKeyfile;
00051       lemur::file::Keyfile *_redirectKeyfile;
00052 
            // SHA1 helper object.
            // NOTE(review): what gets hashed is not shown in this header.
00053       lemur::utility::SHA1 SHA1Hasher;
00054 
00055     public:
            // Constructs a harvester.
            //   linkFilePath    - path for the link file
            //   docOrderPath    - path for the docOrder file
            //   docNoKeyfile    - Keyfile pointer (see _docNoKeyfile)
            //   redirectKeyfile - optional Keyfile pointer; defaults to NULL
            // NOTE(review): presumably opens both files for writing; confirm
            // in the .cpp, including how open failures are reported.
00056       AnchorTextHarvester(const std::string &linkFilePath, const std::string& docOrderPath, lemur::file::Keyfile *docNoKeyfile, lemur::file::Keyfile *redirectKeyfile=NULL );
            // Destructor; defined in the implementation file.
00057       ~AnchorTextHarvester();
00058 
            // Handles one parsed document.
            // NOTE(review): presumably implements the handle() entry point of
            // ObjectHandler<indri::api::ParsedDocument>; confirm against the
            // base class declaration.
00059       void handle( indri::api::ParsedDocument* document );
00060 
00061     }; // end class AnchorTextHarvester
00062 
00063   } // end namespace parse
00064 } // end namespace indri
00065 
00066 
00067 
00068 #endif // #ifndef INDRI_ANCHORTEXTHARVESTER_HPP

Generated on Tue Jun 15 11:02:53 2010 for Lemur by doxygen 1.3.4