00001 /*========================================================================== 00002 * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 // 00014 // AnchorTextHarvester 00015 // 00016 // 03 Mar 2008 - mjh 00017 // 00018 00019 #ifndef INDRI_ANCHORTEXTHARVESTER_HPP 00020 #define INDRI_ANCHORTEXTHARVESTER_HPP 00021 00022 #include <iostream> 00023 #include <fstream> 00024 #include <algorithm> 00025 #include <map> 00026 #include "indri/ParsedDocument.hpp" 00027 #include "indri/ObjectHandler.hpp" 00028 #include "indri/Path.hpp" 00029 #include "lemur-compat.hpp" 00030 #include "Keyfile.hpp" 00031 #include "SortMergeTextFiles.hpp" 00032 #include "SHA1.hpp" 00033 00034 namespace indri 00035 { 00036 namespace parse 00037 { 00039 class AnchorTextHarvester : public ObjectHandler<indri::api::ParsedDocument> { 00040 private: 00041 // holds destinationURL->(array of <sourceURL, anchor text> pairs) 00042 std::ofstream _linkFile; 00043 00044 // docOrder file (document URL->DOCNO) 00045 std::ofstream _docOrder; 00046 00047 char linkFileOutBuffer[5*1024*1024]; 00048 char docOrderOutBuffer[3*1024*1024]; 00049 00050 lemur::file::Keyfile *_docNoKeyfile; 00051 lemur::file::Keyfile *_redirectKeyfile; 00052 00053 lemur::utility::SHA1 SHA1Hasher; 00054 00055 public: 00056 AnchorTextHarvester(const std::string &linkFilePath, const std::string& docOrderPath, lemur::file::Keyfile *docNoKeyfile, lemur::file::Keyfile *redirectKeyfile=NULL ); 00057 ~AnchorTextHarvester(); 00058 00059 void handle( indri::api::ParsedDocument* document ); 00060 00061 }; // end class AnchorTextHarvester 00062 00063 } // end namespace parse 00064 } // end namespace indri 00065 00066 00067 00068 #endif // #define INDRI_ANCHORTEXTHARVESTER_HPP