00001 /*========================================================================== 00002 * Copyright (c) 2004-2008 Carnegie Mellon University and University of 00003 * Massachusetts. All Rights Reserved. 00004 * 00005 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00006 * is subject to the terms of the software license set forth in the LICENSE 00007 * file included with this software, and also available at 00008 * http://www.lemurproject.org/license.html 00009 * 00010 *========================================================================== 00011 */ 00012 00013 #ifndef _HARVESTSORTMERGE_HPP 00014 #define _HARVESTSORTMERGE_HPP 00015 00016 // class to sort and merge multiple text files into one 00017 // with early rejection of URLs that do not exist within the keyfile 00018 00019 #include "SortMergeTextFiles.hpp" 00020 #include "Keyfile.hpp" 00021 #include "SHA1.hpp" 00022 00023 namespace lemur { 00024 namespace file { 00025 00026 class HarvestSortMerge : public SortMergeTextFiles { 00027 protected: 00028 lemur::file::Keyfile *_docNoKeyfile; 00029 lemur::utility::SHA1 SHA1Hasher; 00030 virtual void _doSingleFileMergesort(std::string &inputFile, std::string &outputFile, std::vector<std::string> &chunkList, int chunkRecordSize=16384*10); 00031 00032 public: 00033 HarvestSortMerge(std::string &outputFilePath, std::string &tempDirectory, lemur::file::Keyfile *docNoKeyfile, int numMergeThreads=4, bool displayStatus=false); 00034 ~HarvestSortMerge(); 00035 00036 static void splitLineOnTabs(char *inputLine, std::vector<std::string> &retVec); 00037 00038 }; // end class HarvestSortMerge 00039 } // end namespace file 00040 } // end namespace lemur 00041 00042 #endif // _HARVESTSORTMERGE_HPP