00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef _SORTMERGETEXTFILES_HPP
00014 #define _SORTMERGETEXTFILES_HPP
00015
00016
00017
00018 #include <time.h>
00019 #include <algorithm>
00020 #include <iostream>
00021 #include <fstream>
00022 #include <string>
00023 #include <sstream>
00024 #include <vector>
00025 #include <list>
00026
00027 #include "indri/Buffer.hpp"
00028 #include "indri/Path.hpp"
00029 #include "indri/UtilityThread.hpp"
00030 #include "lemur-compat.hpp"
00031 #include "Exception.hpp"
00032
00033 namespace lemur {
00034 namespace file {
00035
/// Utility thread (derived from indri::thread::UtilityThread) that holds the
/// state needed to merge up to MAX_INPUT_FILES text files into a single
/// output file. Only the declaration is visible here: the constructor,
/// work(), initialize(), deinitialize() and chooseNextBuffer() are defined
/// elsewhere.
///
/// NOTE(review): the two inline character arrays make every instance roughly
/// 3 MiB (16 * 65536 bytes of line buffers plus a 2 MiB output buffer), so
/// instances should presumably be heap-allocated rather than placed on a
/// stack -- confirm against the callers.
/// NOTE(review): inputFile holds raw FILE* handles and the destructor is
/// empty; presumably deinitialize() closes them -- verify in the
/// implementation.
00039 class FileMergeThread : public indri::thread::UtilityThread {
00040 public:
/// Compile-time limits: number of input slots, and size in bytes of each
/// per-input line buffer.
00041 enum {
00042 MAX_INPUT_FILES=16,
00043 MAX_INPUT_LINESIZE=65536
00044 };
00045
00046 private:
/// One C stdio handle per input slot.
00047 FILE *inputFile[MAX_INPUT_FILES];
/// Output stream for the merged result.
00048 std::ofstream outfile;
00049
/// Path for each input slot, and the path of the output file.
00050 std::string filePath[MAX_INPUT_FILES];
00051 std::string outputFilePath;
00052
/// One fixed-size line buffer per input slot (presumably the current line of
/// that input -- TODO confirm), plus a per-slot "done" flag.
00053 char _buffer[MAX_INPUT_FILES][MAX_INPUT_LINESIZE];
00054 bool fileDone[MAX_INPUT_FILES];
00055
/// 2 MiB character buffer (presumably used to buffer writes to outfile --
/// TODO confirm in the implementation).
00056 char _outputBuffer[2*1024*1024];
00057
/// Number of input slots actually in use (expected to be <= MAX_INPUT_FILES;
/// the bound is not enforced anywhere in this declaration).
00058 int numInputFiles;
00059
/// Values exposed through getRecordCounter() / isThreadActive().
00060 int recordCounter;
00061 bool isActive;
00062
/// Selects the next input buffer to consume; the meaning of the returned int
/// is defined by the implementation (presumably a slot index -- TODO confirm).
00063 int chooseNextBuffer();
00064
00065 public:
/// @param inputFileList paths of the files to merge (taken by non-const
///        reference; whether it is modified is not visible here)
/// @param outputFile    path of the merged output file
00066 FileMergeThread(std::vector<std::string> &inputFileList, const std::string& outputFile);
00067 ~FileMergeThread() { }
00068
/// Always reports that there is no pending work.
00069 virtual bool hasWork() { return false; }
/// Thread hooks; bodies are defined outside this header.
00070 virtual UINT64 work();
00071 virtual UINT64 initialize();
00072 virtual void deinitialize();
00073
/// Read-only accessors; const-qualified so they can be called through a
/// const reference or pointer. They return plain (non-atomic) members, so any
/// cross-thread use needs external synchronisation.
00074 int getRecordCounter() const { return recordCounter; }
00075 bool isThreadActive() const { return isActive; }
00076
00077 };
00078
/// Sorts and merges text files into a single output file, using a temporary
/// directory and a configurable number of merge threads. Only the declaration
/// is visible here; all non-inline members are defined elsewhere.
///
/// The class exposes a protected virtual hook (_doSingleFileMergesort), so it
/// can be subclassed; for that reason the destructor is virtual.
00079 class SortMergeTextFiles {
00080 protected:
/// Destination file path and directory for temporary files, as passed to the
/// constructor (presumably stored verbatim -- TODO confirm).
00081 std::string _outputFilePath;
00082 std::string _tempDirectory;
00083
/// Two indri buffers (presumably used for line input while merging two
/// files -- TODO confirm in the implementation).
00084 indri::utility::Buffer _inputBuffer;
00085 indri::utility::Buffer _inputBufferTwo;
00086
/// Status-display flag (set via showStatus()) and merge-thread count.
00087 bool _displayStatus;
00088 int _numMergeThreads;
00089
/// Presumably writes the in-memory records to a chunk file derived from
/// basePathname / currentChunkNumber and returns its path -- TODO confirm.
00090 std::string _flushChunks(std::string& basePathname, std::vector<std::string> *inMemRecords, int currentChunkNumber);
/// Merges two files into outputFile. doCleanup defaults to true (presumably
/// removes the two inputs afterwards -- TODO confirm). The meaning of the
/// returned int is defined by the implementation.
00091 int _mergeSortTwoFiles(std::string &firstFilePath, std::string &secondFilePath, std::string &outputFile, bool doCleanup=true);
/// Intermediate merge pass over inputList; returns a list of paths and
/// reports a count through recordCounter (details in the implementation).
00092 std::vector<std::string> _doMidFinalMerge(std::vector<std::string> &inputList, std::string &outputPathBase, int &recordCounter);
/// Final merge of inputFiles into outputFile; int result defined by the
/// implementation.
00093 int _doFinalMergesortFiles(std::vector<std::string> &inputFiles, std::string &outputFile);
00094
/// Overridable hook for sorting a single input file. chunkRecordSize defaults
/// to 16384*10 (163840); chunkList is passed by reference (presumably
/// receives the produced chunk paths -- TODO confirm).
00095 virtual void _doSingleFileMergesort(std::string &inputFile, std::string &outputFile, std::vector<std::string> &chunkList, int chunkRecordSize=16384*10);
00096
00097 public:
/// @param outputFilePath  path of the final sorted output
/// @param tempDirectory   directory for temporary files
/// @param numMergeThreads number of merge threads (default 4)
/// @param displayStatus   whether to display status output (default false)
00098 SortMergeTextFiles(std::string &outputFilePath, std::string &tempDirectory, int numMergeThreads=4, bool displayStatus=false);
/// Virtual: the class declares a virtual member, so a derived object may be
/// destroyed through a SortMergeTextFiles pointer; a non-virtual destructor
/// would make that undefined behaviour.
00099 virtual ~SortMergeTextFiles();
00100
/// Sorts/merges the given input files; the meaning of the returned int is
/// defined by the implementation.
00101 int sort(std::vector<std::string> &inputFilePaths);
/// Static line reader working on a C FILE* and an indri buffer. beginLine and
/// lineLength are output parameters; presumably returns false when no
/// further line is available -- TODO confirm.
00102 static bool _readLine(FILE *_in, char*& beginLine, size_t& lineLength, indri::utility::Buffer &_buffer);
/// Sets the status-display flag.
00103 void showStatus(bool displayStatus) { _displayStatus=displayStatus; }
00104
00105 };
00106 }
00107 }
00108
00109 #endif // _SORTMERGETEXTFILES_HPP