00001 /*========================================================================== 00002 * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 // 00014 // TextDocumentExtractor 00015 // 00016 // 16 August 2004 -- tds 00017 // 00018 00019 #ifndef INDRI_TEXTDOCUMENTEXTRACTOR_HPP 00020 #define INDRI_TEXTDOCUMENTEXTRACTOR_HPP 00021 00022 #include "indri/DocumentIterator.hpp" 00023 #include "indri/UnparsedDocument.hpp" 00024 #include "indri/Buffer.hpp" 00025 #include <fstream> 00026 namespace indri 00027 { 00028 namespace parse 00029 { 00030 00031 class TextDocumentExtractor : public DocumentIterator { 00032 private: 00033 std::string _filename; 00034 UnparsedDocument _document; 00035 indri::utility::Buffer _buffer; 00036 std::ifstream _in; 00037 00038 public: 00039 void open( const std::string& filename ); 00040 UnparsedDocument* nextDocument(); 00041 void close(); 00042 }; 00043 } 00044 } 00045 00046 #endif // INDRI_TEXTDOCUMENTEXTRACTOR_HPP