00001 /*========================================================================== 00002 * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 // 00013 // MboxDocumentIterator 00014 // 00015 // 20 May 2005 -- tds 00016 // 00017 00018 #ifndef INDRI_MBOXDOCUMENTITERATOR_HPP 00019 #define INDRI_MBOXDOCUMENTITERATOR_HPP 00020 00021 #include "indri/DocumentIterator.hpp" 00022 #include "indri/UnparsedDocument.hpp" 00023 #include "indri/Buffer.hpp" 00024 #include <fstream> 00025 namespace indri 00026 { 00027 namespace parse 00028 { 00029 class MboxDocumentIterator : public DocumentIterator { 00030 private: 00031 std::string _filename; 00032 UnparsedDocument _document; 00033 indri::utility::Buffer _buffer; 00034 indri::utility::Buffer _metaBuffer; 00035 std::ifstream _in; 00036 00037 void _copyMetadata( const char* headerLine, int ignoreBytes, const char* tagName ); 00038 00039 public: 00040 void open( const std::string& filename ); 00041 UnparsedDocument* nextDocument(); 00042 void close(); 00043 }; 00044 } 00045 } 00046 00047 #endif // INDRI_MBOXDOCUMENTITERATOR_HPP 00048