00001 /*========================================================================== 00002 * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 // 00014 // TaggedDocumentIterator 00015 // 00016 // 14 May 2004 -- tds 00017 // 00018 00019 #ifndef INDRI_TRECDOCUMENTITERATOR_HPP 00020 #define INDRI_TRECDOCUMENTITERATOR_HPP 00021 #include "zlib.h" 00022 #include "indri/DocumentIterator.hpp" 00023 #include "indri/Buffer.hpp" 00024 #include "indri/UnparsedDocument.hpp" 00025 #include <string> 00026 #include <fstream> 00027 namespace indri 00028 { 00029 namespace parse 00030 { 00031 00032 class TaggedDocumentIterator : public DocumentIterator { 00033 private: 00034 UnparsedDocument _document; 00035 gzFile _in; 00036 indri::utility::Buffer _buffer; 00037 indri::utility::Buffer _metaBuffer; 00038 std::string _lastMetadataTag; 00039 std::string _fileName; 00040 00041 bool _readLine( char*& beginLine, size_t& lineLength ); 00042 00043 const char* _startDocTag; 00044 const char* _endDocTag; 00045 const char* _endMetadataTag; 00046 00047 int _startDocTagLength; 00048 int _endDocTagLength; 00049 int _endMetadataTagLength; 00050 00051 public: 00052 TaggedDocumentIterator(); 00053 ~TaggedDocumentIterator(); 00054 00055 void setTags( const char* startDoc, const char* endDoc, const char* endMetadata ); 00056 00057 void open( const std::string& filename ); 00058 void close(); 00059 00060 UnparsedDocument* nextDocument(); 00061 }; 00062 } 00063 } 00064 00065 #endif // INDRI_TRECDOCUMENTITERATOR_HPP