00001 /*========================================================================== 00002 * Copyright (c) 2009 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 // 00014 // WARCDocumentIterator 00015 // 00016 // 03 Mar 2009 -- dmf 00017 // 00018 00019 #ifndef INDRI_WARCDOCUMENTITERATOR_HPP 00020 #define INDRI_WARCDOCUMENTITERATOR_HPP 00021 #include <string> 00022 #include <fstream> 00023 #include "zlib.h" 00024 #include "indri/DocumentIterator.hpp" 00025 #include "indri/Buffer.hpp" 00026 #include "indri/UnparsedDocument.hpp" 00027 #include "indri/HashTable.hpp" 00028 00029 namespace indri 00030 { 00031 namespace parse 00032 { 00033 class WARCRecord { 00034 private: 00035 // header fields 00036 //WARC-TYPE 00037 std::string warcType; 00038 //WARC-Record-ID 00039 std::string uuid; 00040 // WARC-TREC-ID // clueweb specific 00041 std::string trecID; 00042 // WARC-Target-URI 00043 std::string targetURI; 00044 //Content-Length 00045 int contentLength; 00046 // other metadata headers 00047 indri::utility::HashTable< std::string, std::string > metadata; 00048 // the header of the record 00049 std::string header; 00050 // the body of the record 00051 const char *content; 00052 bool _readLine( char*& beginLine, size_t& lineLength ); 00053 bool readHeader(); 00054 bool readContent(); 00055 gzFile &_gzin; 00056 indri::utility::Buffer & _buffer; 00057 public: 00058 WARCRecord(gzFile &in, indri::utility::Buffer &buf) : _gzin(in), 00059 _buffer(buf) { } 00060 00061 ~WARCRecord(); 00062 00063 std::string getWarcType() { return warcType ; } 00064 std::string getUUID() { return uuid; } 00065 std::string getTrecID() { return trecID; } 00066 std::string getTargetURI() { return targetURI; } 00067 const char *getHeader() { return header.c_str(); } 00068 const char *getContent(){ return content; } 00069 00070 std::string getMetadata(const char *key); 00071 00072 bool readRecord(); 00073 // header string constants 00074 static const char * WARCTYPE; 00075 static const char * WARCRECORDID; 00076 static const char * CONTENTLENGTH; 00077 static const char * WARCTARGETURI; 00078 static const char * WARCTRECID; 00079 }; 00080 00081 class WARCDocumentIterator : public DocumentIterator { 00082 private: 00083 WARCRecord *_record; 00084 UnparsedDocument _document; 00085 gzFile _gzin; 00086 indri::utility::Buffer _buffer; 00087 indri::utility::Buffer _metaBuffer; 00088 std::string _warcUUID; 00089 const char * _warcMeta; 00090 const char * _dochdr; 00091 const char * _docnoString; 00092 char _docno[512]; 00093 00094 public: 00095 WARCDocumentIterator(); 00096 ~WARCDocumentIterator(); 00097 void open( const std::string& filename ); 00098 void close(); 00099 UnparsedDocument* nextDocument(); 00100 }; 00101 } 00102 } 00103 00104 #endif // INDRI_WARCDOCUMENTITERATOR_HPP