00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 #ifndef _BASICFILESTREAM_HPP 00014 #define _BASICFILESTREAM_HPP 00015 #include "common_headers.hpp" 00016 #include <cassert> 00017 #include <cstdio> 00018 #include <cstring> 00019 #include "DocStream.hpp" 00020 #include "Exception.hpp" 00021 00022 00023 namespace lemur 00024 { 00025 namespace parse 00026 { 00027 00029 00068 #define MAXLINE 65536 00069 00070 00072 00073 class BasicTokenDoc : public lemur::api::Document { 00074 public: 00075 BasicTokenDoc() { 00076 } 00077 BasicTokenDoc(ifstream *stream): docStr(stream) { 00078 } 00079 void startTermIteration() const; 00080 00081 const char *getID() const { return id;} 00082 00083 bool hasMore() const{ return (strcmp(curWord, "</DOC>") != 0);} 00084 00085 const lemur::api::Term * nextTerm() const; 00086 00087 void skipToEnd() const; 00088 friend class BasicDocStream; 00089 private: 00090 void readID(); 00091 mutable char *curWord; 00092 mutable char buf1[20000]; 00093 mutable char buf2[20000]; 00094 char id[2000]; 00095 ifstream *docStr; 00096 streampos startPos; // starting position of the terms in the file 00097 //replace static BasicTokenTerm t; with attribute 00098 mutable lemur::api::Term t; 00099 }; 00100 00101 00103 class BasicDocStream : public lemur::api::DocStream 00104 { 00105 public: 00106 BasicDocStream() {} 00107 BasicDocStream (const string &inputFile); 00108 00109 virtual ~BasicDocStream() { delete ifs;} 00110 00111 public: 00112 00113 bool hasMore(); 00114 00115 void startDocIteration(); 00116 00117 lemur::api::Document *nextDoc(); 00118 00119 private: 00120 char file[1024]; 00121 ifstream *ifs; 00122 char buf[2000]; 00123 bool nextTokenRead; 00124 // replace static BasicTokenDoc doc; with attribute 00125 BasicTokenDoc doc; 00126 }; 00127 } 00128 } 00129 #endif