Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

BasicDocStream.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 #ifndef _BASICFILESTREAM_HPP
00014 #define _BASICFILESTREAM_HPP
00015 #include "common_headers.hpp"
00016 #include <cassert>
00017 #include <cstdio>
00018 #include <cstring>
00019 #include "DocStream.hpp"
00020 #include "Exception.hpp"
00021 
00022 
00023 namespace lemur 
00024 {
00025   namespace parse 
00026   {
00027     
00029 
// Buffer-size constant (65536). No declaration visible in this header uses it.
// NOTE(review): presumably the maximum input line length assumed by the
// stream implementation file -- confirm against the .cpp before changing.
00068 #define MAXLINE 65536
00069 
00070 
00072 
00073     class BasicTokenDoc : public lemur::api::Document {
00074     public:
00075       BasicTokenDoc() {
00076       }
00077       BasicTokenDoc(ifstream *stream): docStr(stream) {
00078       }
00079       void startTermIteration() const;  
00080   
00081       const char *getID() const { return id;}
00082 
00083       bool hasMore() const{ return (strcmp(curWord, "</DOC>") != 0);}
00084     
00085       const lemur::api::Term * nextTerm() const;
00086 
00087       void skipToEnd() const;
00088       friend class BasicDocStream;
00089     private:
00090       void readID(); 
00091       mutable char *curWord;
00092       mutable char buf1[20000];
00093       mutable char buf2[20000];
00094       char id[2000];
00095       ifstream *docStr;
00096       streampos startPos; // starting position of the terms in the file
00097       //replace  static BasicTokenTerm t; with attribute
00098       mutable lemur::api::Term t;
00099     };
00100 
00101 
00103     class BasicDocStream : public lemur::api::DocStream
00104     {
00105     public:
00106       BasicDocStream() {}
00107       BasicDocStream (const string &inputFile);
00108 
00109       virtual ~BasicDocStream() {  delete ifs;}
00110 
00111     public:
00112         
00113       bool hasMore(); 
00114 
00115       void startDocIteration();
00116 
00117       lemur::api::Document *nextDoc();
00118 
00119     private:
00120       char file[1024];
00121       ifstream *ifs;
00122       char buf[2000];
00123       bool nextTokenRead;
00124       // replace static BasicTokenDoc doc;  with attribute
00125       BasicTokenDoc doc;
00126     };
00127   }
00128 }
00129 #endif

Generated on Tue Jun 15 11:02:53 2010 for Lemur by doxygen 1.3.4