Lemur: RawTextParser.hpp Source File

00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // RawTextParser
00015 //
00016 // 10 February 2004 -- tds
00017 //
00018 
00019 #ifndef INDRI_RAWTEXTPARSER_HPP
00020 #define INDRI_RAWTEXTPARSER_HPP
00021 
00022 #define PARSER_MAX_WORD_LENGTH (30) // max bytes kept per parsed word, counting the terminating NUL; RawTextParser::parseDocument cuts longer words down to this size
00023 namespace indri
00024 {
00025   namespace parse
00026   {
00027     
00028     class RawTextParser {
00029     private:
00030       std::ifstream _in;
00031       char* _buffer;
00032       char* _current;
00033       int _bufferSize;
00034 
00035     public:
00036       RawTextParser( int memorySize = 1024*1024 ) {
00037         _bufferSize = memorySize;
00038         _buffer = new char[_bufferSize]; // let's hope that 1mb is enough for any docs we're gonna see
00039       }
00040 
00041       ~RawTextParser() {
00042         delete _buffer;
00043       }
00044 
00045       bool open( const std::string& fileName ) {
00046         _in.open( fileName.c_str(), std::ifstream::in );
00047         return _in.good();
00048       }
00049 
00050       void close() {
00051         _in.close();
00052       }
00053 
00057 
00058       bool parseDocument( std::string& docName, indri::utility::greedy_vector<char*>& words ) { 
00059         static const char docPrefix[] = "<DOC>";
00060         static const char endDocPrefix[] = "</DOC>";
00061         static const char docnoPrefix[] = "<DOCNO ";
00062         static const char urlPrefix[] = "<URL";
00063         bool gotDocID = false;
00064         int bufferPos = 0;
00065 
00066         while( 1 ) {
00067           int remainingSpace = _bufferSize - bufferPos;
00068           _in.getline( _buffer + bufferPos, remainingSpace );
00069           int length = _in.gcount();
00070 
00071           if( _in.rdstate() & (std::ifstream::failbit|std::ifstream::eofbit) ) {
00072             if( _in.rdstate() & std::ifstream::eofbit ) {
00073               return false; // at end of file, we're done
00074             }
00075         
00076             if( length == remainingSpace - 1 ) {
00077               throw Exception( "RawTextParser", "Buffer size is too small to handle some document in the corpus, use -parserMemory to change." );
00078             }
00079 
00080             if( _in.rdstate() & std::ifstream::failbit ) {
00081               throw Exception( "RawTextParser", "Unable to recover from failed read" );
00082             }
00083           }
00084 
00085           _buffer[bufferPos+length] = 0;
00086           char* line = _buffer + bufferPos;
00087 
00088           if( length && _buffer[bufferPos] == '<' ) {
00089             if( length > sizeof docnoPrefix-1 && !strncmp( docnoPrefix, line, sizeof docnoPrefix-1 ) ) {
00090               docName.assign( line+sizeof docnoPrefix-1, line + length - 2 );
00091               gotDocID = true;
00092             } else if ( length > sizeof endDocPrefix-1 && !strncmp( endDocPrefix, line, sizeof endDocPrefix-1 ) ) {
00093               // handle end doc -- return
00094               if( gotDocID )
00095                 return true;
00096             }
00097           } else {
00098             if( !gotDocID )
00099               continue;
00100 
00101             int i = 0;
00102 
00103             while(1) {
00104               for( ; isspace(line[i]) && i<length && line[i]; i++ )
00105                 ;
00106 
00107               if( i>= length || !line[i] )
00108                 break;
00109 
00110               char* begin = &line[i];
00111 
00112               for( ; !isspace(line[i]) && i<length && line[i]; i++ )
00113                 ;
00114 
00115               line[i] = 0;
00116               i++;
00117 
00118               if( &line[i] - begin > PARSER_MAX_WORD_LENGTH )
00119                 begin[PARSER_MAX_WORD_LENGTH-1] = 0;
00120 
00121               words.push_back(begin);
00122             }
00123           }
00124 
00125           bufferPos += length + 1; // have to skip trailing \0
00126         }
00127       }
00128     };
00129   }
00130 }
00131 
00132 #endif // INDRI_RAWTEXTPARSER_HPP