00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef INDRI_RAWTEXTPARSER_HPP
00020 #define INDRI_RAWTEXTPARSER_HPP
00021
00022 #define PARSER_MAX_WORD_LENGTH (30)
00023 namespace indri
00024 {
00025 namespace parse
00026 {
00027
00028 class RawTextParser {
00029 private:
00030 std::ifstream _in;
00031 char* _buffer;
00032 char* _current;
00033 int _bufferSize;
00034
00035 public:
00036 RawTextParser( int memorySize = 1024*1024 ) {
00037 _bufferSize = memorySize;
00038 _buffer = new char[_bufferSize];
00039 }
00040
00041 ~RawTextParser() {
00042 delete _buffer;
00043 }
00044
00045 bool open( const std::string& fileName ) {
00046 _in.open( fileName.c_str(), std::ifstream::in );
00047 return _in.good();
00048 }
00049
00050 void close() {
00051 _in.close();
00052 }
00053
00057
00058 bool parseDocument( std::string& docName, indri::utility::greedy_vector<char*>& words ) {
00059 static const char docPrefix[] = "<DOC>";
00060 static const char endDocPrefix[] = "</DOC>";
00061 static const char docnoPrefix[] = "<DOCNO ";
00062 static const char urlPrefix[] = "<URL";
00063 bool gotDocID = false;
00064 int bufferPos = 0;
00065
00066 while( 1 ) {
00067 int remainingSpace = _bufferSize - bufferPos;
00068 _in.getline( _buffer + bufferPos, remainingSpace );
00069 int length = _in.gcount();
00070
00071 if( _in.rdstate() & (std::ifstream::failbit|std::ifstream::eofbit) ) {
00072 if( _in.rdstate() & std::ifstream::eofbit ) {
00073 return false;
00074 }
00075
00076 if( length == remainingSpace - 1 ) {
00077 throw Exception( "RawTextParser", "Buffer size is too small to handle some document in the corpus, use -parserMemory to change." );
00078 }
00079
00080 if( _in.rdstate() & std::ifstream::failbit ) {
00081 throw Exception( "RawTextParser", "Unable to recover from failed read" );
00082 }
00083 }
00084
00085 _buffer[bufferPos+length] = 0;
00086 char* line = _buffer + bufferPos;
00087
00088 if( length && _buffer[bufferPos] == '<' ) {
00089 if( length > sizeof docnoPrefix-1 && !strncmp( docnoPrefix, line, sizeof docnoPrefix-1 ) ) {
00090 docName.assign( line+sizeof docnoPrefix-1, line + length - 2 );
00091 gotDocID = true;
00092 } else if ( length > sizeof endDocPrefix-1 && !strncmp( endDocPrefix, line, sizeof endDocPrefix-1 ) ) {
00093
00094 if( gotDocID )
00095 return true;
00096 }
00097 } else {
00098 if( !gotDocID )
00099 continue;
00100
00101 int i = 0;
00102
00103 while(1) {
00104 for( ; isspace(line[i]) && i<length && line[i]; i++ )
00105 ;
00106
00107 if( i>= length || !line[i] )
00108 break;
00109
00110 char* begin = &line[i];
00111
00112 for( ; !isspace(line[i]) && i<length && line[i]; i++ )
00113 ;
00114
00115 line[i] = 0;
00116 i++;
00117
00118 if( &line[i] - begin > PARSER_MAX_WORD_LENGTH )
00119 begin[PARSER_MAX_WORD_LENGTH-1] = 0;
00120
00121 words.push_back(begin);
00122 }
00123 }
00124
00125 bufferPos += length + 1;
00126 }
00127 }
00128 };
00129 }
00130 }
00131
00132 #endif // INDRI_RAWTEXTPARSER_HPP