00001 /*========================================================================== 00002 * Copyright (c) 2004 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 // 00014 // TermList 00015 // 00016 // 23 November 2004 -- tds 00017 // 00018 00019 #ifndef INDRI_TERMLIST_HPP 00020 #define INDRI_TERMLIST_HPP 00021 00022 #include "indri/greedy_vector" 00023 #include "indri/FieldExtent.hpp" 00024 #include "RVLCompress.hpp" 00025 #include "indri/Buffer.hpp" 00026 #include "indri/RVLCompressStream.hpp" 00027 #include "indri/RVLDecompressStream.hpp" 00028 #include "IndexTypes.hpp" 00029 00030 namespace indri { 00031 namespace index { 00032 class TermList { 00033 private: 00034 indri::utility::greedy_vector<lemur::api::TERMID_T> _terms; 00035 indri::utility::greedy_vector<FieldExtent> _fields; 00036 00037 public: 00038 void clear() { 00039 _terms.clear(); 00040 _fields.clear(); 00041 } 00042 00043 void addField( const indri::index::FieldExtent& field ) { 00044 _fields.push_back( field ); 00045 } 00046 00047 void addTerm( const lemur::api::TERMID_T termID ) { 00048 _terms.push_back( termID ); 00049 } 00050 00051 indri::utility::greedy_vector<lemur::api::TERMID_T>& terms() { 00052 return _terms; 00053 } 00054 00055 const indri::utility::greedy_vector<lemur::api::TERMID_T>& terms() const { 00056 return _terms; 00057 } 00058 00059 indri::utility::greedy_vector<indri::index::FieldExtent>& fields() { 00060 return _fields; 00061 } 00062 00063 const indri::utility::greedy_vector<indri::index::FieldExtent>& fields() const { 00064 return _fields; 00065 } 00066 00067 void read( const char* buffer, int size ) { 00068 clear(); 00069 indri::utility::RVLDecompressStream stream( buffer, size ); 00070 00071 int termCount; 00072 int fieldCount; 00073 00074 stream >> termCount 00075 >> fieldCount; 00076 00077 for( int i=0; i<termCount; i++ ) { 00078 lemur::api::TERMID_T termID; 00079 stream >> termID; 00080 00081 assert( termID >= 0 ); 00082 _terms.push_back( termID ); 00083 } 00084 00085 for( int i=0; i<fieldCount; i++ ) { 00086 FieldExtent extent; 00087 00088 stream >> extent.id 00089 >> extent.parentOrdinal 00090 >> extent.begin 00091 >> extent.end 00092 >> extent.number; 00093 00094 assert( extent.id >= 0 ); 00095 assert( extent.parentOrdinal >= 0 ); 00096 assert( extent.begin >= 0 ); 00097 assert( extent.end >= extent.begin ); 00098 00099 extent.ordinal = i + 1; 00100 00101 _fields.push_back( extent ); 00102 } 00103 } 00104 00105 void write( indri::utility::Buffer& buffer ) { 00106 // format: 00107 // term count 00108 // field count 00109 // termID * termCount (compressed) 00110 // ( fieldID, begin, end, number ) * fieldCount 00111 00112 indri::utility::RVLCompressStream out( buffer ); 00113 00114 // write count of terms and fields in the document first 00115 int termCount = (int)_terms.size(); 00116 int fieldCount = (int)_fields.size(); 00117 00118 out << termCount 00119 << fieldCount; 00120 00121 // write out terms 00122 for( size_t i=0; i<_terms.size(); i++ ) { 00123 assert( _terms[i] >= 0 ); 00124 out << _terms[i]; 00125 } 00126 00127 // write out fields 00128 for( size_t i=0; i<_fields.size(); i++ ) { 00129 00130 assert( _fields[i].id >= 0 ); 00131 assert( _fields[i].ordinal == i + 1 ); 00132 00133 out << _fields[i].id 00134 << _fields[i].parentOrdinal 00135 << _fields[i].begin 00136 << _fields[i].end 00137 << _fields[i].number; 00138 } 00139 } 00140 }; 00141 } 00142 } 00143 00144 #endif // INDRI_TERMLIST_HPP