00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef INDRI_TERMDATA_HPP
00020 #define INDRI_TERMDATA_HPP
00021
00022 #include "indri/TermFieldStatistics.hpp"
00023 #include <indri/greedy_vector>
00024 #include "lemur-compat.hpp"
00025 #include "indri/RVLCompressStream.hpp"
00026 #include "indri/RVLDecompressStream.hpp"
00027
00028 #ifdef WIN32
00029
00030 #pragma warning ( disable: 4200 )
00031 #endif
00032
00033 namespace indri {
00034 namespace index {
00035 struct TermData {
00036 private:
00037
00038
00039
00040 TermData( const TermData& other ) {}
00041 const TermData& operator= ( const TermData& other ) { return *this; }
00042
00043 public:
00044 TermData() :
00045 maxDocumentLength(0),
00046 minDocumentLength(MAX_INT32)
00047 {
00048 term = 0;
00049 }
00050
00051 struct term_less {
00052 public:
00053 bool operator () ( const TermData* one, const TermData* two ) const {
00054 return strcmp( one->term, two->term ) < 0;
00055 }
00056 };
00057
00058 TermFieldStatistics corpus;
00059
00060 unsigned int maxDocumentLength;
00061 unsigned int minDocumentLength;
00062
00063 const char* term;
00064
00065 TermFieldStatistics fields[0];
00066 };
00067 }
00068 }
00069
00070 inline indri::index::TermData* termdata_construct( void* buffer, int fieldCount ) {
00071
00072 new(buffer) indri::index::TermData();
00073
00074
00075 for( int i=0; i<fieldCount; i++ ) {
00076 new((char*)buffer +
00077 sizeof(indri::index::TermData) +
00078 sizeof(indri::index::TermFieldStatistics)*i) indri::index::TermFieldStatistics();
00079 }
00080
00081 return (indri::index::TermData*) buffer;
00082 }
00083
00084 inline indri::index::TermData* termdata_create( int fieldCount ) {
00085
00086 void* buffer = malloc( sizeof(indri::index::TermData) + sizeof(indri::index::TermFieldStatistics)*fieldCount );
00087 return termdata_construct( buffer, fieldCount );
00088 }
00089
00090 inline void termdata_destruct( indri::index::TermData* termData, int fieldCount ) {
00091 if( termData ) {
00092 termData->~TermData();
00093
00094 for( int i=0; i<fieldCount; i++ ) {
00095 termData->fields[i].~TermFieldStatistics();
00096 }
00097 }
00098 }
00099
00100 inline void termdata_delete( indri::index::TermData* termData, int fieldCount ) {
00101 if( termData ) {
00102 termdata_destruct( termData, fieldCount );
00103 free(termData);
00104 }
00105 }
00106
00107 inline void termdata_clear( indri::index::TermData* termData, int fieldCount ) {
00108 termData->corpus.documentCount = 0;
00109 termData->corpus.totalCount = 0;
00110 termData->corpus.lastCount = 0;
00111 termData->corpus.lastDocument = 0;
00112
00113 for( int i=0; i<fieldCount; i++ ) {
00114 indri::index::TermFieldStatistics& field = termData->fields[i];
00115
00116 field.documentCount = 0;
00117 field.totalCount = 0;
00118 field.lastCount = 0;
00119 field.lastDocument = 0;
00120 }
00121
00122 termData->minDocumentLength = MAX_INT32;
00123 termData->maxDocumentLength = 0;
00124 }
00125
00126 inline void termdata_merge( indri::index::TermData* termData, indri::index::TermData* merger, int fieldCount ) {
00127 termData->corpus.documentCount += merger->corpus.documentCount;
00128 termData->corpus.totalCount += merger->corpus.totalCount;
00129
00130 for( int i=0; i<fieldCount; i++ ) {
00131 indri::index::TermFieldStatistics& field = termData->fields[i];
00132 indri::index::TermFieldStatistics& mergeField = merger->fields[i];
00133
00134 field.documentCount += mergeField.documentCount;
00135 field.totalCount += mergeField.totalCount;
00136 }
00137
00138 termData->maxDocumentLength = lemur_compat::max( termData->maxDocumentLength, merger->maxDocumentLength );
00139 termData->minDocumentLength = lemur_compat::min( termData->minDocumentLength, merger->minDocumentLength );
00140 }
00141
00142 inline int termdata_size( int fieldCount ) {
00143 return sizeof(indri::index::TermData) + fieldCount * sizeof(indri::index::TermFieldStatistics);
00144 }
00145
00146 inline void termdata_compress( indri::utility::RVLCompressStream& stream, indri::index::TermData* termData, int fieldCount ) {
00147
00148 stream << termData->corpus.totalCount
00149 << termData->corpus.documentCount;
00150
00151
00152 stream << termData->maxDocumentLength
00153 << termData->minDocumentLength;
00154
00155 for( int i=0; i<fieldCount; i++ ) {
00156 stream << termData->fields[i].totalCount
00157 << termData->fields[i].documentCount;
00158 }
00159 }
00160
00161 inline void termdata_decompress( indri::utility::RVLDecompressStream& stream, indri::index::TermData* termData, int fieldCount ) {
00162
00163 stream >> termData->corpus.totalCount
00164 >> termData->corpus.documentCount;
00165
00166
00167 stream >> termData->maxDocumentLength
00168 >> termData->minDocumentLength;
00169
00170
00171 for( int i=0; i<fieldCount; i++ ) {
00172 stream >> termData->fields[i].totalCount
00173 >> termData->fields[i].documentCount;
00174 }
00175 }
00176
00177 #endif // INDRI_TERMDATA_HPP