Lemur: TermData.hpp Source File

00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // TermData.hpp
00015 //
00016 // 4 February 2004 -- tds
00017 //
00018 
00019 #ifndef INDRI_TERMDATA_HPP
00020 #define INDRI_TERMDATA_HPP
00021 
00022 #include "indri/TermFieldStatistics.hpp"
00023 #include <indri/greedy_vector>
00024 #include "lemur-compat.hpp"
00025 #include "indri/RVLCompressStream.hpp"
00026 #include "indri/RVLDecompressStream.hpp"
00027 
00028 #ifdef WIN32
00029 // remove warning about zero-sized arrays
00030 #pragma warning ( disable: 4200 )
00031 #endif 
00032 
00033 namespace indri {
00034   namespace index {
00035     struct TermData {
00036     private:
00037       // these are private, bogus functions so that this object can never be copied
00038       // we don't want to be able to copy it, because any real copy operator needs to
00039       // take into account the data in the fields[] array, and we don't know how long it is.
00040       TermData( const TermData& other ) {}
00041       const TermData& operator= ( const TermData& other ) { return *this; }
00042 
00043     public:
00044       TermData() :
00045         maxDocumentLength(0),
00046         minDocumentLength(MAX_INT32)
00047       {
00048         term = 0;
00049       }
00050       
00051       struct term_less {
00052       public:
00053         bool operator () ( const TermData* one, const TermData* two ) const {
00054           return strcmp( one->term, two->term ) < 0;
00055         }
00056       };
00057 
00058       TermFieldStatistics corpus;
00059 
00060       unsigned int maxDocumentLength;    // maximum length of any document containing this term
00061       unsigned int minDocumentLength;    // minimum length of any document containing this term
00062 
00063       const char* term;                  // name of this term
00064 
00065       TermFieldStatistics fields[0];
00066     };
00067   }
00068 }
00069 
00070 inline indri::index::TermData* termdata_construct( void* buffer, int fieldCount ) {
00071   // call the constructor in place
00072   new(buffer) indri::index::TermData();
00073 
00074   // call field data constructors in place
00075   for( int i=0; i<fieldCount; i++ ) {
00076     new((char*)buffer +
00077         sizeof(indri::index::TermData) +
00078         sizeof(indri::index::TermFieldStatistics)*i) indri::index::TermFieldStatistics();
00079   }
00080 
00081   return (indri::index::TermData*) buffer;
00082 }
00083 
00084 inline indri::index::TermData* termdata_create( int fieldCount ) {
00085   // allocate enough room for the term data, plus enough room for fields
00086   void* buffer = malloc( sizeof(indri::index::TermData) + sizeof(indri::index::TermFieldStatistics)*fieldCount );
00087   return termdata_construct( buffer, fieldCount );
00088 }
00089 
00090 inline void termdata_destruct( indri::index::TermData* termData, int fieldCount ) {
00091   if( termData ) {
00092     termData->~TermData();
00093 
00094     for( int i=0; i<fieldCount; i++ ) {
00095       termData->fields[i].~TermFieldStatistics();
00096     }
00097   }
00098 }
00099 
00100 inline void termdata_delete( indri::index::TermData* termData, int fieldCount ) {
00101   if( termData ) {
00102     termdata_destruct( termData, fieldCount );
00103     free(termData);
00104   }
00105 }
00106 
00107 inline void termdata_clear( indri::index::TermData* termData, int fieldCount ) {
00108   termData->corpus.documentCount = 0;
00109   termData->corpus.totalCount = 0;
00110   termData->corpus.lastCount = 0;
00111   termData->corpus.lastDocument = 0;
00112 
00113   for( int i=0; i<fieldCount; i++ ) {
00114     indri::index::TermFieldStatistics& field = termData->fields[i];
00115 
00116     field.documentCount = 0;
00117     field.totalCount = 0;
00118     field.lastCount = 0;
00119     field.lastDocument = 0;
00120   }
00121 
00122   termData->minDocumentLength = MAX_INT32;
00123   termData->maxDocumentLength = 0;
00124 }
00125 
00126 inline void termdata_merge( indri::index::TermData* termData, indri::index::TermData* merger, int fieldCount ) {
00127   termData->corpus.documentCount += merger->corpus.documentCount;
00128   termData->corpus.totalCount += merger->corpus.totalCount;
00129 
00130   for( int i=0; i<fieldCount; i++ ) {
00131     indri::index::TermFieldStatistics& field = termData->fields[i];
00132     indri::index::TermFieldStatistics& mergeField = merger->fields[i];
00133 
00134     field.documentCount += mergeField.documentCount;
00135     field.totalCount += mergeField.totalCount;
00136   }
00137 
00138   termData->maxDocumentLength = lemur_compat::max( termData->maxDocumentLength, merger->maxDocumentLength );
00139   termData->minDocumentLength = lemur_compat::min( termData->minDocumentLength, merger->minDocumentLength );
00140 }
00141 
00142 inline int termdata_size( int fieldCount ) {
00143   return sizeof(indri::index::TermData) + fieldCount * sizeof(indri::index::TermFieldStatistics);
00144 }
00145 
00146 inline void termdata_compress( indri::utility::RVLCompressStream& stream, indri::index::TermData* termData, int fieldCount ) {
00147   // corpus statistics
00148   stream << termData->corpus.totalCount
00149          << termData->corpus.documentCount;
00150 
00151   // max-score statistics
00152   stream << termData->maxDocumentLength
00153          << termData->minDocumentLength;
00154   // field statistics
00155   for( int i=0; i<fieldCount; i++ ) {
00156     stream << termData->fields[i].totalCount
00157            << termData->fields[i].documentCount;
00158   }
00159 }
00160 
00161 inline void termdata_decompress( indri::utility::RVLDecompressStream& stream, indri::index::TermData* termData, int fieldCount ) {
00162   // corpus statistics
00163   stream >> termData->corpus.totalCount
00164          >> termData->corpus.documentCount;
00165 
00166   // max-score statistics
00167   stream >> termData->maxDocumentLength
00168          >> termData->minDocumentLength;
00169 
00170   // field statistics
00171   for( int i=0; i<fieldCount; i++ ) {
00172     stream >> termData->fields[i].totalCount
00173            >> termData->fields[i].documentCount;
00174   }
00175 }
00176 
00177 #endif // INDRI_TERMDATA_HPP