00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #ifndef INDRI_TERMRECORDER_HPP
00019 #define INDRI_TERMRECORDER_HPP
00020 #include "indri/Buffer.hpp"
00021 #include <vector>
00022 #include <utility>
00023 #include <algorithm>
00024
00025 namespace indri {
00026 namespace index {
00027 class TermRecorder {
00028 private:
00029 struct less {
00030 const char* _base;
00031
00032 less( const char* base ) {
00033 _base = base;
00034 }
00035
00036 bool operator () ( const std::pair<size_t, lemur::api::TERMID_T>& one, const std::pair<size_t, lemur::api::TERMID_T>& two ) {
00037 return strcmp( _base + one.first, _base + two.first ) < 0;
00038 }
00039 };
00040
00041 indri::utility::Buffer _buffer;
00042 std::vector< std::pair<size_t, lemur::api::TERMID_T> > _pairs;
00043
00044 public:
00045 void add( int sequence, const char* term ) {
00046 size_t termLength = strlen( term );
00047 char* termSpot = _buffer.write( termLength+1 );
00048 size_t termOffset = termSpot - _buffer.front();
00049 strcpy( termSpot, term );
00050
00051 _pairs.push_back( std::make_pair( termOffset, sequence ) );
00052 }
00053
00054 void sort() {
00055 std::sort( _pairs.begin(), _pairs.end(), less( _buffer.front() ) );
00056 }
00057
00058 void buildMap( std::vector<lemur::api::TERMID_T>& map, TermRecorder& other, std::vector< std::pair< const char*, lemur::api::TERMID_T > >* missing = 0 ) {
00059 map.resize( _pairs.size(), -1 );
00060 size_t i = 0;
00061 size_t j = 0;
00062 std::vector< std::pair<size_t, lemur::api::TERMID_T > >& otherPairs = other._pairs;
00063
00064
00065 while( i < otherPairs.size() && j < _pairs.size() ) {
00066 int result = strcmp( _buffer.front() + otherPairs[i].first,
00067 _buffer.front() + _pairs[j].first );
00068
00069 if( result == 0 ) {
00070 map[ _pairs[j].second ] = otherPairs[i].second;
00071 i++;
00072 j++;
00073 } else if( result < 0 ) {
00074 i++;
00075 } else {
00076 if( missing )
00077 missing->push_back( std::make_pair( _buffer.front() + _pairs[j].first, _pairs[j].second ) );
00078
00079 j++;
00080 }
00081 }
00082
00083 while( missing && j < _pairs.size() ) {
00084 missing->push_back( std::make_pair( _buffer.front() + _pairs[j].first, _pairs[j].second ) );
00085 j++;
00086 }
00087 }
00088
00089 std::vector< std::pair<size_t, lemur::api::TERMID_T> >& pairs() {
00090 return _pairs;
00091 }
00092
00093 indri::utility::Buffer& buffer() {
00094 return _buffer;
00095 }
00096
00097 int memorySize() {
00098 return int(_buffer.position() + _pairs.size() * sizeof(std::pair<size_t, lemur::api::TERMID_T>));
00099 }
00100 };
00101 }
00102 }
00103
00104 #endif // INDRI_TERMRECORDER_HPP
00105