00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef INDRI_TERMBITMAP_HPP
00020 #define INDRI_TERMBITMAP_HPP
00021
00022 #include "indri/Buffer.hpp"
00023 #include "indri/delete_range.hpp"
00024
00025 namespace indri {
00026 namespace index {
00060 class TermBitmap {
00061 private:
00062 std::vector<indri::utility::Buffer*> _maps;
00063 int _fromBase;
00064 int _toBase;
00065 int _lastFrom;
00066 char* _current;
00067
00068 void _addBufferIfNecessary() {
00069 if( !_maps.size() || _maps.back()->position() == _maps.back()->size() )
00070 _maps.push_back( new indri::utility::Buffer( 64*1024 ) );
00071 }
00072
00073 indri::utility::Buffer* _findBuffer( int from ) {
00074 assert( from >= 0 );
00075
00076 int left = 0;
00077 int right = (int)_maps.size()-1;
00078
00079 if( _maps.size() == 0 )
00080 return 0;
00081
00082 while( right - left > 1 ) {
00083 int middle = left + (right - left) / 2;
00084 indri::utility::Buffer* mid = _maps[ middle ];
00085
00086 if( from < *(INT32*) mid->front() ) {
00087 right = middle;
00088 } else {
00089 left = middle;
00090 }
00091 }
00092
00093 if( right == left )
00094 return _maps[left];
00095
00096 int rightFront = *(INT32*) _maps[right]->front();
00097
00098 if( from < rightFront ) {
00099 return _maps[left];
00100 }
00101
00102 return _maps[right];
00103 }
00104
00105 const char* _findInBuffer( indri::utility::Buffer* b, int from ) {
00106 const char* start = b->front();
00107 const char* end = b->front() + b->position();
00108
00109 while( end - start > 32 ) {
00110 const char* mid = start + (((end - start) / 2) & ~31);
00111 INT32 middle = *(INT32*) mid;
00112
00113 if( from >= middle )
00114 start = mid;
00115 else
00116 end = mid;
00117 }
00118
00119 INT32 front = *(INT32*)start;
00120 assert( from >= front && from < (front + 192) );
00121
00122 return start;
00123 }
00124
00125 int _bitsSet( unsigned char c ) {
00126 static int bset[] = { 0, 1, 1, 2,
00127 1, 2, 2, 3,
00128 1, 2, 2, 3,
00129 2, 3, 3, 4 };
00130
00131 return bset[ c & 0xf ] + bset[ (c>>4) ];
00132 }
00133
00134 public:
00135 TermBitmap() {
00136 _lastFrom = -1;
00137 _toBase = -10000;
00138 _fromBase = -10000;
00139 }
00140
00141 ~TermBitmap() {
00142 delete_vector_contents( _maps );
00143 }
00144
00145 int lastFrom() {
00146 return _lastFrom;
00147 }
00148
00149 void add( int to ) {
00150 add( _lastFrom+1, to );
00151 }
00152
00153 void add( int from, int to ) {
00154 assert( _lastFrom < from );
00155
00156 const int availableSpace = ((32 - 8) * 8);
00157 int difference = to - _toBase;
00158
00159 assert( difference >= 0 );
00160
00161 if( difference >= availableSpace || _lastFrom < from-1 ) {
00162 _addBufferIfNecessary();
00163
00164
00165 indri::utility::Buffer* back = _maps.back();
00166 _current = back->write( 32 );
00167
00168 *(INT32*)_current = from;
00169 *(INT32*)(_current + 4) = to;
00170 memset( _current + 8, 0, (32 - 8) );
00171
00172 _toBase = to;
00173 _fromBase = from;
00174 difference = 0;
00175 }
00176
00177 _current[8+difference/8] |= 1<<(difference%8);
00178 _lastFrom = from;
00179
00180 assert( get(from) == to );
00181 }
00182
00183 int get( int from ) {
00184 assert( from <= _lastFrom );
00185
00186
00187 indri::utility::Buffer* buffer = _findBuffer( from );
00188 const char* spot = _findInBuffer( buffer, from );
00189
00190
00191 int fromBase = *(INT32*)spot;
00192 int toBase = *(INT32*)(spot+4);
00193 spot += 8;
00194 int bits = 0;
00195 int found = 0;
00196
00197
00198 unsigned char c;
00199 int need = from - fromBase + 1;
00200
00201 c = spot[bits/8];
00202 int byteBits = _bitsSet( c );
00203
00204 while( found + byteBits < need ) {
00205 bits += 8;
00206 found += byteBits;
00207 c = spot[bits/8];
00208 byteBits = _bitsSet( c );
00209 }
00210
00211
00212 int i;
00213 for( i=0; i<8; i++ ) {
00214 if( c & 1<<i ) {
00215 found++;
00216
00217 if( found == need )
00218 break;
00219 }
00220 }
00221
00222 bits += i;
00223 assert( (fromBase + found - 1) == from );
00224 return toBase + bits;
00225 }
00226
00227 size_t memorySize() {
00228 return _maps.size() * (64*1024);
00229 }
00230 };
00231 }
00232 }
00233
00234 #endif // INDRI_TERMBITMAP_HPP