Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

TermRecorder.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2005 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 //
00013 // TermRecorder
00014 //
00015 // 14 January 2005 -- tds
00016 //
00017 
00018 #ifndef INDRI_TERMRECORDER_HPP
00019 #define INDRI_TERMRECORDER_HPP
#include "indri/Buffer.hpp"
#include <algorithm>
#include <cstring>
#include <utility>
#include <vector>
00024 
00025 namespace indri {
00026   namespace index {
00027     class TermRecorder {
00028     private:
00029       struct less {
00030         const char* _base;
00031 
00032         less( const char* base ) {
00033           _base = base;
00034         }
00035 
00036         bool operator () ( const std::pair<size_t, lemur::api::TERMID_T>& one, const std::pair<size_t, lemur::api::TERMID_T>& two ) {
00037           return strcmp( _base + one.first, _base + two.first ) < 0;
00038         }
00039       };
00040 
00041       indri::utility::Buffer _buffer;
00042       std::vector< std::pair<size_t, lemur::api::TERMID_T> > _pairs;
00043 
00044     public:
00045       void add( int sequence, const char* term ) {
00046         size_t termLength = strlen( term );
00047         char* termSpot = _buffer.write( termLength+1 );
00048         size_t termOffset = termSpot - _buffer.front();
00049         strcpy( termSpot, term );
00050 
00051         _pairs.push_back( std::make_pair( termOffset, sequence ) );
00052       }
00053 
00054       void sort() {
00055         std::sort( _pairs.begin(), _pairs.end(), less( _buffer.front() ) );
00056       }
00057 
00058       void buildMap( std::vector<lemur::api::TERMID_T>& map, TermRecorder& other, std::vector< std::pair< const char*, lemur::api::TERMID_T > >* missing = 0 ) {
00059         map.resize( _pairs.size(), -1 );
00060         size_t i = 0;
00061         size_t j = 0;
00062         std::vector< std::pair<size_t, lemur::api::TERMID_T > >& otherPairs = other._pairs;
00063 
00064         // this joins all matching pairs
00065         while( i < otherPairs.size() && j < _pairs.size() ) {
00066           int result = strcmp( _buffer.front() + otherPairs[i].first,
00067                                _buffer.front() + _pairs[j].first );
00068           
00069           if( result == 0 ) {
00070             map[ _pairs[j].second ] = otherPairs[i].second;
00071             i++;
00072             j++;
00073           } else if( result < 0 ) {
00074             i++;
00075           } else {
00076             if( missing )
00077               missing->push_back( std::make_pair( _buffer.front() + _pairs[j].first, _pairs[j].second ) );
00078 
00079             j++;
00080           }
00081         }
00082 
00083         while( missing && j < _pairs.size() ) {
00084           missing->push_back( std::make_pair( _buffer.front() + _pairs[j].first, _pairs[j].second ) );
00085           j++;
00086         }
00087       }
00088 
00089       std::vector< std::pair<size_t, lemur::api::TERMID_T> >& pairs() {
00090         return _pairs;
00091       }
00092 
00093       indri::utility::Buffer& buffer() {
00094         return _buffer;
00095       }
00096 
00097       int memorySize() {
00098         return int(_buffer.position() + _pairs.size() * sizeof(std::pair<size_t, lemur::api::TERMID_T>));
00099       }
00100     };
00101   }
00102 }
00103 
00104 #endif // INDRI_TERMRECORDER_HPP
00105 

Generated on Tue Jun 15 11:02:56 2010 for Lemur by doxygen 1.3.4