00001 /*========================================================================== 00002 * Copyright (c) 2004 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 // 00014 // DocListMemoryBuilder.hpp 00015 // 00016 // tds - 17 December 2003 00017 // 00018 00019 #ifndef LEMUR_KEYFILEDOCLISTMEMORYBUILDER_HPP 00020 #define LEMUR_KEYFILEDOCLISTMEMORYBUILDER_HPP 00021 00022 #include "RVLCompress.hpp" 00023 #include <vector> 00024 #include <assert.h> 00025 #include "indri/greedy_vector" 00026 #include "indri/DocListIterator.hpp" 00027 #include "indri/RegionAllocator.hpp" 00028 00029 namespace indri { 00030 namespace index { 00031 struct DocListMemoryBuilderSegment { 00032 DocListMemoryBuilderSegment( char* b, char* d, char* c ) { 00033 base = b; 00034 data = d; 00035 capacity = c; 00036 } 00037 00038 char* base; 00039 char* data; 00040 char* capacity; 00041 }; 00042 00043 class DocListMemoryBuilderIterator : public DocListIterator { 00044 const indri::utility::greedy_vector< DocListMemoryBuilderSegment, 4 >* _lists; 00045 indri::utility::greedy_vector< DocListMemoryBuilderSegment, 4 >::const_iterator _current; 00046 indri::index::DocListIterator::DocumentData _data; 00047 indri::utility::greedy_vector<DocListIterator::TopDocument> _emptyTopDocuments; 00048 00049 const char* _list; 00050 const char* _listEnd; 00051 bool _finished; 00052 00053 TermData* _termData; 00054 00055 public: 00056 DocListMemoryBuilderIterator(); 00057 DocListMemoryBuilderIterator( class DocListMemoryBuilder& builder, TermData* termData ); 00058 00059 void reset( class DocListMemoryBuilder& builder, TermData* termData ); 00060 void reset( const indri::utility::greedy_vector< DocListMemoryBuilderSegment, 4 >& lists, TermData* termData ); 00061 00062 void startIteration(); 00063 bool finished(); 00064 bool nextEntry( lemur::api::DOCID_T documentID ); 00065 bool nextEntry(); 00066 TermData* termData(); 00067 DocListIterator::DocumentData* currentEntry(); 00068 indri::utility::greedy_vector<DocListIterator::TopDocument>& topDocuments(); 00069 }; 00070 00071 class DocListMemoryBuilder { 00072 public: 00073 typedef DocListMemoryBuilderIterator iterator; 00074 friend class DocListMemoryBuilderIterator; 00075 00076 private: 00077 int _documentFrequency; 00078 int _termFrequency; 00079 00080 indri::utility::greedy_vector< DocListMemoryBuilderSegment, 4 > _lists; 00081 00082 char* _list; 00083 char* _listBegin; 00084 char* _listEnd; 00085 00086 char* _documentPointer; 00087 char* _locationCountPointer; 00088 00089 int _lastLocation; 00090 int _lastDocument; 00091 int _lastTermFrequency; 00092 00093 indri::utility::RegionAllocator* _allocator; 00094 00095 inline void _safeAddLocation( int position ); 00096 size_t _roundUp( size_t amount ); 00097 void _grow(); 00098 void _terminateDocument(); 00099 00100 public: 00101 DocListMemoryBuilder( indri::utility::RegionAllocator* allocator ); 00102 ~DocListMemoryBuilder(); 00103 const DocListMemoryBuilder& operator=( DocListMemoryBuilder& other ); 00104 00105 void startDocument( int docID ); 00106 void addLocation( int location ); 00107 void endDocument(); 00108 00109 void clear(); 00110 void flush(); 00111 bool empty(); 00112 00113 int documentFrequency() const; 00114 int termFrequency() const; 00115 size_t memorySize() const; 00116 }; 00117 } 00118 } 00119 00120 #endif // LEMUR_DOCLISTMEMORYBUILDER_HPP