00001 /*========================================================================== 00002 * Copyright (c) 2004 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 // 00014 // DocListIterator 00015 // 00016 // 9 January 2004 - tds 00017 // 00018 00019 #ifndef INDRI_DOCLISTITERATOR_HPP 00020 #define INDRI_DOCLISTITERATOR_HPP 00021 00022 #include "indri/greedy_vector" 00023 #include "indri/TermData.hpp" 00024 #include "IndexTypes.hpp" 00025 00026 namespace indri { 00027 namespace index { 00028 class DocListIterator { 00029 public: 00030 struct DocumentData { 00031 lemur::api::DOCID_T document; 00032 indri::utility::greedy_vector<int> positions; 00033 }; 00034 00035 struct TopDocument { 00036 struct less { 00037 bool operator() ( const TopDocument& one, const TopDocument& two ) const { 00038 double oneFrac = double(one.count) / double(one.length); 00039 double twoFrac = double(two.count) / double(two.length); 00040 return (oneFrac < twoFrac); 00041 } 00042 }; 00043 00044 struct greater { 00045 bool operator() ( const TopDocument& one, const TopDocument& two ) const { 00046 double oneFrac = double(one.count) / double(one.length); 00047 double twoFrac = double(two.count) / double(two.length); 00048 return (oneFrac > twoFrac); 00049 } 00050 }; 00051 00052 struct docid_less { 00053 bool operator() ( const TopDocument& one, const TopDocument& two ) const { 00054 return one.document < two.document; 00055 } 00056 }; 00057 00058 TopDocument( lemur::api::DOCID_T _document, int _count, int _length ) : 00059 document(_document), 00060 count(_count), 00061 length(_length) 00062 { 00063 } 00064 00065 lemur::api::DOCID_T document; 00066 int count; 00067 int length; 00068 }; 00069 00070 virtual ~DocListIterator() {}; 00071 00072 // get the iterator ready to return data; call this before calling currentEntry or nextEntry 00073 virtual void startIteration() = 0; 00074 00075 // get the termData structure associated with this term 00076 virtual TermData* termData() = 0; 00077 00078 // get a list of top documents for this iterator (must call startIteration() first) 00079 virtual const indri::utility::greedy_vector<TopDocument>& topDocuments() = 0; 00080 00081 // return the current document entry if we're not finished, null otherwise. 00082 virtual DocumentData* currentEntry() = 0; 00083 00084 // move to the next document in the list; return false if there are no more valid documents 00085 virtual bool nextEntry() = 0; 00086 00087 // find the first document that contains this term that has an id >= documentID. 00088 // returns false if no such document exists. 00089 virtual bool nextEntry( lemur::api::DOCID_T documentID ) = 0; 00090 00091 // returns true if the iterator has no more entries 00092 virtual bool finished() = 0; 00093 }; 00094 } 00095 } 00096 00097 #endif // INDRI_DOCLISTITERATOR_HPP 00098 00099