00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 #ifndef _QRYBASEDSAMPLER_HPP 00013 #define _QRYBASEDSAMPLER_HPP 00014 00015 00016 00017 #include "FreqCounter.hpp" 00018 #include "DBManager.hpp" 00019 00020 namespace lemur 00021 { 00022 namespace distrib 00023 { 00024 00026 typedef set<docid_t, less<string> > docidset; 00027 00029 #define T_NDOCS 1 00030 00031 #define T_NWORDS 2 00032 00033 #define T_NQRYS 4 00034 00038 class QryBasedSampler { 00039 public: 00040 QryBasedSampler(); 00041 ~QryBasedSampler(); 00042 00044 bool probe(const char * initQuery); 00045 00047 void setDBManager(const DBManager * database); 00048 00050 const DBManager * getDBManager() const; 00051 00052 00055 void setFreqCounter(FreqCounter * counter); 00056 00058 const FreqCounter * getFreqCounter() const; 00059 00060 00064 void setOutputPrefix(const string &prefix); 00065 00067 const string &getOutputPrefix() const; 00068 00070 void setNumDocs(int n); 00071 00073 int getNumDocs() const; 00074 00075 00077 void setNumWords(int n); 00078 00080 int getNumWords() const; 00081 00082 00084 void setNumQueries(int n); 00085 00087 int getNumQueries() const; 00088 00089 00096 void setTermMode(int m); 00097 00099 int getTermMode() const; 00100 00101 00103 void setDocsPerQuery(int n); 00104 00106 int getDocsPerQuery() const; 00107 00108 00109 private: 00110 00111 /* for querying a db */ 00112 const DBManager * db; 00113 00114 00115 /* for building a description of a db */ 00116 FreqCounter * freqCounter; 00117 00118 00119 /* output prefix for filenames */ 00120 string outputPrefix; 00121 00122 00123 /* termination mode of the probe - 00124 * either T_NDOCS or T_NWORDS */ 00125 int termMode; 00126 00127 /* number unique docs to retrieve - only used if 00128 * termMode == T_NDOCS */ 00129 int numDocs; 00130 00131 /* number unique words to retrieve - only used if 00132 * termMode == T_NWORDS */ 00133 int numWords; 00134 00135 /* number of queries to run - only used if 00136 * termMode == T_NQRYS */ 00137 int numQueries; 00138 00139 /* documents per query to use */ 00140 int docsPerQuery; 00141 00142 /* stores the ids of the document already retrieved 00143 * from the system. used to prevent parsing 00144 * a document multiple times */ 00145 docidset seenDocs; 00146 }; 00147 } 00148 } 00149 00150 #endif