00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 #ifndef _FREQCOUNTER_HPP 00013 #define _FREQCOUNTER_HPP 00014 00015 #include <map> 00016 #include <set> 00017 #include "InvFPTypes.hpp" 00018 #include "TextHandler.hpp" 00019 #include "Stopper.hpp" 00020 namespace lemur 00021 { 00022 namespace distrib 00023 { 00024 00026 #define R_CTF 0 00027 00028 #define R_DF 1 00029 00030 #define R_AVE_TF 2 00031 00032 #define R_UNIFORM 3 00033 00035 typedef struct freqinfo_tt { 00036 char * word; 00037 int ctf; 00038 int df; 00039 } freqinfo_t; 00040 00042 typedef map<std::string , freqinfo_t, less<std::string> > freqmap; 00044 typedef set<std::string , less<std::string> > stringset; 00045 00046 00052 class FreqCounter : public lemur::api::TextHandler { 00053 00054 public: 00057 FreqCounter(const lemur::api::Stopper * stopWords = NULL); 00060 FreqCounter(const string &filename, const lemur::api::Stopper * stopWords = NULL); 00061 00063 ~FreqCounter(); 00064 00066 void clear(); 00067 00069 void output(const string &filename) const; 00070 00073 char * randomWord(); 00080 void setRandomMode(int mode); 00082 int getRandomMode() const; 00083 00086 char * randomCtf() const; 00089 char * randomDf() const; 00092 char * randomAveTf() const; 00095 char * randomUniform() const; 00096 00097 00099 int numWords() const; 00101 int totWords() const; 00102 00104 const freqmap * getFreqInfo() const; 00105 00107 int getCtf(const char * word) const; 00109 int getDf(const char * word) const; 00111 double getAveTf(const char * word) const; 00112 00114 double ctfRatio(FreqCounter & lm1) const; 00115 00117 char * handleDoc(char * docno); 00119 char * handleWord(char * word); 00120 00122 void endDoc(); 00123 00125 void setName(const string &freqCounterName); 00127 const string & getName() const; 00128 00130 void pruneBottomWords(int topWords); 00131 00132 00133 protected: 00134 /* Loads a language model from file. */ 00135 void input(const string &filename); 00136 00137 /* Collection term frequencies. */ 00138 mutable freqmap freqInfo; 00139 00140 /* Words in a doc. */ 00141 stringset doc; 00142 /* Random words returned so far. */ 00143 stringset randdone; 00144 00145 /* The frequency counter's name. */ 00146 string name; 00147 00148 /* Stopword list */ 00149 const lemur::api::Stopper * stopper; 00150 00151 00152 /* used for calculating probabilities when 00153 * selecting a random word 00154 */ 00155 /* Sum over words of ctf. */ 00156 long ctfTot; 00157 /* Sum over words of df. */ 00158 int dfTot; 00159 /* Sum over words of average tf. */ 00160 mutable long double avetfTot; 00161 /* Indicates whether avetfTot is valid (true) 00162 * or needs to be recalculated (false). */ 00163 mutable bool atfValid; 00164 /* Random selection mode. */ 00165 int randomMode; 00166 /* Number of unique words. */ 00167 int nWords; 00168 00169 00170 }; 00171 00172 } 00173 } 00174 00175 00176 #endif