00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef _UNIGRAMLM_HPP
00014 #define _UNIGRAMLM_HPP
00015
00016 #include "Counter.hpp"
00017 #include "Exception.hpp"
00018 #include "IndexTypes.hpp"
00019 #include <cstring>
00020 namespace lemur
00021 {
00022 namespace langmod
00023 {
00024
00026
00031 class UnigramLM {
00032 public:
00034 virtual double prob(lemur::api::TERMID_T wordIndex) const = 0;
00036 virtual const string lexiconID() const= 0;
00037
00039 virtual void startIteration() const = 0;
00040 virtual bool hasMore() const = 0;
00041 virtual void nextWordProb(lemur::api::TERMID_T &wordIndex, double &prob) const = 0;
00042 };
00043
00044
00046
00047 class SmoothedMLEstimator : public UnigramLM {
00048 public:
00049 SmoothedMLEstimator(const lemur::utility::Counter &counter, const string &lexiconID) : ct(counter), lexID(lexiconID) {}
00050 virtual ~SmoothedMLEstimator() {}
00051
00052 virtual double prob(lemur::api::TERMID_T wordIndex) const {
00053 return (probEstimate(wordIndex, ct.count(wordIndex),ct.sum()));
00054 }
00055
00056 virtual void startIteration() const {
00057 ct.startIteration();
00058 }
00059
00060 virtual bool hasMore() const {
00061 return ct.hasMore();
00062 }
00063
00064 virtual void nextWordProb(lemur::api::TERMID_T &wordIndex, double &prob) const{
00065 double count;
00066
00067 ct.nextCount((int&)wordIndex, count);
00068 prob = probEstimate(wordIndex, count, ct.sum());
00069 }
00070
00071 virtual const string lexiconID() const { return lexID;}
00072
00074 virtual double probEstimate(lemur::api::TERMID_T wordIndex, double wdCount, double sumCount) const=0;
00075
00076 protected:
00077 const lemur::utility::Counter &ct;
00078 const string lexID;
00079 };
00080
00082
00083 class MLUnigramLM : public SmoothedMLEstimator {
00084 public:
00085 MLUnigramLM(const lemur::utility::Counter & counter, const string &lexiconID) : SmoothedMLEstimator(counter, lexiconID) {};
00086 virtual ~MLUnigramLM() {}
00087
00088 virtual double probEstimate(lemur::api::TERMID_T wordIndex, double count, double sum) const{
00089 return (count/sum);
00090 }
00091 };
00092
00094 class LaplaceUnigramLM : public SmoothedMLEstimator {
00095 public:
00096 LaplaceUnigramLM(const lemur::utility::Counter & counter, const string &lexiconID, double vocabSize) : SmoothedMLEstimator(counter, lexiconID), vocSz(vocabSize) {};
00097 virtual ~LaplaceUnigramLM() {}
00098
00099 virtual double probEstimate(lemur::api::TERMID_T wordIndex, double count, double sum) const {
00100 return ((count+1)/(sum+vocSz));
00101 }
00102 private:
00103 double vocSz;
00104 };
00105
00106
00108
00109 class DirichletUnigramLM : public SmoothedMLEstimator {
00110 public:
00111 DirichletUnigramLM(const lemur::utility::Counter & counter, const string &lexiconID,
00112 const UnigramLM &refLM, double priorSampleSize)
00113 : SmoothedMLEstimator(counter, lexiconID), ref(&refLM),
00114 s(priorSampleSize) {}
00115
00116 virtual ~DirichletUnigramLM() {}
00117
00118 virtual double probEstimate(lemur::api::TERMID_T wordIndex, double count, double sum) const {
00119 return ((count+s*ref->prob(wordIndex))/(sum+s));
00120 }
00121
00122 private:
00123 const UnigramLM *ref;
00125 double s;
00126 };
00127
00128
00129
00130
00131
00132
00134
00135 class InterpUnigramLM : public SmoothedMLEstimator {
00136 public:
00137 InterpUnigramLM(const lemur::utility::Counter & counter, const string &lexiconID,
00138 const UnigramLM &refLM, double refCoeff)
00139 : SmoothedMLEstimator(counter, lexiconID), ref(&refLM),
00140 refC(refCoeff) {}
00141
00142 virtual ~InterpUnigramLM() {}
00143
00144 virtual double probEstimate(lemur::api::TERMID_T wordIndex, double count, double sum) const {
00145 return ((1-refC)*count/sum + refC*ref->prob(wordIndex));
00146 }
00147
00148 private:
00149 const UnigramLM *ref;
00151 double refC;
00152 };
00153
00154 }
00155 }
00156 #endif