00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016 #ifndef _LEMUR_PDICT_HPP
00017 #define _LEMUR_PDICT_HPP
00018
00019 #include "common_headers.hpp"
00020 #include "algorithm"
00021 #include "Keyfile.hpp"
00022 #include "File.hpp"
00023 #include "TextHandlerManager.hpp"
00024 namespace lemur
00025 {
00027 namespace dictionary
00028 {
00029
/// One dictionary entry: a target string, a type string and a probability.
/// Two entries compare equal when both target and type match; prob does
/// not take part in the comparison.
class DictEntry {
public:
  /// Default constructor; defined outside this header.
  DictEntry();

  /// Construct an entry from explicit field values.
  /// @param targ copied into target
  /// @param typ copied into type
  /// @param pr stored in prob
  DictEntry(const string &targ, const string &typ, double pr)
    : target(targ),
      type(typ),
      prob(pr)
  {
  }

  /// Byte-array form of the entry; a size is reported through numBytes.
  /// NOTE(review): presumably the returned buffer is allocated here and
  /// released by the caller -- confirm against the implementation.
  char *toBytes(int &numBytes) const;

  /// Variant of toBytes that works on a caller-supplied buffer.
  /// NOTE(review): the int result is presumably the number of bytes
  /// written -- confirm against the implementation.
  int toBytes(char *buffer) const;

  /// NOTE(review): presumably the size in bytes of the byte-array form of
  /// this entry -- confirm against the implementation.
  int numBytes() const;

  /// Counterpart of toBytes(char *): takes a byte buffer.
  /// NOTE(review): presumably fills the fields from buffer and returns
  /// the number of bytes consumed -- confirm against the implementation.
  int fromBytes(char *buffer);

  /// Equality on target and type only.
  /// @param a the entry to compare with
  /// @return true when both target and type are equal
  bool operator==(const DictEntry & a) const {
    if (target != a.target) {
      return false;
    }
    return type == a.type;
  }

  /// String form of the entry.
  /// @param delim defaults to ";"
  /// NOTE(review): delim is presumably the separator placed between the
  /// fields -- confirm against the implementation.
  string toString(string delim = ";") const;

  /// Target string of the entry.
  string target;
  /// Type string of the entry.
  string type;
  /// Probability value of the entry.
  double prob;
};
00071
00075 class DictEntryFilter {
00076 public:
00079 virtual bool accept(const DictEntry &entry) const = 0;
00080 virtual ~DictEntryFilter() {}
00081 } ;
00082
00083
00087 class AllDictEntryFilter : public DictEntryFilter {
00088 public:
00091 bool accept(const DictEntry &entry) const { return true; }
00092 };
00093
00094
00098 class ProbDictEntryFilter : public DictEntryFilter {
00099 public:
00102 ProbDictEntryFilter(double thresh = 0.0) : threshold(thresh) {
00103 }
00107 bool accept(const DictEntry &entry) const { return entry.prob > threshold; }
00108 private:
00109 double threshold;
00110 };
00111
00112
00116 class TypeDictEntryFilter : public DictEntryFilter {
00117 public:
00120 TypeDictEntryFilter(const string &filtType) : type(filtType) {
00121 }
00125 bool accept(const DictEntry &entry) const { return entry.type == type; }
00126 private:
00127 string type;
00128 };
00129
00134 class StopwordDictEntryFilter : public DictEntryFilter {
00135 public:
00138 StopwordDictEntryFilter(const string &stopwords) {
00139 stopper = lemur::api::TextHandlerManager::createStopper(stopwords);
00140 }
00144 bool accept(const DictEntry &entry) const {
00145 return !(stopper->stopWord(entry.target.c_str()));
00146 }
00147 private:
00148 lemur::api::Stopper *stopper;
00149 };
00150
00152 class DictEntryVector : public vector<DictEntry> {
00153 public:
00154 DictEntryVector() : vector<DictEntry>() {
00155 }
00156 DictEntryVector(char *buffer, DictEntryFilter *filter);
00158 void sortScores() {
00159 sort(this->begin(), this->end(), cmpFn);
00160 }
00165 bool addEntry(DictEntry &entry, double (*compose)(double, double) = NULL);
00166
00170 bool removeEntry(DictEntry &entry);
00171
00175 char *toBytes(int &numBytes) const;
00179
00183 void toBytes(char *buffer) const;
00184
00185 void fromBytes(char *buffer, DictEntryFilter *filter);
00186
00189 int numEntries() const;
00190
00192 void normalize();
00193
00194 private:
00195 class DictEntryProbDescending {
00196 public:
00197 bool operator()(const DictEntry & a, const DictEntry & b) {
00198 return a.prob > b.prob;
00199 }
00200 };
00201 static DictEntryProbDescending cmpFn;
00202 };
00203
/// Three integer size counters kept by PDict.
/// NOTE(review): the meanings given below are inferred from the member
/// names and from PDict's getNumPairs/getSourceCount/getTargetCount
/// accessors -- confirm against the implementation.
struct dictStats {
  /// Presumably the total number of stored pairs.
  int dictSize;

  /// Presumably the number of distinct source terms.
  int sourceSize;

  /// Presumably the number of distinct target terms.
  int targetSize;
};
00213
00217 class PDict {
00218 public:
00220 PDict();
00221
00223 ~PDict();
00224
00230 DictEntryVector *getTranslations(const string &term,
00231 DictEntryFilter *filter=NULL) const ;
00236 int numTranslations(const string &term,
00237 DictEntryFilter *filter=NULL) const;
00240 int getNumPairs() const;
00241
00244 int getSourceCount() const;
00245
00248 int getTargetCount() const ;
00249
00252 const string &getName() const {return name;}
00253
00256 bool isUsingCounts() const {return usingCounts;}
00257
00260 void setUsingCounts(bool val) {usingCounts = val;}
00261
00267 void add(const string &source, DictEntry &value,
00268 double (*compose)(double, double) = NULL);
00269
00273 void remove(const string &source, DictEntry &value);
00274
00277 void remove(const string &source);
00278
00283 void write(const string &outputName, const string &delim);
00284
00294 bool read(const string &dictName, const string &delim, bool counts = false);
00295
00300 bool open(const string &dictName);
00301
00306 bool create(const string &dictName);
00307
00310 void close();
00311
00314 void normalize();
00315
00317 void startIteration() {dict.setFirst();}
00318
00323 DictEntryVector *nextTranslations(string &term,
00324 DictEntryFilter *filter=NULL) const;
00325
00326 private:
00328 void writeTOC() const;
00330 bool contains(const string &term, lemur::file::Keyfile &keyfile) const;
00332 void flush();
00334 dictStats stats;
00336 DictEntryVector* currentVec;
00338 bool usingCounts;
00340 string currentTerm;
00342 string name;
00344 mutable lemur::file::Keyfile dict;
00346 mutable lemur::file::Keyfile targetIDs;
00348 mutable lemur::file::File dictEntries;
00349 };
00350 }
00351 }
00352
00353 #endif // _LEMUR_PDICT_HPP