lemur::dictionary::PDict Class Reference

Probabilistic dictionary using Keyfile for data storage. More...

#include <PDict.hpp>

List of all members.

Public Member Functions

PDict ()

default constructor

~PDict ()

clean up

DictEntryVector * getTranslations (const string &term, DictEntryFilter *filter=NULL) const

Get dictionary entries (translations) for a term.

int numTranslations (const string &term, DictEntryFilter *filter=NULL) const

Get the number of dictionary entries (translations) for a term.

int getNumPairs () const

Get the total size of the dictionary.

int getSourceCount () const

Get the number of unique terms in the source vocabulary.

int getTargetCount () const

Get the number of unique terms in the target vocabulary.

const string & getName () const

Get the name of the dictionary.

bool isUsingCounts () const

Is the dictionary using counts or probabilities.

void setUsingCounts (bool val)

Set the flag for using counts or probabilities.

void add (const string &source, DictEntry &value, double(*compose)(double, double)=NULL)

Add an entry for a term.

void remove (const string &source, DictEntry &value)

Remove an entry for a term.

void remove (const string &source)

Remove all entries for a term.

void write (const string &outputName, const string &delim)

Output dictionary as plain text, separator delimited values.

bool read (const string &dictName, const string &delim, bool counts=false)

Input a dictionary from plain text, separator delimited values. The input file must contain 4 columns. The columns are: sourceterm;type;targetterm;probability; where type is an arbitrary symbol, such as a part of speech tag.

bool open (const string &dictName)

Open an existing probabilistic dictionary.

bool create (const string &dictName)

Create a new, empty probabilistic dictionary.

void close ()

Close the dictionary. Flushes all buffers and closes all files.

void normalize ()

Normalize probabilities of entries to sum to one. Normalizes all entries, updating the dictionary.

void startIteration ()

Initialize for iteration over all keys.

DictEntryVector * nextTranslations (string &term, DictEntryFilter *filter=NULL) const

Get next key's dictionary entry (translations).

Private Member Functions

void writeTOC () const

write toc file

bool contains (const string &term, lemur::file::Keyfile &keyfile) const

known term?

void flush ()

flush the current entry to table

Private Attributes

dictStats stats

dictionary statistics

DictEntryVector * currentVec

Current term's entry vector.

bool usingCounts

are we storing frequencies or probabilities?

string currentTerm

Current term.

string name

base name for dictionary

lemur::file::Keyfile dict

btree for dictionary entry records.

lemur::file::Keyfile targetIDs

target vocab termName -> freq table (delete if f == 0).

lemur::file::File dictEntries

File for entry data.

Detailed Description

Probabilistic dictionary using Keyfile for data storage.

Constructor & Destructor Documentation

lemur::dictionary::PDict::PDict ( )

default constructor

lemur::dictionary::PDict::~PDict ( )

clean up

Member Function Documentation

void lemur::dictionary::PDict::add ( const string & source,

DictEntry & value,

double(* compose)(double, double) = NULL

)

Add an entry for a term.

Parameters:

source the key for the entry

value the value to add

compose the function used to combine this entry's probability/frequency value with that of an existing entry in the dictionary, if there is one. Default is to sum. Replaces the entry for value if one exists.

void lemur::dictionary::PDict::close ( )

Close the dictionary. Flushes all buffers and closes all files.

bool lemur::dictionary::PDict::contains ( const string & term,

lemur::file::Keyfile & keyfile

) const [private]

known term?

bool lemur::dictionary::PDict::create ( const string & dictName )

Create a new, empty probabilistic dictionary.

Parameters:

dictName the dictionary file to create.

Returns:
true if created successfully. Otherwise false. Create a new dictionary.

void lemur::dictionary::PDict::flush ( ) [private]

flush the current entry to table

const string& lemur::dictionary::PDict::getName ( ) const [inline]

Get the name of the dictionary.

Returns:
The name of the dictionary

int lemur::dictionary::PDict::getNumPairs ( ) const

Get the total size of the dictionary.

Returns:
Total number of pairs in the dictionary

int lemur::dictionary::PDict::getSourceCount ( ) const

Get the number of unique terms in the source vocabulary.

Returns:
Total number of unique source term entries in the dictionary

int lemur::dictionary::PDict::getTargetCount ( ) const

Get the number of unique terms in the target vocabulary.

Returns:
Total number of unique target term entries in the dictionary

lemur::dictionary::DictEntryVector * lemur::dictionary::PDict::getTranslations ( const string & term,

DictEntryFilter * filter = NULL

) const

Get dictionary entries (translations) for a term.

Parameters:

term the term to lookup.

filter to apply to the entries. If unspecified, defaults to NULL.

Returns:
Pointer to the vector of dictionary entries for the term. Caller is responsible for deleting.

bool lemur::dictionary::PDict::isUsingCounts ( ) const [inline]

Is the dictionary using counts or probabilities.

Returns:
true if the dictionary contains frequencies, otherwise false.

lemur::dictionary::DictEntryVector * lemur::dictionary::PDict::nextTranslations ( string & term,

DictEntryFilter * filter = NULL

) const

Get next key's dictionary entry (translations).

Parameters:

term set to the term for this entry.

filter to apply to the entries. If unspecified, defaults to NULL.

Returns:
Pointer to the vector of dictionary entries for the term. Caller is responsible for deleting. Returns NULL at end of file.

void lemur::dictionary::PDict::normalize ( )

Normalize probabilities of entries to sum to one. Normalizes all entries, updating the dictionary.

int lemur::dictionary::PDict::numTranslations ( const string & term,

DictEntryFilter * filter = NULL

) const

Get the number of dictionary entries (translations) for a term.

Parameters:

term the term to lookup.

filter to apply to the entries. If unspecified, defaults to NULL.

Returns:
Number of dictionary entries for the term after filtering.

bool lemur::dictionary::PDict::open ( const string & dictName )

Open an existing probabilistic dictionary.

Parameters:

dictName the dictionary file to open.

Returns:
true if opened successfully. Otherwise false. Open an existing dictionary.

bool lemur::dictionary::PDict::read ( const string & dictName,

const string & delim,

bool counts = false

)

Input a dictionary from plain text, separator delimited values. The input file must contain 4 columns. The columns are: sourceterm;type;targetterm;probability; where type is an arbitrary symbol, such as a part of speech tag.

Parameters:

dictName the file to read

delim the delimiter to use.

counts true if the input file contains frequencies. Default is false.

Returns:
true if created successfully. Otherwise false.

void lemur::dictionary::PDict::remove ( const string & source )

Remove all entries for a term.

Parameters:

source the key for the entry

void lemur::dictionary::PDict::remove ( const string & source,

DictEntry & value

)

Remove an entry for a term.

Parameters:

source the key for the entry

value the value to delete

void lemur::dictionary::PDict::setUsingCounts ( bool val ) [inline]

Set the flag for using counts or probabilities.

Parameters:

val true if the dictionary contains frequencies otherwise false.

void lemur::dictionary::PDict::startIteration ( ) [inline]

Initialize for iteration over all keys.

void lemur::dictionary::PDict::write ( const string & outputName,

const string & delim

)

Output dictionary as plain text, separator delimited values.

Parameters:

outputName the name of the file to write to.

delim the delimiter to use. NB: open question — must the delimiter be a single character? It may need to be escaped where it occurs in source/target terms.

void lemur::dictionary::PDict::writeTOC ( ) const [private]

write toc file

Member Data Documentation

string lemur::dictionary::PDict::currentTerm [private]

Current term.

DictEntryVector* lemur::dictionary::PDict::currentVec [private]

Current term's entry vector.

lemur::file::Keyfile lemur::dictionary::PDict::dict [mutable, private]

btree for dictionary entry records.

lemur::file::File lemur::dictionary::PDict::dictEntries [mutable, private]

File for entry data.

string lemur::dictionary::PDict::name [private]

base name for dictionary

dictStats lemur::dictionary::PDict::stats [private]

dictionary statistics

lemur::file::Keyfile lemur::dictionary::PDict::targetIDs [mutable, private]

target vocab termName -> freq table (delete if f == 0).

bool lemur::dictionary::PDict::usingCounts [private]

are we storing frequencies or probabilities?

The documentation for this class was generated from the following files:

Generated on Tue Jun 15 11:03:05 2010 for Lemur by Doxygen 1.3.4


Public Member Functions
	PDict ()
	default constructor
	~PDict ()
	clean up
DictEntryVector *	getTranslations (const string &term, DictEntryFilter *filter=NULL) const
	Get dictionary entries (translations) for a term.
int	numTranslations (const string &term, DictEntryFilter *filter=NULL) const
	Get the number of dictionary entries (translations) for a term.
int	getNumPairs () const
	Get the total size of the dictionary.
int	getSourceCount () const
	Get the number of unique terms in the source vocabulary.
int	getTargetCount () const
	Get the number of unique terms in the target vocabulary.
const string &	getName () const
	Get the name of the dictionary.
bool	isUsingCounts () const
	Is the dictionary using counts or probabilities.
void	setUsingCounts (bool val)
	Set the flag for using counts or probabilities.
void	add (const string &source, DictEntry &value, double(*compose)(double, double)=NULL)
	Add an entry for a term.
void	remove (const string &source, DictEntry &value)
	Remove an entry for a term.
void	remove (const string &source)
	Remove all entries for a term.
void	write (const string &outputName, const string &delim)
	Output dictionary as plain text, separator delimited values.
bool	read (const string &dictName, const string &delim, bool counts=false)
	Input a dictionary from plain text, separator delimited values. The input file must contain 4 columns. The columns are: sourceterm;type;targetterm;probability; where type is an arbitrary symbol, such as a part of speech tag.
bool	open (const string &dictName)
	Open an existing probabilistic dictionary.
bool	create (const string &dictName)
	Create a new, empty probabilistic dictionary.
void	close ()
	Close the dictionary. Flushes all buffers and closes all files.
void	normalize ()
	Normalize probabilities of entries to sum to one. Normalizes all entries, updating the dictionary.
void	startIteration ()
	Initialize for iteration over all keys.
DictEntryVector *	nextTranslations (string &term, DictEntryFilter *filter=NULL) const
	Get next key's dictionary entry (translations).
Private Member Functions
void	writeTOC () const
	write toc file
bool	contains (const string &term, lemur::file::Keyfile &keyfile) const
	known term?
void	flush ()
	flush the current entry to table
Private Attributes
dictStats	stats
	dictionary statistics
DictEntryVector *	currentVec
	Current term's entry vector.
bool	usingCounts
	are we storing frequencies or probabilities?
string	currentTerm
	Current term.
string	name
	base name for dictionary
lemur::file::Keyfile	dict
	btree for dictionary entry records.
lemur::file::Keyfile	targetIDs
	target vocab termName -> freq table (delete if f == 0).
lemur::file::File	dictEntries
	File for entry data.