lemur::index::KeyfileIncIndex Class Reference

#include <KeyfileIncIndex.hpp>

Inheritance diagram for lemur::index::KeyfileIncIndex:

List of all members.


Public Member Functions
	KeyfileIncIndex (const string &prefix, int cachesize=128000000, lemur::api::DOCID_T startdocid=1)
	KeyfileIncIndex ()
	New empty one for index manager to use.
	~KeyfileIncIndex ()
	Clean up.
void	setName (const string &prefix)
	sets the name for this index
bool	beginDoc (const lemur::parse::DocumentProps *dp)
	the beginning of a new document
bool	addTerm (const lemur::api::Term &t)
	adding a term to the current document
void	endDoc (const lemur::parse::DocumentProps *dp)
	signify the end of current document
virtual void	endDoc (const lemur::parse::DocumentProps *dp, const string &mgr)
	signify the end of current document
void	endCollection (const lemur::parse::CollectionProps *cp)
	signify the end of this collection.
void	setDocManager (const string &mgrID)
	set the document manager to use for succeeding documents
void	setMesgStream (ostream *lemStream)
	set the message stream
void	addKnownTerm (lemur::api::TERMID_T termID, lemur::api::LOC_T position)
	update data for an already seen term
lemur::api::TERMID_T	addUnknownTerm (const InvFPTerm *term)
	initialize data for a previously unseen term.
lemur::api::TERMID_T	addUncachedTerm (const InvFPTerm *term)
	update data for a term that is not cached in the term cache.
Open index
bool	open (const string &indexName)
	Open previously created Index with given prefix.
Spelling and index conversion
lemur::api::TERMID_T	term (const lemur::api::TERM_T &word) const
	Convert a term spelling to a termID.
const lemur::api::TERM_T	term (lemur::api::TERMID_T termID) const
	Convert a termID to its spelling.
lemur::api::DOCID_T	document (const lemur::api::EXDOCID_T &docIDStr) const
	Convert a spelling to docID.
const lemur::api::EXDOCID_T	document (lemur::api::DOCID_T docID) const
	Convert a docID to its spelling.
const lemur::api::DocumentManager *	docManager (lemur::api::DOCID_T docID) const
	The document manager for this document.
const lemur::parse::CollectionProps *	collectionProps () const
Summary counts
lemur::api::COUNT_T	docCount () const
	Total count (i.e., number) of documents in collection.
lemur::api::COUNT_T	termCountUnique () const
	Total count of unique terms in collection.
lemur::api::COUNT_T	termCount (lemur::api::TERMID_T termID) const
	Total counts of a term in collection.
lemur::api::COUNT_T	termCount () const
	Total counts of all terms in collection.
float	docLengthAvg () const
	Average document length.
lemur::api::COUNT_T	docCount (lemur::api::TERMID_T termID) const
	Total count of documents containing a given term.
lemur::api::COUNT_T	docLength (lemur::api::DOCID_T docID) const
	Total counts of terms in a document, including stop words maybe.
virtual lemur::api::COUNT_T	totaldocLength (lemur::api::DOCID_T docID) const
	Total counts of terms in a document including stopwords for sure.
lemur::api::COUNT_T	docLengthCounted (lemur::api::DOCID_T docID) const
	Total count of terms in given document, not including stop words.
Index entry access
lemur::api::DocInfoList *	docInfoList (lemur::api::TERMID_T termID) const
	doc entries in a term index (see DocList, InvFPDocList)
lemur::api::TermInfoList *	termInfoList (lemur::api::DOCID_T docID) const
	word entries in a document index (bag of words), TermList
lemur::api::TermInfoList *	termInfoListSeq (lemur::api::DOCID_T docID) const
	word entries in a document index (sequence of words), TermList
Protected Member Functions
bool	tryOpen ()
	try to open an existing index
void	writeTOC (const lemur::parse::CollectionProps *cp)
	write out the table of contents file.
void	writeCache (bool lastRun=false)
	write out the cache
void	lastWriteCache ()
	final run write out of cache
void	mergeCacheSegments ()
	out-of-tree cache management: combine segments into a single segment
void	writeCacheSegment ()
	write out segments
void	writeDocMgrIDs ()
	write out document manager ids
int	docMgrID (const string &mgr)
virtual void	doendDoc (const lemur::parse::DocumentProps *dp, int mgrid)
	handle end of document token.
void	openDBs ()
	open the database files
void	openSegments ()
	open the segment files
void	createDBs ()
	create the database files
void	fullToc ()
	read in the full table of contents (TOC)
bool	docMgrIDs ()
	read in document manager internal and external ids map
record	fetchDocumentRecord (lemur::api::DOCID_T key) const
	retrieve a document record.
void	addDocumentLookup (lemur::api::DOCID_T documentKey, const char *documentName)
	store a document record
void	addTermLookup (lemur::api::TERMID_T termKey, const char *termSpelling)
	store a term record
void	addGeneralLookup (lemur::file::Keyfile &numberNameIndex, lemur::file::Keyfile &nameNumberIndex, lemur::api::TERMID_T number, const char *name)
	store a record
InvFPDocList *	internalDocInfoList (lemur::api::TERMID_T termID) const
	retrieve and construct the DocInfoList for a term.
void	_updateTermlist (InvFPDocList *curlist, lemur::api::LOC_T position)
	add a position to a DocInfoList
int	_cacheSize ()
	total memory used by cache
void	_computeMemoryBounds (int memorySize)
	cache size limits based on cachesize parameter to constructor
void	_resetEstimatePoint ()
	Approximate how many updates to collect before flushing the cache.
Protected Attributes
int	listlengths
	how long all the lists are
lemur::api::COUNT_T *	counts
	array to hold all the overall count stats of this db
std::vector< std::string >	names
	array to hold all the names for files we need for this db
float	aveDocLen
	the average document length in this index
vector< std::string >	docmgrs
	list of document managers
ostream *	msgstream
	Lemur code messages stream.
lemur::file::Keyfile	invlookup
	termID -> TermData (term statistics and inverted list segment offsets)
lemur::file::Keyfile	dIDs
	documentName -> documentID
lemur::file::Keyfile	dSTRs
	documentID -> documentName
lemur::file::Keyfile	tIDs
	termName -> termID
lemur::file::Keyfile	tSTRs
	termID -> termName
lemur::file::File	dtlookup
	document statistics (document length, etc.)
lemur::file::ReadBuffer *	dtlookupReadBuffer
	read buffer for dtlookup
lemur::file::File	writetlist
char	termKey [MAX_TERM_LENGTH]
	buffers for term() lookup functions
char	docKey [MAX_DOCID_LENGTH]
	buffers for document() lookup functions
int	_listsSize
	memory for use by inverted list buffers
int	_memorySize
	upper bound for memory use
std::string	name
	the prefix name
vector< InvFPDocList * >	invertlists
	array of pointers to doclists
vector< LocatedTerm >	termlist
	list of terms and their locations in this document
int	curdocmgr
	the current docmanager to use
vector< lemur::api::DocumentManager * >	docMgrs
	list of document manager objects
lemur::utility::TermCache	_cache
	cache of term entries
lemur::parse::BasicCollectionProps *	cprops
	list of collection properties
std::vector< lemur::file::File * >	_segments
	out-of-tree segments for data
lemur::api::TERMID_T	_largestFlushedTermID
	highest term id flushed to disk.
int	_estimatePoint
	invertlists point where we should next check on the cache size
bool	ignoreDoc
	are we in a bad document state?
bool	_readOnly
	are we read only

Detailed Description

KeyfileIncIndex builds an index, assigning termids and docids, tracking the locations of terms within documents, and tracking the terms within documents. It also expects a DocumentProps to carry the total number of terms that were in a document. It expects that all stopping and stemming (if any) occur before the term is passed in. If used with an existing index, new documents are added incrementally. Records are stored in keyfile b-trees. KeyfileIncIndex also provides the Index API for using the index.

Constructor & Destructor Documentation

lemur::index::KeyfileIncIndex::KeyfileIncIndex ( const string & prefix,

int cachesize = 128000000,

lemur::api::DOCID_T startdocid = 1

)

Instantiate with index name without extension. Optionally pass in cachesize and starting document id number.

lemur::index::KeyfileIncIndex::KeyfileIncIndex ( )

New empty one for index manager to use.

lemur::index::KeyfileIncIndex::~KeyfileIncIndex ( )

Clean up.

Member Function Documentation

int lemur::index::KeyfileIncIndex::_cacheSize ( ) [protected]

total memory used by cache

void lemur::index::KeyfileIncIndex::_computeMemoryBounds ( int memorySize ) [protected]

cache size limits based on cachesize parameter to constructor

void lemur::index::KeyfileIncIndex::_resetEstimatePoint ( ) [protected]

Approximate how many updates to collect before flushing the cache.

void lemur::index::KeyfileIncIndex::_updateTermlist ( InvFPDocList * curlist,

lemur::api::LOC_T position

) [protected]

add a position to a DocInfoList

void lemur::index::KeyfileIncIndex::addDocumentLookup ( lemur::api::DOCID_T documentKey,

const char * documentName

) [protected]

store a document record

void lemur::index::KeyfileIncIndex::addGeneralLookup ( lemur::file::Keyfile & numberNameIndex,

lemur::file::Keyfile & nameNumberIndex,

lemur::api::TERMID_T number,

const char * name

) [protected]

store a record

void lemur::index::KeyfileIncIndex::addKnownTerm ( lemur::api::TERMID_T termID,

lemur::api::LOC_T position

)

update data for an already seen term

bool lemur::index::KeyfileIncIndex::addTerm ( const lemur::api::Term & t ) [virtual]

adding a term to the current document

Implements lemur::index::PushIndex.

void lemur::index::KeyfileIncIndex::addTermLookup ( lemur::api::TERMID_T termKey,

const char * termSpelling

) [protected]

store a term record

lemur::api::TERMID_T lemur::index::KeyfileIncIndex::addUncachedTerm ( const InvFPTerm * term )

update data for a term that is not cached in the term cache.

lemur::api::TERMID_T lemur::index::KeyfileIncIndex::addUnknownTerm ( const InvFPTerm * term )

initialize data for a previously unseen term.

bool lemur::index::KeyfileIncIndex::beginDoc ( const lemur::parse::DocumentProps * dp ) [virtual]

the beginning of a new document

Implements lemur::index::PushIndex.

const lemur::parse::CollectionProps * lemur::index::KeyfileIncIndex::collectionProps ( ) const [virtual]

Return whatever collection properties might have been passed in during indexing with a call to PushIndex::endCollection(CollectionProps).
Reimplemented from lemur::api::Index.

void lemur::index::KeyfileIncIndex::createDBs ( ) [protected]

create the database files

lemur::api::COUNT_T lemur::index::KeyfileIncIndex::docCount ( lemur::api::TERMID_T termID ) const [virtual]

Total count of documents containing a given term.

Implements lemur::api::Index.

lemur::api::COUNT_T lemur::index::KeyfileIncIndex::docCount ( ) const [inline, virtual]

Total count (i.e., number) of documents in collection.

Implements lemur::api::Index.

lemur::api::DocInfoList * lemur::index::KeyfileIncIndex::docInfoList ( lemur::api::TERMID_T termID ) const [virtual]

doc entries in a term index (see DocList, InvFPDocList)

Implements lemur::api::Index.

lemur::api::COUNT_T lemur::index::KeyfileIncIndex::docLength ( lemur::api::DOCID_T docID ) const [virtual]

Total counts of terms in a document, including stop words maybe.

Implements lemur::api::Index.

float lemur::index::KeyfileIncIndex::docLengthAvg ( ) const [virtual]

Average document length.

Implements lemur::api::Index.

lemur::api::COUNT_T lemur::index::KeyfileIncIndex::docLengthCounted ( lemur::api::DOCID_T docID ) const

Total count of terms in given document, not including stop words.

const lemur::api::DocumentManager * lemur::index::KeyfileIncIndex::docManager ( lemur::api::DOCID_T docID ) const [virtual]

The document manager for this document.

Reimplemented from lemur::api::Index.

int lemur::index::KeyfileIncIndex::docMgrID ( const string & mgr ) [protected]

Returns the internal id of the given docmgr. If it is not already registered, mgr will be added.

bool lemur::index::KeyfileIncIndex::docMgrIDs ( ) [protected]

read in document manager internal and external ids map

const lemur::api::EXDOCID_T lemur::index::KeyfileIncIndex::document ( lemur::api::DOCID_T docID ) const [virtual]

Convert a docID to its spelling.

Implements lemur::api::Index.

lemur::api::DOCID_T lemur::index::KeyfileIncIndex::document ( const lemur::api::EXDOCID_T & docIDStr ) const

Convert a spelling to docID.

void lemur::index::KeyfileIncIndex::doendDoc ( const lemur::parse::DocumentProps * dp,

int mgrid

) [protected, virtual]

handle end of document token.

void lemur::index::KeyfileIncIndex::endCollection ( const lemur::parse::CollectionProps * cp ) [virtual]

signify the end of this collection.

Implements lemur::index::PushIndex.

void lemur::index::KeyfileIncIndex::endDoc ( const lemur::parse::DocumentProps * dp,

const string & mgr

) [virtual]

signify the end of current document

void lemur::index::KeyfileIncIndex::endDoc ( const lemur::parse::DocumentProps * dp ) [virtual]

signify the end of current document

Implements lemur::index::PushIndex.

lemur::index::KeyfileIncIndex::record lemur::index::KeyfileIncIndex::fetchDocumentRecord ( lemur::api::DOCID_T key ) const [protected]

retrieve a document record.

void lemur::index::KeyfileIncIndex::fullToc ( ) [protected]

read in the full table of contents (TOC)

lemur::index::InvFPDocList * lemur::index::KeyfileIncIndex::internalDocInfoList ( lemur::api::TERMID_T termID ) const [protected]

retrieve and construct the DocInfoList for a term.

void lemur::index::KeyfileIncIndex::lastWriteCache ( ) [protected]

final run write out of cache

void lemur::index::KeyfileIncIndex::mergeCacheSegments ( ) [protected]

out-of-tree cache management: combine segments into a single segment

bool lemur::index::KeyfileIncIndex::open ( const string & indexName ) [virtual]

Open previously created Index with given prefix.

Implements lemur::api::Index.

void lemur::index::KeyfileIncIndex::openDBs ( ) [protected]

open the database files

void lemur::index::KeyfileIncIndex::openSegments ( ) [protected]

open the segment files

void lemur::index::KeyfileIncIndex::setDocManager ( const string & mgrID ) [virtual]

set the document manager to use for succeeding documents

Implements lemur::index::PushIndex.

void lemur::index::KeyfileIncIndex::setMesgStream ( ostream * lemStream )

set the message stream

void lemur::index::KeyfileIncIndex::setName ( const string & prefix )

sets the name for this index

const lemur::api::TERM_T lemur::index::KeyfileIncIndex::term ( lemur::api::TERMID_T termID ) const [virtual]

Convert a termID to its spelling.

Implements lemur::api::Index.

lemur::api::TERMID_T lemur::index::KeyfileIncIndex::term ( const lemur::api::TERM_T & word ) const

Convert a term spelling to a termID.

lemur::api::COUNT_T lemur::index::KeyfileIncIndex::termCount ( ) const [inline, virtual]

Total counts of all terms in collection.

Implements lemur::api::Index.

lemur::api::COUNT_T lemur::index::KeyfileIncIndex::termCount ( lemur::api::TERMID_T termID ) const [virtual]

Total counts of a term in collection.

Implements lemur::api::Index.

lemur::api::COUNT_T lemur::index::KeyfileIncIndex::termCountUnique ( ) const [inline, virtual]

Total count of unique terms in collection.

Implements lemur::api::Index.

lemur::api::TermInfoList * lemur::index::KeyfileIncIndex::termInfoList ( lemur::api::DOCID_T docID ) const [virtual]

word entries in a document index (bag of words), TermList

Implements lemur::api::Index.

lemur::api::TermInfoList * lemur::index::KeyfileIncIndex::termInfoListSeq ( lemur::api::DOCID_T docID ) const [virtual]

word entries in a document index (sequence of words), TermList

Reimplemented from lemur::api::Index.

lemur::api::COUNT_T lemur::index::KeyfileIncIndex::totaldocLength ( lemur::api::DOCID_T docID ) const [virtual]

Total counts of terms in a document including stopwords for sure.

bool lemur::index::KeyfileIncIndex::tryOpen ( ) [protected]

try to open an existing index

void lemur::index::KeyfileIncIndex::writeCache ( bool lastRun = false ) [protected]

write out the cache

void lemur::index::KeyfileIncIndex::writeCacheSegment ( ) [protected]

write out segments

void lemur::index::KeyfileIncIndex::writeDocMgrIDs ( ) [protected]

write out document manager ids

void lemur::index::KeyfileIncIndex::writeTOC ( const lemur::parse::CollectionProps * cp ) [protected]

write out the table of contents file.

Member Data Documentation

lemur::utility::TermCache lemur::index::KeyfileIncIndex::_cache [protected]

cache of term entries

int lemur::index::KeyfileIncIndex::_estimatePoint [protected]

invertlists point where we should next check on the cache size

lemur::api::TERMID_T lemur::index::KeyfileIncIndex::_largestFlushedTermID [protected]

highest term id flushed to disk.

int lemur::index::KeyfileIncIndex::_listsSize [protected]

memory for use by inverted list buffers

int lemur::index::KeyfileIncIndex::_memorySize [protected]

upper bound for memory use

bool lemur::index::KeyfileIncIndex::_readOnly [protected]

are we read only

std::vector<lemur::file::File*> lemur::index::KeyfileIncIndex::_segments [protected]

out-of-tree segments for data

float lemur::index::KeyfileIncIndex::aveDocLen [protected]

the average document length in this index

lemur::api::COUNT_T* lemur::index::KeyfileIncIndex::counts [protected]

array to hold all the overall count stats of this db

lemur::parse::BasicCollectionProps* lemur::index::KeyfileIncIndex::cprops [mutable, protected]

list of collection properties

int lemur::index::KeyfileIncIndex::curdocmgr [protected]

the current docmanager to use

lemur::file::Keyfile lemur::index::KeyfileIncIndex::dIDs [mutable, protected]

documentName -> documentID

char lemur::index::KeyfileIncIndex::docKey[MAX_DOCID_LENGTH] [mutable, protected]

buffers for document() lookup functions

vector<lemur::api::DocumentManager*> lemur::index::KeyfileIncIndex::docMgrs [protected]

list of document manager objects

vector<std::string> lemur::index::KeyfileIncIndex::docmgrs [protected]

list of document managers

lemur::file::Keyfile lemur::index::KeyfileIncIndex::dSTRs [mutable, protected]

documentID -> documentName

lemur::file::File lemur::index::KeyfileIncIndex::dtlookup [mutable, protected]

document statistics (document length, etc.)

lemur::file::ReadBuffer* lemur::index::KeyfileIncIndex::dtlookupReadBuffer [protected]

read buffer for dtlookup

bool lemur::index::KeyfileIncIndex::ignoreDoc [protected]

are we in a bad document state?

vector<InvFPDocList*> lemur::index::KeyfileIncIndex::invertlists [protected]

array of pointers to doclists

lemur::file::Keyfile lemur::index::KeyfileIncIndex::invlookup [mutable, protected]

termID -> TermData (term statistics and inverted list segment offsets)

int lemur::index::KeyfileIncIndex::listlengths [protected]

how long all the lists are

ostream* lemur::index::KeyfileIncIndex::msgstream [protected]

Lemur code messages stream.

std::string lemur::index::KeyfileIncIndex::name [protected]

the prefix name

std::vector<std::string> lemur::index::KeyfileIncIndex::names [protected]

array to hold all the names for files we need for this db

char lemur::index::KeyfileIncIndex::termKey[MAX_TERM_LENGTH] [mutable, protected]

buffers for term() lookup functions

vector<LocatedTerm> lemur::index::KeyfileIncIndex::termlist [protected]

list of terms and their locations in this document

lemur::file::Keyfile lemur::index::KeyfileIncIndex::tIDs [mutable, protected]

termName -> termID

lemur::file::Keyfile lemur::index::KeyfileIncIndex::tSTRs [mutable, protected]

termID -> termName

lemur::file::File lemur::index::KeyfileIncIndex::writetlist [mutable, protected]

filestream for writing the list of located terms; mutable for index access mode of Index API (not PushIndex)

The documentation for this class was generated from the following files:

Generated on Tue Jun 15 11:03:06 2010 for Lemur by

1.3.4

lemur::index::KeyfileIncIndex Class Reference

Public Member Functions

Protected Member Functions

Protected Attributes

Detailed Description

Constructor & Destructor Documentation

Member Function Documentation

Member Data Documentation