#include <KeyfileIncIndex.hpp>
Inheritance diagram for lemur::index::KeyfileIncIndex:
Public Member Functions | |
KeyfileIncIndex (const string &prefix, int cachesize=128000000, lemur::api::DOCID_T startdocid=1) | |
KeyfileIncIndex () | |
New empty one for index manager to use. | |
~KeyfileIncIndex () | |
Clean up. | |
void | setName (const string &prefix) |
sets the name for this index | |
bool | beginDoc (const lemur::parse::DocumentProps *dp) |
the beginning of a new document | |
bool | addTerm (const lemur::api::Term &t) |
adding a term to the current document | |
void | endDoc (const lemur::parse::DocumentProps *dp) |
signify the end of current document | |
virtual void | endDoc (const lemur::parse::DocumentProps *dp, const string &mgr) |
signify the end of current document | |
void | endCollection (const lemur::parse::CollectionProps *cp) |
signify the end of this collection. | |
void | setDocManager (const string &mgrID) |
set the document manager to use for succeeding documents | |
void | setMesgStream (ostream *lemStream) |
set the message stream | |
void | addKnownTerm (lemur::api::TERMID_T termID, lemur::api::LOC_T position) |
update data for an already seen term | |
lemur::api::TERMID_T | addUnknownTerm (const InvFPTerm *term) |
initialize data for a previously unseen term. | |
lemur::api::TERMID_T | addUncachedTerm (const InvFPTerm *term) |
update data for a term that is not cached in the term cache. | |
Open index | |
bool | open (const string &indexName) |
Open previously created Index with given prefix. | |
Spelling and index conversion | |
lemur::api::TERMID_T | term (const lemur::api::TERM_T &word) const |
Convert a term spelling to a termID. | |
const lemur::api::TERM_T | term (lemur::api::TERMID_T termID) const |
Convert a termID to its spelling. | |
lemur::api::DOCID_T | document (const lemur::api::EXDOCID_T &docIDStr) const |
Convert a spelling to docID. | |
const lemur::api::EXDOCID_T | document (lemur::api::DOCID_T docID) const |
Convert a docID to its spelling. | |
const lemur::api::DocumentManager * | docManager (lemur::api::DOCID_T docID) const |
The document manager for this document. | |
const lemur::parse::CollectionProps * | collectionProps () const |
Summary counts | |
lemur::api::COUNT_T | docCount () const |
Total count (i.e., number) of documents in collection. | |
lemur::api::COUNT_T | termCountUnique () const |
Total count of unique terms in collection. | |
lemur::api::COUNT_T | termCount (lemur::api::TERMID_T termID) const |
Total counts of a term in collection. | |
lemur::api::COUNT_T | termCount () const |
Total counts of all terms in collection. | |
float | docLengthAvg () const |
Average document length. | |
lemur::api::COUNT_T | docCount (lemur::api::TERMID_T termID) const |
Total count of documents containing a given term. | |
lemur::api::COUNT_T | docLength (lemur::api::DOCID_T docID) const |
Total count of terms in a document, possibly including stop words. | |
virtual lemur::api::COUNT_T | totaldocLength (lemur::api::DOCID_T docID) const |
Total count of terms in a document, always including stop words. | |
lemur::api::COUNT_T | docLengthCounted (lemur::api::DOCID_T docID) const |
Total count of terms in given document, not including stop words. | |
Index entry access | |
lemur::api::DocInfoList * | docInfoList (lemur::api::TERMID_T termID) const |
doc entries in a term index, DocList InvFPDocList | |
lemur::api::TermInfoList * | termInfoList (lemur::api::DOCID_T docID) const |
word entries in a document index (bag of words), TermList | |
lemur::api::TermInfoList * | termInfoListSeq (lemur::api::DOCID_T docID) const |
word entries in a document index (sequence of words), TermList | |
Protected Member Functions | |
bool | tryOpen () |
try to open an existing index | |
void | writeTOC (const lemur::parse::CollectionProps *cp) |
write out the table of contents file. | |
void | writeCache (bool lastRun=false) |
write out the cache | |
void | lastWriteCache () |
final run write out of cache | |
void | mergeCacheSegments () |
out-of-tree cache management: combine segments into a single segment | |
void | writeCacheSegment () |
write out segments | |
void | writeDocMgrIDs () |
write out document manager ids | |
int | docMgrID (const string &mgr) |
virtual void | doendDoc (const lemur::parse::DocumentProps *dp, int mgrid) |
handle end of document token. | |
void | openDBs () |
open the database files | |
void | openSegments () |
open the segment files | |
void | createDBs () |
create the database files | |
void | fullToc () |
read in the entire table of contents (TOC) | |
bool | docMgrIDs () |
read in document manager internal and external ids map | |
record | fetchDocumentRecord (lemur::api::DOCID_T key) const |
retrieve a document record. | |
void | addDocumentLookup (lemur::api::DOCID_T documentKey, const char *documentName) |
store a document record | |
void | addTermLookup (lemur::api::TERMID_T termKey, const char *termSpelling) |
store a term record | |
void | addGeneralLookup (lemur::file::Keyfile &numberNameIndex, lemur::file::Keyfile &nameNumberIndex, lemur::api::TERMID_T number, const char *name) |
store a record | |
InvFPDocList * | internalDocInfoList (lemur::api::TERMID_T termID) const |
retrieve and construct the DocInfoList for a term. | |
void | _updateTermlist (InvFPDocList *curlist, lemur::api::LOC_T position) |
add a position to a DocInfoList | |
int | _cacheSize () |
total memory used by cache | |
void | _computeMemoryBounds (int memorySize) |
cache size limits based on cachesize parameter to constructor | |
void | _resetEstimatePoint () |
Approximate how many updates to collect before flushing the cache. | |
Protected Attributes | |
int | listlengths |
how long all the lists are | |
lemur::api::COUNT_T * | counts |
array to hold all the overall count stats of this db | |
std::vector< std::string > | names |
array to hold all the names for files we need for this db | |
float | aveDocLen |
the average document length in this index | |
vector< std::string > | docmgrs |
list of document managers | |
ostream * | msgstream |
Lemur code messages stream. | |
lemur::file::Keyfile | invlookup |
termID -> TermData (term statistics and inverted list segment offsets) | |
lemur::file::Keyfile | dIDs |
documentName -> documentID | |
lemur::file::Keyfile | dSTRs |
documentID -> documentName | |
lemur::file::Keyfile | tIDs |
termName -> termID | |
lemur::file::Keyfile | tSTRs |
termID -> termName | |
lemur::file::File | dtlookup |
document statistics (document length, etc.) | |
lemur::file::ReadBuffer * | dtlookupReadBuffer |
read buffer for dtlookup | |
lemur::file::File | writetlist |
char | termKey [MAX_TERM_LENGTH] |
buffers for term() lookup functions | |
char | docKey [MAX_DOCID_LENGTH] |
buffers for document() lookup functions | |
int | _listsSize |
memory for use by inverted list buffers | |
int | _memorySize |
upper bound for memory use | |
std::string | name |
the prefix name | |
vector< InvFPDocList * > | invertlists |
array of pointers to doclists | |
vector< LocatedTerm > | termlist |
list of terms and their locations in this document | |
int | curdocmgr |
the current docmanager to use | |
vector< lemur::api::DocumentManager * > | docMgrs |
list of document manager objects | |
lemur::utility::TermCache | _cache |
cache of term entries | |
lemur::parse::BasicCollectionProps * | cprops |
list of collection properties | |
std::vector< lemur::file::File * > | _segments |
out-of-tree segments for data | |
lemur::api::TERMID_T | _largestFlushedTermID |
highest term id flushed to disk. | |
int | _estimatePoint |
invertlists point where we should next check on the cache size | |
bool | ignoreDoc |
are we in a bad document state? | |
bool | _readOnly |
are we read only |
|
Instantiate with index name without extension. Optionally pass in cachesize and starting document id number. |
|
New empty one for index manager to use.
|
|
Clean up.
|
|
total memory used by cache
|
|
cache size limits based on cachesize parameter to constructor
|
|
Approximate how many updates to collect before flushing the cache.
|
|
add a position to a DocInfoList
|
|
store a document record
|
|
store a record
|
|
update data for an already seen term
|
|
adding a term to the current document
Implements lemur::index::PushIndex. |
|
store a term record
|
|
update data for a term that is not cached in the term cache.
|
|
initialize data for a previously unseen term.
|
|
the beginning of a new document
Implements lemur::index::PushIndex. |
|
Return whatever collection properties might have been passed in during indexing via a call to PushIndex::endCollection(CollectionProps). Reimplemented from lemur::api::Index. |
|
create the database files
|
|
Total count of documents containing a given term.
Implements lemur::api::Index. |
|
Total count (i.e., number) of documents in collection.
Implements lemur::api::Index. |
|
doc entries in a term index, DocList InvFPDocList
Implements lemur::api::Index. |
|
Total count of terms in a document, possibly including stop words.
Implements lemur::api::Index. |
|
Average document length.
Implements lemur::api::Index. |
|
Total count of terms in given document, not including stop words.
|
|
The document manager for this document.
Reimplemented from lemur::api::Index. |
|
Returns the internal id of the given docmgr; if not already registered, mgr will be added. |
|
read in document manager internal and external ids map
|
|
Convert a docID to its spelling.
Implements lemur::api::Index. |
|
Convert a spelling to docID.
|
|
handle end of document token.
|
|
signify the end of this collection.
Implements lemur::index::PushIndex. |
|
signify the end of current document
|
|
signify the end of current document
Implements lemur::index::PushIndex. |
|
retrieve a document record.
|
|
read in the entire table of contents (TOC)
|
|
retrieve and construct the DocInfoList for a term.
|
|
final run write out of cache
|
|
out-of-tree cache management: combine segments into a single segment
|
|
Open previously created Index with given prefix.
Implements lemur::api::Index. |
|
open the database files
|
|
open the segment files
|
|
set the document manager to use for succeeding documents
Implements lemur::index::PushIndex. |
|
set the message stream
|
|
sets the name for this index
|
|
Convert a termID to its spelling.
Implements lemur::api::Index. |
|
Convert a term spelling to a termID.
|
|
Total counts of all terms in collection.
Implements lemur::api::Index. |
|
Total counts of a term in collection.
Implements lemur::api::Index. |
|
Total count of unique terms in collection.
Implements lemur::api::Index. |
|
word entries in a document index (bag of words), TermList
Implements lemur::api::Index. |
|
word entries in a document index (sequence of words), TermList
Reimplemented from lemur::api::Index. |
|
Total count of terms in a document, always including stop words.
|
|
try to open an existing index
|
|
write out the cache
|
|
write out segments
|
|
write out document manager ids
|
|
write out the table of contents file.
|
|
cache of term entries
|
|
invertlists point where we should next check on the cache size
|
|
highest term id flushed to disk.
|
|
memory for use by inverted list buffers
|
|
upper bound for memory use
|
|
are we read only
|
|
out-of-tree segments for data
|
|
the average document length in this index
|
|
array to hold all the overall count stats of this db
|
|
list of collection properties
|
|
the current docmanager to use
|
|
documentName -> documentID
|
|
buffers for document() lookup functions
|
|
list of document manager objects
|
|
list of document managers
|
|
documentID -> documentName
|
|
document statistics (document length, etc.)
|
|
read buffer for dtlookup
|
|
are we in a bad document state?
|
|
array of pointers to doclists
|
|
termID -> TermData (term statistics and inverted list segment offsets)
|
|
how long all the lists are
|
|
Lemur code messages stream.
|
|
the prefix name
|
|
array to hold all the names for files we need for this db
|
|
buffers for term() lookup functions
|
|
list of terms and their locations in this document
|
|
termName -> termID
|
|
termID -> termName
|
|
filestream for writing the list of located terms; mutable for the index access mode of the Index API (not PushIndex). |