00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 /* type definitions for objects we will use */ 00014 #ifndef _INVFPTYPES_H 00015 #define _INVFPTYPES_H 00016 00017 #include "common_headers.hpp" 00018 #include "IndexTypes.hpp" 00019 #include <cstring> 00020 namespace lemur 00021 { 00022 namespace index 00023 { 00024 00025 #define IND_VERSION "4.12" 00026 00027 // suffixes for filenames 00028 #define INVINDEX ".invf" 00029 #define INVFPINDEX ".invfp" 00030 #define INVLOOKUP ".invlookup" 00031 #define DTINDEX ".dt" 00032 #define DTLOOKUP ".dtlookup" 00033 #define TERMIDMAP ".tid" 00034 #define TERMIDSTRMAP ".tidstr" 00035 #define DOCIDMAP ".did" 00036 #define DOCIDSTRMAP ".didstr" 00037 #define MAINTOC ".inv" 00038 #define INVFPTOC ".ifp" 00039 #define DOCMGRMAP ".dm" 00040 00041 // what to call out of vocabulary ids 00042 #define INVALID_STR "[OOV]" 00043 00044 // name for parameters 00045 #define VERSION_PAR "VERSION" 00046 #define NUMDOCS_PAR "NUM_DOCS" 00047 #define NUMTERMS_PAR "NUM_TERMS" 00048 #define NUMUTERMS_PAR "NUM_UNIQUE_TERMS" 00049 #define AVEDOCLEN_PAR "AVE_DOCLEN" 00050 #define INVINDEX_PAR "INV_INDEX" 00051 #define INVLOOKUP_PAR "INV_LOOKUP" 00052 #define DTINDEX_PAR "DT_INDEX" 00053 #define DTLOOKUP_PAR "DT_LOOKUP" 00054 #define TERMIDMAP_PAR "TERMIDS" 00055 #define TERMIDSTRMAP_PAR "TERMIDSTRS" 00056 #define DOCIDMAP_PAR "DOCIDS" 00057 #define DOCIDSTRMAP_PAR "DOCIDSTRS" 00058 #define NUMDT_PAR "NUM_DTFILES" 00059 #define NUMINV_PAR "NUM_INVFILES" 00060 #define DOCMGR_PAR "DOCMGR_IDS" 00061 00062 struct LocatedTerm { // pair of term and its location 00063 lemur::api::TERMID_T term; 00064 lemur::api::LOC_T loc; 00065 }; 00066 00067 struct LLTerm { // pair of term and list of locations 00068 lemur::api::TERMID_T term; 00069 vector<lemur::api::LOC_T> loc; 00070 }; 00071 00072 struct dt_entry { // an entry in the lookup table for docterm lists index 00073 lemur::api::FILEID_T fileid; // which file the word is in 00074 long offset; // what the offset into the file is 00075 int length; // the length of the inverted list 00076 int docmgr; // the docmgr id of manager for this doc 00077 }; 00078 00079 struct inv_entry { // an entry in the lookup table for docterm lists index 00080 lemur::api::FILEID_T fileid; // which file the word is in 00081 long offset; // what the offset into the file is 00082 lemur::api::COUNT_T ctf; // collection term freq 00083 lemur::api::COUNT_T df; // doc freq 00084 }; 00085 00086 struct ltstr 00087 { 00088 bool operator()(char* s1, char* s2) const{ 00089 return strcmp(s1, s2) < 0; 00090 } 00091 }; 00092 } 00093 } 00094 00095 #endif