00001 /*========================================================================== 00002 * Copyright (c) 2000-2004 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software (and below), and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 #ifndef _DOCFREQINDEXER_HPP 00013 #define _DOCFREQINDEXER_HPP 00014 00015 #include "TextHandler.hpp" 00016 #include "PushIndex.hpp" 00017 #include "InvFPTermList.hpp" 00018 #include "Parser.hpp" 00019 #include "WordSet.hpp" 00020 00021 #include <stdio.h> 00022 00023 00024 namespace lemur 00025 { 00026 namespace distrib 00027 { 00028 00029 class DocFreqIndexer : public lemur::api::TextHandler { 00030 00031 public: 00032 DocFreqIndexer(const string &csName, const string &cwName, 00033 const string &ssName, int bufferSize, 00034 bool countStopWords = false); 00035 ~DocFreqIndexer(); 00036 00037 char * handleDoc(char * docno); 00038 char * handleWord(char * word); 00039 void handleEndDoc(); 00040 00041 void newDb(const string &name); 00042 00043 00044 00045 private: 00046 00047 int cw; 00048 int dfCount; 00049 bool first; 00050 00051 lemur::index::PushIndex * collsel; 00052 00053 lemur::parse::DocumentProps * csdp; 00054 lemur::index::InvFPTerm * term; 00055 00056 lemur::utility::WordSet docWords; 00057 00058 FILE * collWords; 00059 FILE * serverSizes; 00060 int numDocs; 00061 00062 bool countStopWds; 00063 00064 }; 00065 } 00066 } 00067 00068 #endif