00001 /*========================================================================== 00002 * Copyright (c) 2004 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 // 00014 // TwoStageTermScoreFunction.hpp 00015 // 00016 // 16 April 2004 -- tds 00017 // 00018 00019 #ifndef INDRI_TWOSTAGETERMSCOREFUNCTION_HPP 00020 #define INDRI_TWOSTAGETERMSCOREFUNCTION_HPP 00021 namespace indri 00022 { 00023 namespace query 00024 { 00025 00026 class TwoStageTermScoreFunction : public TermScoreFunction { 00027 private: 00028 double _mu; 00029 double _lambda; 00030 double _collectionFrequency; 00031 00032 public: 00033 TwoStageTermScoreFunction( double mu, double lambda, double collectionFrequency ) : 00034 _mu(mu), 00035 _lambda(lambda), 00036 _collectionFrequency(collectionFrequency) { 00037 } 00038 00039 double scoreOccurrence( double occurrences, int contextSize ) { 00040 00041 // [ c(w;d) + \mu * p(w|C) ] 00042 // ( 1 - \lambda ) [ ------------------------ ] + \lambda * p(w|C) 00043 // [ |d| + \mu ] 00044 00045 double dirichlet = ((double(occurrences) + _mu*_collectionFrequency) / (double(contextSize) + _mu)); 00046 double p = ( 1-_lambda ) * dirichlet + _lambda * _collectionFrequency; 00047 return log(p); 00048 } 00049 00050 double scoreOccurrence( double occurrences, int contextSize, double documentOccurrences, int documentLength ) { 00051 double documentFrequency = double(documentOccurrences) / double(documentLength); 00052 double dirichlet = ((double(occurrences) + _mu*documentFrequency) / (double(contextSize) + _mu)); 00053 double p = ( 1-_lambda ) * dirichlet + _lambda * _collectionFrequency; 00054 return log(p); 00055 } 00056 }; 00057 } 00058 } 00059 00060 #endif // INDRI_TWOSTAGETERMSCOREFUNCTION_HPP 00061