00001 /*========================================================================== 00002 * Copyright (c) 2004 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 // 00014 // DirichletTermScoreFunction 00015 // 00016 // 26 January 2004 - tds 00017 // 00018 00019 #ifndef INDRI_DIRICHLETTERMSCOREFUNCTION_HPP 00020 #define INDRI_DIRICHLETTERMSCOREFUNCTION_HPP 00021 00022 #include <math.h> 00023 namespace indri 00024 { 00026 namespace query 00027 { 00028 00029 class DirichletTermScoreFunction : public TermScoreFunction { 00030 private: 00031 double _mu; 00032 double _docmu; 00033 double _collectionFrequency; 00034 double _muTimesCollectionFrequency; 00035 00036 public: 00037 DirichletTermScoreFunction( double mu, double collectionFrequency, double docmu=-1.0 ) { 00038 _collectionFrequency = collectionFrequency; 00039 _mu = mu; 00040 _muTimesCollectionFrequency = _mu * _collectionFrequency; 00041 _docmu = docmu; 00042 } 00043 00044 double scoreOccurrence( double occurrences, int contextSize ) { 00045 double seen = ( double(occurrences) + _muTimesCollectionFrequency ) / ( double(contextSize) + _mu ); 00046 return log( seen ); 00047 } 00048 00049 double scoreOccurrence( double occurrences, int contextSize, double documentOccurrences, int documentLength ) { 00050 //two level Dir Smoothing! 00051 // tf_E + documentMu*P(t|D) 00052 //P(t|E)= ------------------------ 00053 // extentlen + documentMu 00054 // mu*P(t|C) + tf_D 00055 //where P(t|D)= --------------------- 00056 // doclen + mu 00057 // if the _docmu parameter is the default, do collection level 00058 // smoothing only. 00059 if (_docmu < 0) 00060 return scoreOccurrence(occurrences, contextSize); 00061 else { 00062 double seen = (occurrences+_docmu*(_muTimesCollectionFrequency+documentOccurrences)/(double(documentLength)+_mu))/(double(contextSize)+_docmu); 00063 return log(seen); 00064 } 00065 } 00066 }; 00067 } 00068 } 00069 00070 #endif // INDRI_DIRICHLETTERMSCOREFUNCTION_HPP