00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef INDRI_TFIDFTERMSCOREFUNCTION_HPP
00020 #define INDRI_TFIDFTERMSCOREFUNCTION_HPP
00021 #include <iostream>
00022 #include "indri/TermScoreFunction.hpp"
00023 #include <math.h>
00024 namespace indri
00025 {
00026 namespace query
00027 {
00028
00029 class TFIDFTermScoreFunction : public TermScoreFunction {
00030 private:
00032 double _inverseDocumentFrequency;
00034 double _averageDocumentLength;
00035
00036 double _termWeight;
00037
00038
00039 double _k1;
00040 double _b;
00041
00042 double _k3;
00043
00044
00045 double _bOverAvgDocLength;
00046 double _k1TimesOneMinusB;
00047 double _idfTimesK1PlusOne;
00048 double _k1TimesBOverAvgDocLength;
00049 double _termWeightTimesIDFTimesK1;
00050 double _termWeightTimesidfTimesK1PlusOne;
00051 bool _okapi;
00052
00053 void _precomputeConstants() {
00054 _idfTimesK1PlusOne = _inverseDocumentFrequency * ( _k1 + 1 );
00055 _k1TimesOneMinusB = _k1 * (1-_b);
00056 _bOverAvgDocLength = _b / _averageDocumentLength;
00057 _k1TimesBOverAvgDocLength = _k1 * _bOverAvgDocLength;
00058 _termWeightTimesIDFTimesK1 = _termWeight * _inverseDocumentFrequency * _k1;
00059 _termWeightTimesidfTimesK1PlusOne = _termWeight * _idfTimesK1PlusOne;
00060 }
00061
00062 public:
00063 TFIDFTermScoreFunction( double idf, double averageDocumentLength, int qTF = 1, double k1 = 1.2, double b = 0.75, bool okapi = false, double k3 = 7 ) {
00064 _okapi = okapi;
00065 _inverseDocumentFrequency = idf;
00066 _averageDocumentLength = averageDocumentLength;
00067
00068 _k1 = k1;
00069 _b = b;
00070 _k3 = k3;
00071
00072
00073 _termWeight = queryTermWeight( 1000, 0, qTF ) / qTF;
00074 _precomputeConstants();
00075 }
00076
00077 TFIDFTermScoreFunction( double idf, double averageDocumentLength, double qtw = 1.0, double k1 = 1.2, double b = 0.75, bool okapi = false, double k3 = 7 ) {
00078 _okapi = okapi;
00079 _inverseDocumentFrequency = idf;
00080 _averageDocumentLength = averageDocumentLength;
00081
00082 _k1 = k1;
00083 _b = b;
00084 _k3 = k3;
00085
00086
00087 _termWeight = qtw;
00088
00089 _precomputeConstants();
00090 }
00091
00092 double scoreOccurrence( double occurrences, int documentLength ) {
00093 if (_okapi) {
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107 double numerator = _termWeightTimesidfTimesK1PlusOne * occurrences;
00108 double denominator = occurrences + _k1TimesOneMinusB + _k1TimesBOverAvgDocLength * documentLength;
00109 return numerator / denominator;
00110 } else {
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125 double numerator = _termWeightTimesIDFTimesK1 * occurrences;
00126 double denominator = occurrences + _k1TimesOneMinusB + _k1TimesBOverAvgDocLength * documentLength;
00127 return numerator / denominator;
00128 }
00129
00130 }
00131
00132 double scoreOccurrence( double occurrences, int contextSize, double documentOccurrences, int documentLength ) {
00133 return scoreOccurrence(occurrences, contextSize);
00134 }
00135
00136 double maximumScore( int minimumDocumentLength, int maximumOccurrences ){
00137 return scoreOccurrence( maximumOccurrences, minimumDocumentLength );
00138 }
00139
00140 double queryTermWeight( double queryK1, double queryB, double _qTF ) {
00141 if (_okapi)
00142 return (((_k3 + 1) * _qTF)/(_k3 + _qTF));
00143 else
00144
00145
00146
00147 return ( _inverseDocumentFrequency * queryK1 * _qTF ) / ( _qTF + queryK1 );
00148
00149
00150 }
00151 };
00152 }
00153 }
00154
00155 #endif // INDRI_TFIDFTERMSCOREFUNCTION_HPP
00156