/*==========================================================================
 * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
 *
 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
 * is subject to the terms of the software license set forth in the LICENSE
 * file included with this software, and also available at
 * http://www.lemurproject.org/license.html
 *
 *==========================================================================
 */


//
// ContextSimpleCountCollectorCopier
//
// 5 March 2004 -- tds
//
// This copier uses an IndriIndex to extract context
// counts for certain simple subgraphs.  It can compute
// counts for the following types of expressions:
//
//   dog
//   <dog cat>
//   dog.title
//   <dog cat>.title
//   dog.(title)
//   <dog cat>.(title)
//
// Notably, it is unable to compute counts when more than
// one field is involved.
00031 // 00032 00033 #ifndef INDRI_CONTEXTSIMPLECOUNTCOLLECTORCOPIER_HPP 00034 #define INDRI_CONTEXTSIMPLECOUNTCOLLECTORCOPIER_HPP 00035 00036 #include "indri/QuerySpec.hpp" 00037 #include "indri/Copier.hpp" 00038 #include "indri/delete_range.hpp" 00039 #include "indri/Repository.hpp" 00040 00041 namespace indri 00042 { 00043 namespace lang 00044 { 00045 class ContextSimpleCountCollectorCopier : public indri::lang::Copier { 00046 private: 00047 std::vector<indri::lang::Node*> _newNodes; 00048 indri::collection::Repository& _repository; 00049 00050 class SubtreeWalker : public indri::lang::Walker { 00051 private: 00052 bool _computable; 00053 bool _hasContext; 00054 00055 std::vector<indri::lang::IndexTerm*> _terms; 00056 indri::lang::Field* _field; 00057 00058 public: 00059 SubtreeWalker() : 00060 _computable(true), 00061 _field(0) 00062 { 00063 } 00064 00065 bool isComputable() { 00066 return _computable && _terms.size(); 00067 } 00068 00069 std::vector<indri::lang::IndexTerm*>& getTerms() { 00070 return _terms; 00071 } 00072 00073 indri::lang::Field* getField() { 00074 return _field; 00075 } 00076 00077 bool hasContext() const { 00078 return _hasContext; 00079 } 00080 00081 void defaultBefore( indri::lang::Node* node ) { 00082 // this means that we're seeing some node type that 00083 // we aren't otherwise trapping--that means this subtree 00084 // is surely not precomputable 00085 _computable = false; 00086 } 00087 00088 void before( indri::lang::ContextCounterNode* contextNode ) { 00089 // if the context node has a context, then it must have a field in the context 00090 // if we find more than one field, we say this isn't computable. Therefore, if 00091 // this subtree is computable and it has a context, the single field must be in the context. 00092 _hasContext = contextNode->getContext() ? 
true : false; 00093 } 00094 00095 00096 void before( indri::lang::ExtentAnd* extentAndNode ) { 00097 // we definitely can't deal with any "true" extentAnds 00098 // however, if this is just an and wrapper around a single 00099 // field, we won't let it fool us 00100 if( extentAndNode->getChildren().size() > 1 ) 00101 _computable = false; 00102 } 00103 00104 void before( indri::lang::Field* fieldNode ) { 00105 if( _field ) { 00106 // fields can't be or-ed together; only terms can (_extentOr) 00107 // If we already saw a field, then this one proves that the tree isn't computable (_field) 00108 _computable = false; 00109 } 00110 00111 _field = fieldNode; 00112 } 00113 00114 void before( indri::lang::IndexTerm* termNode ) { 00115 _terms.push_back(termNode); 00116 } 00117 00118 void before( indri::lang::ExtentInside* insideNode ) { 00119 // ignore this; the other checks should catch any bad trees 00120 // without having to worry about checking here 00121 } 00122 }; 00123 00124 public: 00125 ContextSimpleCountCollectorCopier( indri::collection::Repository& repository ) : 00126 _repository(repository) 00127 { 00128 } 00129 00130 ~ContextSimpleCountCollectorCopier() { 00131 indri::utility::delete_vector_contents<indri::lang::Node*>( _newNodes ); 00132 } 00133 00134 indri::lang::Node* defaultAfter( indri::lang::Node* oldNode, indri::lang::Node* newNode ) { 00135 _newNodes.push_back( newNode ); 00136 return newNode; 00137 } 00138 00139 indri::lang::Node* after( indri::lang::ContextCounterNode* contextNode, indri::lang::ContextCounterNode* newNode ) { 00140 // first, walk the subtree to find out if it's computable 00141 SubtreeWalker subtree; 00142 contextNode->walk(subtree); 00143 indri::lang::Node* result = newNode; 00144 00145 if( subtree.isComputable() ) { 00146 // terms 00147 std::vector<std::string> terms; 00148 for( size_t i=0; i<subtree.getTerms().size(); i++ ) { 00149 indri::lang::IndexTerm* indexTerm = subtree.getTerms()[i]; 00150 std::string term; 00151 00152 if( 
indexTerm->getStemmed() == false ) 00153 term = _repository.processTerm( indexTerm->getText() ); 00154 else 00155 term = indexTerm->getText(); 00156 00157 terms.push_back( term ); 00158 } 00159 00160 std::string field; 00161 std::string context; 00162 00163 if( subtree.hasContext() ) { 00164 context = subtree.getField()->getFieldName(); 00165 } else if( subtree.getField() ) { 00166 field = subtree.getField()->getFieldName(); 00167 } 00168 00169 result = new indri::lang::ContextSimpleCounterNode( terms, field, context ); 00170 result->setNodeName( contextNode->nodeName() ); 00171 delete newNode; 00172 } 00173 00174 // if it wasn't computable, keep the subtree around so the 00175 // inference network code can run it and figure out the counts 00176 _newNodes.push_back( result ); 00177 return result; 00178 } 00179 }; 00180 } 00181 } 00182 00183 #endif // INDRI_CONTEXTSIMPLECOUNTCOLLECTORCOPIER_HPP 00184