Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

Conflater.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2005 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // Conflater
00015 //
00016 // 15 September 2005 -- mwb
00017 //
00018 
00019 // Simple wrapper for tag conflation code, so that it can be called
00020 // both from the Parser as it processes incoming text, and from the
00021 // OffsetAnnotationsAnnotator as it reads from the offset annotations
00022 // file.
00023 
00024 #ifndef INDRI_CONFLATER_HPP
00025 #define INDRI_CONFLATER_HPP
00026 
00027 #include <vector>
00028 #include <ctype.h>
00029 #include <string.h>
00030 #include "indri/Buffer.hpp"
00031 #include "indri/TagExtent.hpp"
00032 #include "indri/TagEvent.hpp"
00033 #include "indri/ConflationPattern.hpp"
00034 #include "indri/HashTable.hpp"
00035 #include "indri/AttributeValuePair.hpp"
00036 
00037 namespace indri {
00038   namespace parse {
00039 
00040     class Conflater {
00041 
00042     private:
00043       indri::utility::Buffer _stringBuf;
00044       const char *_strdup(const char * token) {
00045         size_t token_len = strlen(token);
00046         char* write_loc = _stringBuf.write( token_len + 1 );
00047         memcpy( write_loc, token, token_len + 1 );
00048         return write_loc;
00049       }
00050       
00051       
00052     protected:
00053 
00054       struct attribute_pattern {
00055         indri::utility::HashTable<const char*,const char*> values;
00056         bool match_attribute_name;   // true if pattern matches att name alone
00057         const char* conflation;      // must be set if above flag is true
00058       };
00059 
00060       struct tag_pattern {
00061         indri::utility::HashTable<const char*,attribute_pattern*> attributes;
00062         bool match_tag_name;         // true if pattern matches tag name alone
00063         const char* conflation;      // must be set if above flag is true
00064       };
00065 
00066       indri::utility::HashTable<const char *,tag_pattern*> _pattern_table;
00067       
00068       // The process of conflating a tag structure is O(nm) in the
00069       // number of attributes, n, and in the number of matching
00070       // attribute-value pattern, m.
00071 
00072       const char *_lookup( const char* name, indri::utility::greedy_vector<indri::parse::AttributeValuePair, 2>& attributes ) {
00073 
00074         tag_pattern** p = _pattern_table.find( name );
00075         tag_pattern* p_tag_pattern;
00076 
00077         if ( ! p ) return NULL; // No patterns match this tag name.
00078 
00079         p_tag_pattern = *p;
00080 
00081         // Now, we iterate through the attributes present in the
00082         // TagExtent to see if there are patterns that match any of them.
00083 
00084         attribute_pattern** q;
00085         attribute_pattern* p_attribute_pattern;
00086 
00087         for ( indri::utility::greedy_vector<AttributeValuePair, 2>::iterator i =
00088                 attributes.begin(); i != attributes.end(); i++ ) {
00089 
00090           q = p_tag_pattern->attributes.find( (*i).attribute );
00091 
00092           if ( q ) {
00093 
00094             p_attribute_pattern = *q;
00095 
00096             // There is a pattern that matches this attribute name, so
00097             // iterate through the values in the attribute_pattern to
00098             // see if they match.
00099             
00100             for ( indri::utility::HashTable<const char*,const char*>::iterator
00101                     j = p_attribute_pattern->values.begin();
00102                   j != p_attribute_pattern->values.end(); j++ ) {
00103 
00104               if ( ! strcmp( *(*j).first, (*i).value ) ) {
00105 
00106                 // We have an attribute-value match, so conflate tag.
00107                 return *(*j).second;
00108               }
00109             }
00110 
00111             // At this point, none of the values matched exactly.  Can
00112             // we still conflate based on the presence of the
00113             // attribute alone?
00114 
00115             if ( p_attribute_pattern->match_attribute_name ) {
00116 
00117               return p_attribute_pattern->conflation;
00118             }
00119           }
00120         }
00121 
00122         // At this point, none of the attributes matched exactly.  Can
00123         // we still conflate based on the name of the tag?
00124 
00125         if ( p_tag_pattern->match_tag_name ) {
00126 
00127           return p_tag_pattern->conflation;
00128         }
00129         return NULL;
00130       }
00131 
00132     public:
00133       Conflater( const std::map<ConflationPattern*,std::string>& conflations ) {
00134         //allocate some space for the strings:
00135         _stringBuf.grow(10*1024); // 10K should be plenty.
00136 
00137         // Build _pattern_table.  The assumption is that patterns for
00138         // tag names and attribute names are already downcased, so
00139         // that they will match the strings coming out of the
00140         // Tokenizer exactly.
00141 
00142         for ( std::map<ConflationPattern*,std::string>::const_iterator i =
00143                 conflations.begin(); i != conflations.end(); i++ ) {
00144 
00145           const ConflationPattern* p_cp = (*i).first;
00146           std::string conflation = (*i).second;
00147 
00148           tag_pattern** p =       
00149             _pattern_table.find( p_cp->tag_name );
00150           tag_pattern* p_tag_pattern;
00151 
00152           // Ensure tag_pattern exists for the tag_name we are dealing with.
00153 
00154           if ( ! p ) {
00155 
00156             p_tag_pattern = new tag_pattern;
00157             p_tag_pattern->match_tag_name = false;
00158             _pattern_table.insert( p_cp->tag_name, p_tag_pattern );
00159 
00160           } else {
00161 
00162             p_tag_pattern = *p;
00163           }
00164 
00165           // If the current ConflationPattern is a tag_name match only:
00166 
00167           if ( ( ! p_cp->attribute_name ) &&
00168                ( ! p_cp->value ) ) {
00169 
00170             p_tag_pattern->match_tag_name = true;
00171             p_tag_pattern->conflation = _strdup(conflation.c_str());
00172             continue;
00173           }
00174 
00175           attribute_pattern** q =
00176             p_tag_pattern->attributes.find( p_cp->attribute_name );
00177           attribute_pattern* p_attribute_pattern;
00178 
00179           // Ensure attribute_pattern exists for the attribute_name we
00180           // are dealing with:
00181 
00182           if ( ! q ) {
00183 
00184             p_attribute_pattern = new attribute_pattern;
00185             p_attribute_pattern->match_attribute_name = false;
00186             p_tag_pattern->attributes.insert( p_cp->attribute_name, 
00187                                               p_attribute_pattern );
00188 
00189           } else {
00190 
00191             p_attribute_pattern = *q;
00192           }
00193 
00194           // If the current ConflationPattern is an attribute_name match
00195           // only:
00196 
00197           if ( ! p_cp->value ) {
00198 
00199             p_attribute_pattern->match_attribute_name = true;
00200             p_attribute_pattern->conflation = _strdup(conflation.c_str());
00201             continue;
00202           }
00203           
00204           // Otherwise, it is an attribute-value match:
00205 
00206           p_attribute_pattern->values.insert( p_cp->value, _strdup(conflation.c_str()) );
00207 
00208         }       
00209       }
00210 
00211       ~Conflater() {
00212 
00213         // TODO: improve this by using delete_hash_table?
00214 
00215         // Clean up _pattern_table
00216 
00217         for ( indri::utility::HashTable<const char*,tag_pattern*>
00218                 ::iterator i = _pattern_table.begin(); 
00219               i != _pattern_table.end(); i++ ) {
00220 
00221           tag_pattern* p_tag_pattern = *(*i).second;
00222 
00223           for ( indri::utility::HashTable<const char*,attribute_pattern*>
00224                   ::iterator j = p_tag_pattern->attributes.begin();
00225                 j != p_tag_pattern->attributes.end(); j++ ) {
00226 
00227             attribute_pattern* p_attribute_pattern = *(*j).second;
00228 
00229             delete p_attribute_pattern;
00230           }
00231           
00232           delete p_tag_pattern;
00233         }
00234       }
00235 
00236       // Two convenient interfaces into the tag conflation algorithm.
00237 
00238       // When a tag is conflated, its attributes are wiped out
00239       // completely, and its tag name is changed to the conflation
00240       // string.
00241 
00242       void conflate( TagEvent* tev ) {
00243 
00244         const char *new_name = _lookup( tev->name, tev->attributes );
00245 
00246         if ( new_name ) {
00247 
00248           tev->name = new_name;
00249           tev->attributes.clear();
00250         }
00251       }
00252 
00253       void conflate( TagExtent* tex ) {
00254 
00255         const char *new_name = _lookup( tex->name, tex->attributes );
00256 
00257         if ( new_name ) {
00258 
00259           tex->name = new_name;
00260           tex->attributes.clear();
00261         }
00262       }
00263 
00267       const char * conflate( const char* tagname ) {
00268         indri::utility::greedy_vector<AttributeValuePair,2> attributes;
00269         const char *new_name = _lookup( tagname, attributes );
00270 
00271         if ( ! new_name ) {
00272           new_name = tagname;
00273         }
00274         return new_name;
00275       }
00276     };
00277   }
00278 }
00279 
00280 
00281 #endif // INDRI_CONFLATER_HPP

Generated on Tue Jun 15 11:02:53 2010 for Lemur by doxygen 1.3.4