00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef INDRI_CONFLATER_HPP
00025 #define INDRI_CONFLATER_HPP
00026
00027 #include <vector>
00028 #include <ctype.h>
00029 #include <string.h>
00030 #include "indri/Buffer.hpp"
00031 #include "indri/TagExtent.hpp"
00032 #include "indri/TagEvent.hpp"
00033 #include "indri/ConflationPattern.hpp"
00034 #include "indri/HashTable.hpp"
00035 #include "indri/AttributeValuePair.hpp"
00036
00037 namespace indri {
00038 namespace parse {
00039
00040 class Conflater {
00041
00042 private:
00043 indri::utility::Buffer _stringBuf;
00044 const char *_strdup(const char * token) {
00045 size_t token_len = strlen(token);
00046 char* write_loc = _stringBuf.write( token_len + 1 );
00047 memcpy( write_loc, token, token_len + 1 );
00048 return write_loc;
00049 }
00050
00051
00052 protected:
00053
00054 struct attribute_pattern {
00055 indri::utility::HashTable<const char*,const char*> values;
00056 bool match_attribute_name;
00057 const char* conflation;
00058 };
00059
00060 struct tag_pattern {
00061 indri::utility::HashTable<const char*,attribute_pattern*> attributes;
00062 bool match_tag_name;
00063 const char* conflation;
00064 };
00065
00066 indri::utility::HashTable<const char *,tag_pattern*> _pattern_table;
00067
00068
00069
00070
00071
00072 const char *_lookup( const char* name, indri::utility::greedy_vector<indri::parse::AttributeValuePair, 2>& attributes ) {
00073
00074 tag_pattern** p = _pattern_table.find( name );
00075 tag_pattern* p_tag_pattern;
00076
00077 if ( ! p ) return NULL;
00078
00079 p_tag_pattern = *p;
00080
00081
00082
00083
00084 attribute_pattern** q;
00085 attribute_pattern* p_attribute_pattern;
00086
00087 for ( indri::utility::greedy_vector<AttributeValuePair, 2>::iterator i =
00088 attributes.begin(); i != attributes.end(); i++ ) {
00089
00090 q = p_tag_pattern->attributes.find( (*i).attribute );
00091
00092 if ( q ) {
00093
00094 p_attribute_pattern = *q;
00095
00096
00097
00098
00099
00100 for ( indri::utility::HashTable<const char*,const char*>::iterator
00101 j = p_attribute_pattern->values.begin();
00102 j != p_attribute_pattern->values.end(); j++ ) {
00103
00104 if ( ! strcmp( *(*j).first, (*i).value ) ) {
00105
00106
00107 return *(*j).second;
00108 }
00109 }
00110
00111
00112
00113
00114
00115 if ( p_attribute_pattern->match_attribute_name ) {
00116
00117 return p_attribute_pattern->conflation;
00118 }
00119 }
00120 }
00121
00122
00123
00124
00125 if ( p_tag_pattern->match_tag_name ) {
00126
00127 return p_tag_pattern->conflation;
00128 }
00129 return NULL;
00130 }
00131
00132 public:
00133 Conflater( const std::map<ConflationPattern*,std::string>& conflations ) {
00134
00135 _stringBuf.grow(10*1024);
00136
00137
00138
00139
00140
00141
00142 for ( std::map<ConflationPattern*,std::string>::const_iterator i =
00143 conflations.begin(); i != conflations.end(); i++ ) {
00144
00145 const ConflationPattern* p_cp = (*i).first;
00146 std::string conflation = (*i).second;
00147
00148 tag_pattern** p =
00149 _pattern_table.find( p_cp->tag_name );
00150 tag_pattern* p_tag_pattern;
00151
00152
00153
00154 if ( ! p ) {
00155
00156 p_tag_pattern = new tag_pattern;
00157 p_tag_pattern->match_tag_name = false;
00158 _pattern_table.insert( p_cp->tag_name, p_tag_pattern );
00159
00160 } else {
00161
00162 p_tag_pattern = *p;
00163 }
00164
00165
00166
00167 if ( ( ! p_cp->attribute_name ) &&
00168 ( ! p_cp->value ) ) {
00169
00170 p_tag_pattern->match_tag_name = true;
00171 p_tag_pattern->conflation = _strdup(conflation.c_str());
00172 continue;
00173 }
00174
00175 attribute_pattern** q =
00176 p_tag_pattern->attributes.find( p_cp->attribute_name );
00177 attribute_pattern* p_attribute_pattern;
00178
00179
00180
00181
00182 if ( ! q ) {
00183
00184 p_attribute_pattern = new attribute_pattern;
00185 p_attribute_pattern->match_attribute_name = false;
00186 p_tag_pattern->attributes.insert( p_cp->attribute_name,
00187 p_attribute_pattern );
00188
00189 } else {
00190
00191 p_attribute_pattern = *q;
00192 }
00193
00194
00195
00196
00197 if ( ! p_cp->value ) {
00198
00199 p_attribute_pattern->match_attribute_name = true;
00200 p_attribute_pattern->conflation = _strdup(conflation.c_str());
00201 continue;
00202 }
00203
00204
00205
00206 p_attribute_pattern->values.insert( p_cp->value, _strdup(conflation.c_str()) );
00207
00208 }
00209 }
00210
00211 ~Conflater() {
00212
00213
00214
00215
00216
00217 for ( indri::utility::HashTable<const char*,tag_pattern*>
00218 ::iterator i = _pattern_table.begin();
00219 i != _pattern_table.end(); i++ ) {
00220
00221 tag_pattern* p_tag_pattern = *(*i).second;
00222
00223 for ( indri::utility::HashTable<const char*,attribute_pattern*>
00224 ::iterator j = p_tag_pattern->attributes.begin();
00225 j != p_tag_pattern->attributes.end(); j++ ) {
00226
00227 attribute_pattern* p_attribute_pattern = *(*j).second;
00228
00229 delete p_attribute_pattern;
00230 }
00231
00232 delete p_tag_pattern;
00233 }
00234 }
00235
00236
00237
00238
00239
00240
00241
00242 void conflate( TagEvent* tev ) {
00243
00244 const char *new_name = _lookup( tev->name, tev->attributes );
00245
00246 if ( new_name ) {
00247
00248 tev->name = new_name;
00249 tev->attributes.clear();
00250 }
00251 }
00252
00253 void conflate( TagExtent* tex ) {
00254
00255 const char *new_name = _lookup( tex->name, tex->attributes );
00256
00257 if ( new_name ) {
00258
00259 tex->name = new_name;
00260 tex->attributes.clear();
00261 }
00262 }
00263
00267 const char * conflate( const char* tagname ) {
00268 indri::utility::greedy_vector<AttributeValuePair,2> attributes;
00269 const char *new_name = _lookup( tagname, attributes );
00270
00271 if ( ! new_name ) {
00272 new_name = tagname;
00273 }
00274 return new_name;
00275 }
00276 };
00277 }
00278 }
00279
00280
00281 #endif // INDRI_CONFLATER_HPP