/*==========================================================================
 * Copyright (c) 2005 University of Massachusetts.  All Rights Reserved.
 *
 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
 * is subject to the terms of the software license set forth in the LICENSE
 * file included with this software, and also available at
 * http://www.lemurproject.org/license.html
 *
 *==========================================================================
 */


//
// ConflationPattern
//
// 15 September 2005 -- mwb
//

// Data structure to support tag-attribute-value conflations at
// parsing time.  To illustrate by example, consider these three
// tags encountered in the source document text:
//
// <TAG ... ATT1="VAL1" ... />
// <TAG ... ATT1="VAL2" ... />
// <TAG ... ATT2="VAL1" ... />
//
// The pattern { "tag", null, null } matches all three.
// The pattern { "tag", "att1", null } matches only the top two.
// The pattern { "tag", "att1", "VAL2" } matches only the middle one.
//
// Note that the pattern { "tag", null, "VAL2" } is not valid.
//
// These patterns are defined in FileClassEnvironmentFactory, and are
// passed to the ParserFactory when the FileClassEnvironment is
// constructed.  The Parser will replace any tags that match the
// pattern with a tag of a specified name that has no attributes.

#ifndef INDRI_CONFLATIONPATTERN_HPP
#define INDRI_CONFLATIONPATTERN_HPP

#include <string.h>
#include <functional>

namespace indri {
  namespace parse {

    // The tag_name and attribute_name strings in the
    // ConflationPattern should always be downcased, but value should
    // appear as it does in the source document.
    //
    // A NULL entry is interpreted as a wildcard.  The struct only
    // holds the three pointers; it does not copy or free the strings
    // they point to.

    struct ConflationPattern {
      const char* tag_name;
      const char* attribute_name;
      const char* value;
    };

  }
}

namespace std {

  // Ordering for ConflationPattern pointers that compares the
  // patterns pointed to rather than the pointer addresses, so that
  // ordered containers keyed on ConflationPattern* treat two distinct
  // objects holding the same strings as the same key.

  template <>
  struct less<indri::parse::ConflationPattern *> {

    // Return true if ConflationPattern one precedes ConflationPattern
    // two; false otherwise.  Both pointers must be non-NULL.
    //
    // First compare tag_name, then attribute_name, then value.
    // Comparison is lexical ordering according to strcmp.  Recall
    // that tag_name and attribute_name should always be downcased
    // in a ConflationPattern, so this leads to a case-insensitive
    // match.  A NULL entry in the pattern ( which is interpreted as
    // a wildcard ) always comes before a non-NULL entry, and two
    // NULL entries are equal to each other.
    //
    // { NULL, NULL, NULL } always comes first.
    // { x, NULL, NULL } always comes before { x, y, NULL }
    // { x, y, NULL } always comes before { x, y, z }

    bool operator() ( const indri::parse::ConflationPattern* one,
                      const indri::parse::ConflationPattern* two ) const {

      int r = _compareField( one->tag_name, two->tag_name );

      if ( r == 0 )
        r = _compareField( one->attribute_name, two->attribute_name );

      if ( r == 0 )
        r = _compareField( one->value, two->value );

      // If both ConflationPatterns are equal ( r == 0 ), neither
      // precedes the other.  This must hold even when the equal
      // fields are NULL: a comparator for which comp( p, p ) is true
      // is not a strict weak ordering, and ordered containers can
      // then neither find nor de-duplicate wildcard patterns.

      return ( r < 0 );
    }

  private:

    // Three-way comparison of a single pattern field.  Returns a
    // negative number if a orders before b, zero if the two are
    // equal, and a positive number if a orders after b.

    static int _compareField( const char* a, const char* b ) {
      if ( a && b ) return strcmp( a, b );
      if ( a ) return 1;    // only b is NULL, so b comes first
      if ( b ) return -1;   // only a is NULL, so a comes first
      return 0;             // both NULL: equal wildcards
    }
  };
}

#endif // INDRI_CONFLATIONPATTERN_HPP