00001 /*========================================================================== 00002 * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 // 00014 // NumericFieldAnnotator 00015 // 00016 // 25 May 2004 -- tds 00017 // 00018 00019 #ifndef INDRI_NUMERICFIELDANNOTATOR_HPP 00020 #define INDRI_NUMERICFIELDANNOTATOR_HPP 00021 namespace indri 00022 { 00023 namespace parse 00024 { 00025 00026 class NumericFieldAnnotator : public Transformation { 00027 private: 00028 ObjectHandler<indri::api::ParsedDocument>* _handler; 00029 std::string& _field; 00030 bool _foundNonNumeric; 00031 // a buffer for copying the number 00032 int _numberCopyLength; 00033 char * _numberCopy; 00034 00035 public: 00036 NumericFieldAnnotator( std::string& field ) : 00037 _handler(0), 00038 _field(field), 00039 _foundNonNumeric(false), 00040 _numberCopyLength(1024) // should be large enough 00041 { 00042 _numberCopy = new char[ _numberCopyLength + 1 ]; 00043 } 00044 00045 ~NumericFieldAnnotator() { 00046 delete [] _numberCopy; 00047 } 00048 00049 indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document ) { 00050 for( size_t i=0; i<document->tags.size(); i++ ) { 00051 _foundNonNumeric = false; 00052 TagExtent * extent = document->tags[i]; 00053 00054 if( _field == extent->name && extent->begin != extent->end ) { 00055 char* numberText = document->terms[ extent->begin ]; 00056 // check for non-numeric characters 00057 char * begin = numberText; 00058 char * end = numberText; 00059 // find the first acceptable character 00060 for ( begin = numberText; *begin != '\0'; begin++ ) { 00061 if ( *begin == '-' || 00062 (*begin >= '0' && *begin <= '9') ) { 00063 break; 00064 } else { 00065 if ( _foundNonNumeric == false ) { 00066 _foundNonNumeric = true; 00067 } 00068 } 00069 } 00070 // find the last acceptable numeric character 00071 for ( end = begin; *end != '\0'; end++ ) { 00072 if (! ( *end == '-' || 00073 // *end == '.' || // for now, the recognizer only handles integers 00074 (*end >= '0' && *end <= '9') 00075 ) ) { 00076 break; 00077 } 00078 } 00079 INT64 value = 0; 00080 int len = end - begin; 00081 if ( len > 0 ) { 00082 // make a copy 00083 if ( len > _numberCopyLength ) { 00084 len = _numberCopyLength; 00085 } 00086 _numberCopy[ len ] = '\0'; 00087 strncpy( _numberCopy, begin, len ); 00088 // convert the number 00089 value = string_to_i64( _numberCopy ); 00090 } 00091 extent->number = value; 00092 } 00093 } 00094 00095 return document; 00096 } 00097 00098 void setHandler( ObjectHandler<indri::api::ParsedDocument>& handler ) { 00099 _handler = &handler; 00100 } 00101 00102 void handle( indri::api::ParsedDocument* document ) { 00103 _handler->handle( transform( document ) ); 00104 } 00105 }; 00106 00107 } 00108 } 00109 00110 #endif // INDRI_NUMERICFIELDANNOTATOR_HPP 00111 00112