Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

NumericFieldAnnotator.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // NumericFieldAnnotator
00015 //
00016 // 25 May 2004 -- tds
00017 //
00018 
00019 #ifndef INDRI_NUMERICFIELDANNOTATOR_HPP
00020 #define INDRI_NUMERICFIELDANNOTATOR_HPP
00021 namespace indri
00022 {
00023   namespace parse
00024   {
00025 
00026     class NumericFieldAnnotator : public Transformation {
00027     private:
00028       ObjectHandler<indri::api::ParsedDocument>* _handler;
00029       std::string& _field;
00030       bool _foundNonNumeric;
00031       // a buffer for copying the number
00032       int _numberCopyLength;
00033       char * _numberCopy;
00034       
00035     public:
00036       NumericFieldAnnotator( std::string& field ) :
00037         _handler(0),
00038         _field(field),
00039         _foundNonNumeric(false),
00040         _numberCopyLength(1024) // should be large enough
00041       {
00042         _numberCopy = new char[ _numberCopyLength + 1 ];
00043       }
00044       
00045       ~NumericFieldAnnotator() {
00046         delete [] _numberCopy;
00047       }
00048 
00049       indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document ) {
00050         for( size_t i=0; i<document->tags.size(); i++ ) {
00051           _foundNonNumeric = false;
00052           TagExtent * extent = document->tags[i];
00053 
00054           if( _field == extent->name && extent->begin != extent->end ) {
00055             char* numberText = document->terms[ extent->begin ]; 
00056             // check for non-numeric characters
00057             char * begin = numberText;
00058             char * end = numberText;
00059             // find the first acceptable character 
00060             for ( begin = numberText; *begin != '\0'; begin++ ) {
00061               if ( *begin == '-' ||
00062                    (*begin >= '0' && *begin <= '9') ) {
00063                 break;
00064               } else {
00065                 if ( _foundNonNumeric == false ) {
00066                   _foundNonNumeric = true;
00067                 }
00068               }
00069             }
00070             // find the last acceptable numeric character
00071             for ( end = begin; *end != '\0'; end++ ) {
00072               if (! ( *end == '-' ||
00073                       // *end == '.' || // for now, the recognizer only handles integers
00074                       (*end >= '0' && *end <= '9') 
00075                       ) ) {
00076                 break;
00077               }
00078             }
00079             INT64 value = 0;
00080             int len = end - begin;
00081             if ( len > 0 ) {
00082               // make a copy
00083               if ( len > _numberCopyLength ) {
00084                 len = _numberCopyLength;
00085               }
00086               _numberCopy[ len ] = '\0';
00087               strncpy( _numberCopy, begin, len );
00088               // convert the number
00089               value = string_to_i64( _numberCopy );
00090             }
00091             extent->number = value;
00092           }
00093         }
00094 
00095         return document;
00096       }
00097 
00098       void setHandler( ObjectHandler<indri::api::ParsedDocument>& handler ) {
00099         _handler = &handler;
00100       }
00101 
00102       void handle( indri::api::ParsedDocument* document ) {
00103         _handler->handle( transform( document ) );
00104       }
00105     };
00106  
00107   }
00108 }
00109 
00110 #endif // INDRI_NUMERICFIELDANNOTATOR_HPP
00111 
00112 

Generated on Tue Jun 15 11:02:54 2010 for Lemur by doxygen 1.3.4