Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

DateFieldAnnotator.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2007 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // DateFieldAnnotator
00015 //
00016 // 20 Mar 2007 -- dmf
00017 //
00018 //FIX the href if necessary.
00046 #ifndef INDRI_DATEFIELDANNOTATOR_HPP
00047 #define INDRI_DATEFIELDANNOTATOR_HPP
00048 #include "indri/DateParse.hpp"
00049 namespace indri
00050 {
00051   namespace parse
00052   {
00053 
00054     class DateFieldAnnotator : public Transformation {
00055     private:
00056       ObjectHandler<indri::api::ParsedDocument>* _handler;
00057       std::string& _field;
00058       
00059       void _parseDate(const std::string &date, TagExtent *extent) const
00060       {
00061         std::string day, month, year;
00062         // is it a slash-date, dash-date, or space-date, or a single number?
00063         if (extent->begin == extent->end-1) {
00064           // single number date YYYYMMDD
00065           year = date.substr( 0, 4 ); 
00066           month = date.substr( 4, 2 );
00067           day = date.substr( 6, 2 );
00068           extent->number = indri::parse::DateParse::convertDate( year, month, day );
00069         } else {
00070           bool swapMonth = false;
00071           std::string delim = "/";
00072           int firstDash;        
00073           int secondDash;
00074           if ((firstDash = date.find(delim)) == std::string::npos) {
00075             delim = "-";
00076             if ((firstDash = date.find(delim)) == std::string::npos) {
00077               delim = " ";
00078               if ((firstDash = date.find(delim)) == std::string::npos) 
00079                 // nothing to parse
00080                 return;
00081               else {
00082                 // space date is Month DD YYYY or DD Month YYYY
00083                 if (firstDash > 2) swapMonth = true;
00084               }
00085             }
00086           } else {
00087                 // slash date is MM/DD/YYYY
00088             swapMonth = true;
00089           }
00090 
00091           secondDash = date.find(delim, firstDash+1);
00092           day = date.substr( 0, firstDash ); 
00093           month = date.substr( firstDash+1, secondDash-firstDash-1 );
00094           year = date.substr( secondDash+1 );
00095           
00096           if (firstDash == 4)
00097             // YYYY-MM-DD or YYYY/MM/DD
00098             extent->number = indri::parse::DateParse::convertDate( day, month, year );
00099           else {
00100             // hack for 2 digit years in WSJ
00101             if (year.length() == 2) year = "19" + year;
00102             if (swapMonth)
00103               //  Month DD YYYY MM-DD-YY
00104               extent->number = indri::parse::DateParse::convertDate( year, day, month );
00105             else  
00106               extent->number = indri::parse::DateParse::convertDate( year, month, day );
00107           }
00108         }
00109       }
00110       
00111     public:
00112       DateFieldAnnotator( std::string& field ) :
00113         _handler(0), _field(field) {
00114       }
00115       
00116       ~DateFieldAnnotator() {
00117       }
00118 
00119       indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document ) {
00120         for( size_t i=0; i<document->tags.size(); i++ ) {
00121           TagExtent * extent = document->tags[i];
00122           // reparse from the document text
00123           if( _field == extent->name ) {
00124             int dateStart = document->positions[extent->begin].begin; 
00125             int dateEnd = document->positions[extent->end-1].end;
00126             int dateLen = dateEnd - dateStart ;
00127             std::string date;
00128             date.assign(document->text + dateStart, dateLen);
00129             _parseDate(date, extent);
00130           }
00131         }
00132         return document;
00133       }
00134 
00135       void setHandler( ObjectHandler<indri::api::ParsedDocument>& handler ) {
00136         _handler = &handler;
00137       }
00138 
00139       void handle( indri::api::ParsedDocument* document ) {
00140         _handler->handle( transform( document ) );
00141       }
00142     };
00143  
00144   }
00145 }
00146 
00147 #endif // INDRI_DATEFIELDANNOTATOR_HPP
00148 
00149 

Generated on Tue Jun 15 11:02:53 2010 for Lemur by doxygen 1.3.4