00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00046 #ifndef INDRI_DATEFIELDANNOTATOR_HPP
00047 #define INDRI_DATEFIELDANNOTATOR_HPP
00048 #include "indri/DateParse.hpp"
00049 namespace indri
00050 {
00051 namespace parse
00052 {
00053
00054 class DateFieldAnnotator : public Transformation {
00055 private:
00056 ObjectHandler<indri::api::ParsedDocument>* _handler;
00057 std::string& _field;
00058
00059 void _parseDate(const std::string &date, TagExtent *extent) const
00060 {
00061 std::string day, month, year;
00062
00063 if (extent->begin == extent->end-1) {
00064
00065 year = date.substr( 0, 4 );
00066 month = date.substr( 4, 2 );
00067 day = date.substr( 6, 2 );
00068 extent->number = indri::parse::DateParse::convertDate( year, month, day );
00069 } else {
00070 bool swapMonth = false;
00071 std::string delim = "/";
00072 int firstDash;
00073 int secondDash;
00074 if ((firstDash = date.find(delim)) == std::string::npos) {
00075 delim = "-";
00076 if ((firstDash = date.find(delim)) == std::string::npos) {
00077 delim = " ";
00078 if ((firstDash = date.find(delim)) == std::string::npos)
00079
00080 return;
00081 else {
00082
00083 if (firstDash > 2) swapMonth = true;
00084 }
00085 }
00086 } else {
00087
00088 swapMonth = true;
00089 }
00090
00091 secondDash = date.find(delim, firstDash+1);
00092 day = date.substr( 0, firstDash );
00093 month = date.substr( firstDash+1, secondDash-firstDash-1 );
00094 year = date.substr( secondDash+1 );
00095
00096 if (firstDash == 4)
00097
00098 extent->number = indri::parse::DateParse::convertDate( day, month, year );
00099 else {
00100
00101 if (year.length() == 2) year = "19" + year;
00102 if (swapMonth)
00103
00104 extent->number = indri::parse::DateParse::convertDate( year, day, month );
00105 else
00106 extent->number = indri::parse::DateParse::convertDate( year, month, day );
00107 }
00108 }
00109 }
00110
00111 public:
00112 DateFieldAnnotator( std::string& field ) :
00113 _handler(0), _field(field) {
00114 }
00115
00116 ~DateFieldAnnotator() {
00117 }
00118
00119 indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document ) {
00120 for( size_t i=0; i<document->tags.size(); i++ ) {
00121 TagExtent * extent = document->tags[i];
00122
00123 if( _field == extent->name ) {
00124 int dateStart = document->positions[extent->begin].begin;
00125 int dateEnd = document->positions[extent->end-1].end;
00126 int dateLen = dateEnd - dateStart ;
00127 std::string date;
00128 date.assign(document->text + dateStart, dateLen);
00129 _parseDate(date, extent);
00130 }
00131 }
00132 return document;
00133 }
00134
00135 void setHandler( ObjectHandler<indri::api::ParsedDocument>& handler ) {
00136 _handler = &handler;
00137 }
00138
00139 void handle( indri::api::ParsedDocument* document ) {
00140 _handler->handle( transform( document ) );
00141 }
00142 };
00143
00144 }
00145 }
00146
00147 #endif // INDRI_DATEFIELDANNOTATOR_HPP
00148
00149