/*==========================================================================
 * Copyright (c) 2003-2005 University of Massachusetts.  All Rights Reserved.
 *
 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
 * is subject to the terms of the software license set forth in the LICENSE
 * file included with this software, and also available at
 * http://www.lemurproject.org/license.html
 *
 *==========================================================================
 */


//
// OffsetMetadataAnnotator
//
// 3 November 2005 -- jcb
//
// Reads supplied offset metadata file and adds the metadata to
// the parsed document.
//

// Format of the offset metadata file: 3-column, tab-delimited.
// From left to right, those columns are:
//
//   docno : external doc id for document to annotate (string) (e.g. 10)
//
//   key   : the key/name of the metadata element (string) (e.g. origURL)
//
//   value : the value of the metadata element (string) (e.g. http://bla)

// While the OffsetMetadataAnnotator is transforming the
// ParsedDocument, it will directly operate on the data structures
// just as if it were the Parser, except that it adds metadata from
// its file as opposed to from the original TokenizedDocument text.
00035 00036 #ifndef INDRI_OFFSETMETADATAANNOTATOR_HPP 00037 #define INDRI_OFFSETMETADATAANNOTATOR_HPP 00038 00039 #include "indri/Transformation.hpp" 00040 #include "indri/MetadataPair.hpp" 00041 #include "indri/ParsedDocument.hpp" 00042 #include "indri/HashTable.hpp" 00043 #include "indri/greedy_vector" 00044 #include <iostream> 00045 #include <vector> 00046 #include <string.h> 00047 #include <string> 00048 00049 namespace indri { 00050 namespace parse { 00051 00052 class OffsetMetadataAnnotator : public Transformation { 00053 00054 private: 00055 std::string _offsetMetadataFile; 00056 00057 indri::utility::HashTable<const char *,indri::utility::greedy_vector<MetadataPair*>*> _annotations; 00058 std::vector<char *> _buffers_allocated; 00059 bool _first_open; 00060 00061 ObjectHandler<indri::api::ParsedDocument>* _handler; 00062 00063 const char *_getDocno( indri::api::ParsedDocument* document ) { 00064 //Find DOCNO attribute in document 00065 for( size_t i=0; i<document->metadata.size(); i++ ) { 00066 const char* attributeName = document->metadata[i].key; 00067 const char* attributeValue = (const char*) document->metadata[i].value; 00068 00069 if( ! strcmp( attributeName, "docno" ) ) return attributeValue; 00070 } 00071 return NULL; 00072 } 00073 00074 00075 void _cleanup() { 00076 //Cleanup _annotations in preparation for object destruction, 00077 // or for an open call on a new offset metadata file. 
00078 for( indri::utility::HashTable<const char *,indri::utility::greedy_vector<MetadataPair*>*>::iterator i = _annotations.begin(); 00079 i != _annotations.end(); i++ ) { 00080 indri::utility::greedy_vector<MetadataPair*>* p_vec = *(*i).second; 00081 for( indri::utility::greedy_vector<MetadataPair*>::iterator j = 00082 p_vec->begin(); j != p_vec->end(); j++ ) { 00083 delete (*j); //MetadataPair 00084 } 00085 } 00086 _annotations.clear(); 00087 } 00088 00089 public: 00090 OffsetMetadataAnnotator() { 00091 _handler = NULL; 00092 _first_open = true; 00093 } 00094 00095 ~OffsetMetadataAnnotator() { 00096 _cleanup(); 00097 00098 for ( std::vector<char *>::iterator i = _buffers_allocated.begin(); i != _buffers_allocated.end(); i++ ) 00099 delete[] (*i); 00100 } 00101 00102 void setHandler( ObjectHandler<indri::api::ParsedDocument>& handler ) { 00103 _handler = &handler; 00104 } 00105 00106 void handle( indri::api::ParsedDocument* document ) { 00107 _handler->handle( transform( document ) ); 00108 } 00109 00110 // Defined in OffsetMetadataAnnotator.cpp 00111 void open( const std::string& offsetMetadataFile ); 00112 indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document ); 00113 00114 }; 00115 00116 } 00117 } 00118 00119 #endif // INDRI_OFFSETMETADATAANNOTATOR_HPP 00120