Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

OffsetMetadataAnnotator.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2005 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // OffsetMetadataAnnotator
00015 //
00016 // 3 November 2005 -- jcb 
00017 //
00018 // Reads supplied offset metadata file and adds the metadata to
00019 // the parsed document.
00020 //
00021 
00022 // Format of the offset metadata file: 3-column, tab-delimited.
00023 // From left-to-right, those columns are: 
00024 //
00025 //   docno     : external doc id for document to annotate (string) (e.g. 10) 
00026 //
00027 //   key       : the key/name of the metadata element (string) (e.g. origURL)
00028 //
00029 //   value     : the value of the metadata element (string) (e.g. http://bla) 
00030 
00031 // While the OffsetMetadataAnnotator is transforming the
00032 // ParsedDocument, it will directly operate on the data structures
00033 // just as if it were the Parser, except that it adds metadata from
00034 // its file as opposed to from the original TokenizedDocument text.
00035 
00036 #ifndef INDRI_OFFSETMETADATAANNOTATOR_HPP
00037 #define INDRI_OFFSETMETADATAANNOTATOR_HPP
00038 
00039 #include "indri/Transformation.hpp"
00040 #include "indri/MetadataPair.hpp"
00041 #include "indri/ParsedDocument.hpp"
00042 #include "indri/HashTable.hpp"
00043 #include "indri/greedy_vector"
00044 #include <iostream>
00045 #include <vector>
00046 #include <string.h>
00047 #include <string>
00048 
00049 namespace indri {
00050   namespace parse {
00051 
00052     class OffsetMetadataAnnotator : public Transformation {
00053 
00054     private:
00055       std::string _offsetMetadataFile;
00056 
00057       indri::utility::HashTable<const char *,indri::utility::greedy_vector<MetadataPair*>*> _annotations;
00058       std::vector<char *> _buffers_allocated;
00059       bool _first_open;
00060 
00061       ObjectHandler<indri::api::ParsedDocument>* _handler;
00062       
00063       const char *_getDocno( indri::api::ParsedDocument* document ) {
00064         //Find DOCNO attribute in document
00065         for( size_t i=0; i<document->metadata.size(); i++ ) {
00066           const char* attributeName = document->metadata[i].key;
00067           const char* attributeValue = (const char*) document->metadata[i].value;
00068 
00069           if( ! strcmp( attributeName, "docno" ) ) return attributeValue;
00070         }
00071         return NULL;
00072       }
00073 
00074 
00075       void _cleanup() {
00076         //Cleanup _annotations in preparation for object destruction,
00077         // or for an open call on a new offset metadata file.
00078         for( indri::utility::HashTable<const char *,indri::utility::greedy_vector<MetadataPair*>*>::iterator i = _annotations.begin(); 
00079              i != _annotations.end(); i++ ) {
00080           indri::utility::greedy_vector<MetadataPair*>* p_vec = *(*i).second;
00081           for( indri::utility::greedy_vector<MetadataPair*>::iterator j = 
00082                   p_vec->begin(); j != p_vec->end(); j++ ) {
00083             delete (*j); //MetadataPair 
00084           }
00085         }
00086         _annotations.clear();
00087       }
00088 
00089     public:
00090       OffsetMetadataAnnotator() {
00091         _handler = NULL;
00092         _first_open = true;
00093       }
00094 
00095       ~OffsetMetadataAnnotator() {
00096         _cleanup();
00097 
00098         for ( std::vector<char *>::iterator i = _buffers_allocated.begin(); i != _buffers_allocated.end(); i++ )
00099           delete[] (*i);
00100       }
00101 
00102       void setHandler( ObjectHandler<indri::api::ParsedDocument>& handler ) {
00103         _handler = &handler;
00104       }
00105 
00106       void handle( indri::api::ParsedDocument* document ) {
00107         _handler->handle( transform( document ) );
00108       }
00109 
00110       // Defined in OffsetMetadataAnnotator.cpp
00111       void open( const std::string& offsetMetadataFile );
00112       indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document );
00113 
00114     };
00115 
00116   }
00117 }
00118 
00119 #endif // INDRI_OFFSETMETADATAANNOTATOR_HPP
00120 

Generated on Tue Jun 15 11:02:54 2010 for Lemur by doxygen 1.3.4