Lemur: OffsetAnnotationAnnotator.hpp Source File

00001 /*==========================================================================
00002  * Copyright (c) 2003-2005 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // OffsetAnnotationAnnotator
00015 //
00016 // 18 September 2005 -- mwb
00017 //
00018 // Reads a supplied offset annotation file and adds the annotations to
00019 // the parsed document.
00020 //
00021 
00022 // Format of the offset annotation file: tab-delimited, 8 columns plus a
00023 // trailing debug column that is ignored.  From left-to-right, the columns are:
00024 //
00025 //   docno     : external document id corresponding to the document in
00026 //             : which the annotation occurs.
00027 //
00028 //   type      : TAG or ATTRIBUTE
00029 //
00030 //   id        : an id number for the annotation; each line should have a
00031 //             : unique id >= 1.
00032 //
00033 //   name      : for TAG, name or type of the annotation
00034 //             : for ATTRIBUTE, the attribute name, or key
00035 //
00036 //   start     : start and length define the annotation's extent;
00037 //   length    : meaningless for an ATTRIBUTE
00038 //
00039 //   value     : for TAG, an INT64
00040 //             : for ATTRIBUTE, a string that is the attribute's value
00041 //
00042 //   parentid  : for TAG, refers to the id number of another TAG to be
00043 //             : considered the parent of this one; this is how hierarchical
00044 //             : annotations can be expressed.
00045 //             : a TAG that has no parent has parentid = 0
00046 //             : for ATTRIBUTE, refers to the id number of a TAG to which
00047 //             : it belongs and from which it inherits its start and length.
00048 //             : *NOTE: the file must be sorted such that any line that uses 
00049 //             : a given id in this column must be *after* the line that 
00050 //             : uses that id in the id column.
00051 //
00052 //   debug     : ignored by the OffsetAnnotationAnnotator; can contain any information
00053 //             : that is beneficial to a human reading the file
00054 
00055 // While the OffsetAnnotationAnnotator is transforming the
00056 // ParsedDocument, it will directly operate on the data structures
00057 // just as if it were the Parser, except that it adds annotations from
00058 // its file as opposed to from the original TokenizedDocument text.
00059 
00060 #ifndef INDRI_OFFSETANNOTATIONANNOTATOR_HPP
00061 #define INDRI_OFFSETANNOTATIONANNOTATOR_HPP
00062 
00063 #include <iostream>
00064 #include <fstream>
00065 #include <vector>
00066 #include <string>
00067 #include <set>
00068 #include <utility>
00069 
00070 #include "indri/Buffer.hpp"
00071 #include "indri/Transformation.hpp"
00072 #include "indri/TagExtent.hpp"
00073 #include "indri/ParsedDocument.hpp"
00074 #include "indri/HashTable.hpp"
00075 #include "indri/greedy_vector"
00076 #include "indri/Conflater.hpp"
00077 
00078 namespace indri {
00079   namespace parse {
00080 
00081     enum OffsetAnnotationIndexHint { // hint type accepted by OffsetAnnotationAnnotator::setHint() and held in its _indexHintType member
00082       OAHintDefault,             // NOTE(review): exact behavior is defined outside this header -- confirm
00083       OAHintOrderedAnnotations,  // presumably: the annotation file is ordered so tags can be read per document (see readAnnotationTags) -- confirm
00084       OAHintSizeBuffers,         // presumably: related to buffer sizing (see lastBufferAllocationSize) -- confirm
00085       OAHintNone                 // presumably: no hint given -- confirm
00086     };
00087 
00088     class OffsetAnnotationAnnotator : public Transformation {
00089       // Transformation that reads a supplied offset annotation file and adds its annotations to parsed documents.
00090     private:
00091       // NOTE(review): the field-to-column mapping noted below is inferred from the field names and the file-format notes at the top of this file -- confirm.
00092       // One parsed line of the annotation file; this is the type returned by parseLine() and stored in lastReadTag.
00093       struct ReadAnnotationTag {
00094         char *docno;     // presumably the "docno" column: external document id
00095         char *name;      // presumably the "name" column: TAG name/type, or ATTRIBUTE key
00096         char *s_value;   // presumably the "value" column as a string (ATTRIBUTE) -- confirm
00097         int type; // TAG = 1, ATTRIBUTE = 2
00098         UINT64 id;       // presumably the "id" column
00099         UINT64 i_value;  // presumably the "value" column as a number (TAG); NOTE(review): the format notes say INT64, this is UINT64 -- confirm
00100         UINT64 parent;   // presumably the "parentid" column; per the format notes, 0 means a TAG has no parent
00101         int start;       // presumably the "start" column; start and length define the annotation's extent
00102         int length;      // presumably the "length" column
00103       };
00104 
00105       // index hint; presumably assigned through setHint() -- confirm
00106       OffsetAnnotationIndexHint _indexHintType;
00107 
00108       // path to the offset annotation file
00109       std::string _offsetAnnotationsFile;
00110 
00111       // holds the size of the last allocation of the buffers
00112       // to see if we should re-size it
00113       int lastBufferAllocationSize;
00114 
00115       // Before the actual ParsedDocument is read in, we can not
00116       // convert byte extents from the .oa file to token extents.  The
00117       // TagExtents in this table have their begin and end values
00118       // expressed as byte extents, not token extents.
00119       indri::utility::HashTable<const char *,std::set<TagExtent*>*> *_annotations;
00120 
00121       // After a document's set of annotations has been converted
00122       // to token extents, we store the result in this table in case
00123       // someone asks for that same document's annotations again.
00124       indri::utility::HashTable<const char *,std::set<TagExtent*>*> *_converted_annotations;
00125       // lookup tables from a UINT64 key (presumably the annotation id; see _getTag / _getAttribute) to the tag or attribute
00126       indri::utility::HashTable<UINT64,TagExtent*>          *_tag_id_map;
00127       indri::utility::HashTable<UINT64,AttributeValuePair*> *_attribute_id_map;
00128 
00129       // vector of stored buffers here to allow for
00130       // end cleanup of allocated char* structures
00131       std::vector<char *> *_buffers_allocated;
00132 
00133       bool _first_open; // NOTE(review): presumably tracks whether open() has been called before -- confirm
00134 
00135       ObjectHandler<indri::api::ParsedDocument>* _handler; // presumably set by setHandler() -- confirm
00136       Conflater* _p_conflater; // presumably set by the Conflater* constructor or setConflater() -- confirm
00137 
00138       std::ifstream annotationFile; // input stream; presumably the annotation file being read -- confirm
00139                         int offsetAnnotationFileLine; // presumably the current line number within the annotation file -- confirm
00140 
00141       // private helpers, declared only; NOTE(review): the notes below are inferred from names and signatures -- confirm against the implementation
00142       const char *_getDocno( indri::api::ParsedDocument* document ); // presumably returns the docno of the given document
00143       TagExtent *_getTag( UINT64 id ); // presumably looks up a tag by id (see _tag_id_map)
00144       AttributeValuePair *_getAttribute( UINT64 id ); // presumably looks up an attribute by id (see _attribute_id_map)
00145       bool _is_unique_id( UINT64 id, int line ); // presumably checks that id has not been used before; line is presumably for diagnostics
00146       UINT64 parse_UINT64( const char *str, int n ); // presumably parses up to n characters of str as a UINT64
00147       void _cleanup();
00148       // presumably parses one line of the annotation file; lineCounter is presumably the line number for diagnostics
00149       ReadAnnotationTag parseLine(char *readLine, int lineCounter);
00150 
00151                         // holds the last annotation tag that was read and parsed
00152                         // (used for ordered file parsing)
00153                         ReadAnnotationTag lastReadTag;
00154 
00155                         // reads in the tags for a document - used for ordered file parsing
00156                         void readAnnotationTags(const char *docno);
00157       // presumably converts raw_tags (byte extents) into converted_tags (token extents) for document; see the notes on _annotations -- confirm
00158       void convert_annotations( std::set<indri::parse::TagExtent*>* raw_tags,
00159                                 std::set<indri::parse::TagExtent*>* converted_tags, 
00160                                 indri::api::ParsedDocument* document );
00161 
00162     public:
00164       OffsetAnnotationAnnotator( Conflater* p_conflater ); // constructs with a conflater (presumably kept in _p_conflater)
00165       OffsetAnnotationAnnotator();
00166       ~OffsetAnnotationAnnotator(); // NOTE(review): the class holds raw pointers and declares no copy operations here -- confirm ownership and copy safety
00167 
00168       void setTags (const char *docno, const std::vector<indri::parse::TagExtent *> &tagset); // presumably supplies tagset as the annotations for docno -- confirm
00169       
00170       void setConflater(Conflater* p_conflater);
00171       void setHandler( ObjectHandler<indri::api::ParsedDocument>& handler );
00172       void handle( indri::api::ParsedDocument* document ); // NOTE(review): presumably transforms document and passes it on to _handler -- confirm
00173 
00174       void setHint(indri::parse::OffsetAnnotationIndexHint hintType);
00175 
00176       void open( const std::string& offsetAnnotationsFile ); // takes the path of the offset annotation file; presumably kept in _offsetAnnotationsFile -- confirm
00177       indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document ); // adds annotations from the file to document (see the notes at the top of this file)
00178 
00179     };
00180 
00181   }
00182 }
00183 
00184 #endif // INDRI_OFFSETANNOTATIONANNOTATOR_HPP
00185