Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

TagList.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // TagList
00015 //
00016 // March 2004 -- metzler
00017 //
00018 
00019 #include "indri/Tag.hpp"
00020 #include <stdio.h>
00021 #include <string.h>
00022 #include <indri/greedy_vector>
00023 #include "indri/TagExtent.hpp"
00024 #include <iostream>
00025 #include "indri/MetadataPair.hpp"
00026 #include "indri/Buffer.hpp"
00027 
00028 #ifndef _TAGLIST_HPP
00029 #define _TAGLIST_HPP
00030 namespace indri
00031 {
00032   namespace parse
00033   {
00034     
00035     class TagList {
00036     private:
00037       struct tag_entry {
00038         const char* name;
00039         const char* conflation;
00040         int next;
00041         int begin;
00042         int end;
00043       };
00044 
00045       indri::utility::greedy_vector<tag_entry> _tags;
00046       int _openList;
00047 
00048       // Controls whether the list reconstructs a hierarchy of
00049       // tags when writeTagList is called.
00050       bool _findParents;
00051 
00052     public:
00053       TagList() : _findParents(true) {
00054         clear();
00055       }
00056       
00057       void setFindParents( bool findParents ) {
00058         _findParents = findParents;
00059       }
00060 
00061       void clear() {
00062         _tags.clear();
00063         _openList = -1;
00064       }
00065 
00066       // we assume here that name is more or less immutable
00067       // so we can store a pointer to it.  This is a reasonable
00068       // assumption, because if the tag is indexed, its name is
00069       // in a hash table somewhere, and we can just point to that
00070       // name copy.
00071       void addTag(const char *name, const char* conflation, int begin) {
00072         tag_entry t;
00073         t.name = name;
00074         t.conflation = conflation;
00075         t.begin = begin;
00076         t.end = -1;
00077         t.next = _openList;
00078         _tags.push_back(t);
00079         _openList = (int)_tags.size()-1;
00080       }
00081 
00082       void endTag(const char *name, const char* conflation, int end) {
00083         int list = _openList;
00084         int prev = -1;
00085         // finds the most recent open tag of this name
00086         while( list >= 0 ) {
00087           tag_entry& entry = _tags[list];
00088 
00089           if( !strcmp( entry.name, name ) ) {
00090             // found a tag to close
00091             entry.end = end;
00092             int next = entry.next;
00093 
00094             // unlink from open list
00095             if( prev == -1 ) {
00096               _openList = next;
00097             } else {
00098               _tags[prev].next = next;
00099             }
00100         
00101             return;
00102           } else {
00103             // this wasn't the tag, so keep looking
00104             prev = list;
00105             list = entry.next;
00106           }
00107         }
00108       }
00109 
00110       void writeTagList( indri::utility::greedy_vector<TagExtent *>& tags ) {
00111         // look through the tags vector; they're already in sorted order by open
00112         // position.  Only add closed tags.
00113 
00114         for( size_t i=0; i<_tags.size(); i++ ) {
00115           tag_entry& entry = _tags[i];
00116 
00117           if( entry.end >= 0 ) {// data field might be empty at head of doc
00118             TagExtent * extent = new TagExtent;
00119             extent->begin = entry.begin;
00120             extent->end = entry.end;
00121             extent->name = entry.conflation;
00122             extent->number = 0;
00123 
00124             if ( _findParents && (tags.size() > 0)) {
00125               // find this tag's parent
00126                 TagExtent * parent = tags.back();
00127                 while ( parent != NULL && 
00128                         parent->end <= extent->begin ) {
00129                   if ( parent->begin <= extent->begin &&
00130                        parent->end   >= extent->end ) break;
00131                   parent = parent->parent;
00132                 }
00133                 extent->parent = parent;
00134             } else {
00135               extent->parent = 0;
00136             }
00137 
00138             tags.push_back(extent);
00139           }
00140         }
00141       }
00142 
00143       // in this case, we'll treat the list of tags in this list
00144       // as if they were offsets into a metadata list
00145       void writeMetadataList( indri::utility::greedy_vector<MetadataPair>& pairs, indri::utility::Buffer& buffer, const char* docText ) {
00146         for( size_t i=0; i<_tags.size(); i++ ) {
00147           tag_entry& entry = _tags[i];
00148 
00149           if( entry.end > 0 ) {
00150             MetadataPair pair;
00151         
00152             // copy the text into a buffer
00153             int length = entry.end - entry.begin;
00154             char* spot = buffer.write(length+1);
00155             strncpy( spot, docText + entry.begin, length);
00156             spot[length] = 0;
00157 
00158             pair.key = entry.conflation;
00159             pair.value = spot;
00160             pair.valueLength = length+1;
00161 
00162             // docno is special -- its value must be stripped
00163             if( !strcmp( pair.key, "docno" ) ) {
00164               pair.stripValue();
00165             }
00166 
00167             pairs.push_back(pair);
00168           }
00169         }
00170       }
00171 
00172     };
00173   }
00174 }
00175 
00176     
00177 #endif

Generated on Tue Jun 15 11:02:56 2010 for Lemur by doxygen 1.3.4