Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

DocumentStructure.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2006 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software (and below), and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 #ifndef INDRI_DOCUMENTSTRUCTURE_HPP
00013 #define INDRI_DOCUMENTSTRUCTURE_HPP
00014 
00015 #include "indri/FieldExtent.hpp"
00016 #include "indri/greedy_vector"
00017 #include <sstream>
00018 #include <vector>
00019 #include <set>
00020 
00021 typedef struct ninf {
00022   int id;
00023   int parent;
00024   int type;
00025 
00026   int begin;
00027   int end;
00028 
00029   UINT64 number;
00030   
00031   int length;
00032   int totalLength;
00033 
00034   int numChildren;
00035   int children;
00036 
00037 } nodeinfo_t;
00038 
00039 
00040 namespace indri {
00041   namespace index {
00042     
00043     class Index;
00044 
00045 class DocumentStructure {
00046 
00047 public:
00048 
00049   DocumentStructure(Index & index, const indri::utility::greedy_vector<indri::index::FieldExtent> & fields) 
00050     : _index( &index ) ,
00051       _numNodes( 0 ),
00052       _nodes( ),
00053       _childrenBuff( )
00054   {     
00055     loadStructure(fields);
00056   }
00057 
00058   DocumentStructure(Index & index) :
00059     _index( &index ) ,
00060     _numNodes( 0 ),
00061     _nodes( ),
00062     _childrenBuff( )
00063   { 
00064   }
00065 
00066   DocumentStructure() :
00067     _index( 0 ) ,
00068     _numNodes( 0 ),
00069     _nodes( ),
00070     _childrenBuff( )
00071   { 
00072   }
00073 
00074   void loadStructure(const indri::utility::greedy_vector<indri::index::FieldExtent> & fields);
00075   void setIndex( indri::index::Index & index );
00076   indri::index::Index * getIndex() { return _index; }
00077 
00078   ~DocumentStructure();
00079 
00080   int parent(int node);
00081   int length(int node);
00082   int begin(int node);
00083   int end(int node);
00084   int accumulatedLength(int node);
00085   int type(int node);
00086   UINT64 number(int node);
00087 
00088   typedef indri::utility::greedy_vector<int>::iterator child_iterator;
00089   // Gets the start of a list containing children of a node.  childrenBegin( 0 )
00090   // returns the start of a list containing the roots of trees in the document.
00091   child_iterator childrenBegin( int node );
00092   // Gets the end of a list containing children of a node. childrenEnd( 0 )
00093   // returns the end of the list containing the roots of trees in the document.
00094   child_iterator childrenEnd( int node );
00095   
00096 
00097   int nodeCount() { return _numNodes; }
00098 
00099   // Finds a the smallest leaf in a subtree that contains the extent.
00100   // By default searches the tree rooted by node 1 (BROKEN)
00101   int findLeaf(int begin, int end, int root = -1);
00102 
00103   // Finds the deepest nodess that contain the extent.
00104   // If exact is true, then the leaf must exactly match (rather than contain) the extent.
00105   // Caller is responsible for deleting the returned set.
00106   std::set<int> * findLeafs(int begin, int end, bool exact = false);
00107   void findLeafs(std::set<int> * leafs, int begin, int end, bool exact = false);
00108 
00109   // returns true if node == possibleAncestor or
00110   // if possibleAncestor is a parent or parent's parent ... of node
00111   bool ancestor(int node, int possibleAncestor);
00112 
00113   std::string path(int node);
00114   // returns the ordinal matching a string, 0 if unable to find the ordinal matching the xpath
00115   int fieldId( const std::string path );
00116 
00117   // builds an ordering of nodes that allows walking the trees in a breadth first fashion
00118   indri::utility::greedy_vector<int> & topDownOrder();
00119   void topDownOrder(std::set<int> & roots, indri::utility::greedy_vector<int> & order);
00120   
00121 protected:
00122 
00123   int _numNodes;
00124 
00125   indri::utility::greedy_vector<nodeinfo_t> _nodes;
00126   indri::utility::greedy_vector<int> _childrenBuff;
00127   
00128   Index * _index;
00129   void _constructNodePath(std::stringstream & path, int node);
00130   
00131   indri::utility::greedy_vector<int> _topDownOrder;
00132   // builds a breadth first walk and stores it in  _topDownOrder
00133   void _computeTopDownOrder();
00134 
00135   // push leafs matching the extent rooted at node into the set. 
00136   // will not insert a node if a descedant matches
00137   bool _findLeafs(int node, int begin, int end, bool exact, std::set<int> * leafs);
00138 
00139 };
00140 
00141   }
00142 }
00143 #endif
00144 

Generated on Tue Jun 15 11:02:53 2010 for Lemur by doxygen 1.3.4