Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

WordDocumentExtractor.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // WordDocumentExtractor
00015 //
00016 // 14 June 2004 -- tds
00017 //
00018 // Code is based in part on the AutoWord class
00019 // by Poonam Bajaj.
00020 //
00021 #ifndef INDRI_WORDDOCUMENTEXTRACTOR_HPP
00022 #define INDRI_WORDDOCUMENTEXTRACTOR_HPP
00023 
00024 #ifdef WIN32
00025 
00026 #include "lemur-compat.hpp"
00027 #include "indri/Buffer.hpp"
00028 #include "indri/UnparsedDocument.hpp"
00029 #include "Exception.hpp"
00030 #include <string>
00031 #include "indri/DocumentIterator.hpp"
00032 #include "indri/OfficeHelper.hpp"
00033 
00034 namespace indri
00035 {
00036   namespace parse
00037   {
00038     
00039     class WordDocumentExtractor : public DocumentIterator { // DocumentIterator for Word documents; this declaration is only compiled when WIN32 is defined
00040     private:
00041       void* _internal; // opaque pointer; its concrete type is not visible in this header
00042       indri::utility::Buffer _documentTextBuffer; // presumably holds the text extracted from the document -- TODO confirm in the .cpp
00043       UnparsedDocument _unparsedDocument; // NOTE(review): presumably the object nextDocument() hands back -- confirm
00044 
00045       std::string _documentPath; // presumably the filename given to open() -- TODO confirm
00046 
00047       OfficeHelper _officeHelper; // helper type declared in indri/OfficeHelper.hpp
00048 
00049       bool _documentWaiting; // NOTE(review): looks like a flag for "opened but not yet returned by nextDocument()" -- confirm
00050 
00051       void initialize(); // private setup helper; body is not in this header
00052       void uninitialize(); // private teardown helper; body is not in this header
00053       void closeWord(IDispatch* documentDispatch, bool quit); // takes the document's COM IDispatch pointer; 'quit' presumably also shuts Word down -- confirm
00054     public:
00055       WordDocumentExtractor();
00056       ~WordDocumentExtractor();
00057       void open( const std::string& filename ); // filename: the document to open
00058       UnparsedDocument* nextDocument( ); // returns a pointer to an UnparsedDocument; ownership and end-of-input value are not visible here
00059       void quit(); // NOTE(review): presumably terminates the Word instance, unlike close() -- confirm against the .cpp
00060       void close(); // NOTE(review): presumably closes the current document only -- confirm against the .cpp
00061     };
00062   }
00063 }
00064 
00065 #endif
00066 #endif // INDRI_WORDDOCUMENTEXTRACTOR_HPP

Generated on Tue Jun 15 11:02:56 2010 for Lemur by doxygen 1.3.4