00001 /*========================================================================== 00002 * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 // 00014 // PDFDocumentExtractor 00015 // 00016 // 25 June 2004 -- tds 00017 // 00018 00019 #ifndef INDRI_PDFDOCUMENTEXTRACTOR_HPP 00020 #define INDRI_PDFDOCUMENTEXTRACTOR_HPP 00021 00022 #include "lemur-compat.hpp" 00023 #include "indri/Buffer.hpp" 00024 #include "indri/UnparsedDocument.hpp" 00025 #include "indri/DocumentIterator.hpp" 00026 #include "indri/XMLReader.hpp" 00027 #include "indri/XMLNode.hpp" 00028 #include "indri/XMLWriter.hpp" 00029 #include <string> 00030 namespace indri 00031 { 00032 namespace parse 00033 { 00034 00035 class PDFDocumentExtractor : public DocumentIterator { 00036 indri::utility::Buffer _documentTextBuffer; 00037 UnparsedDocument _unparsedDocument; 00038 std::string _documentPath; 00039 00040 public: 00041 PDFDocumentExtractor(); 00042 ~PDFDocumentExtractor(); 00043 00044 void open( const std::string& filename ); 00045 UnparsedDocument* nextDocument(); 00046 void appendPdfMetaData(indri::xml::XMLNode* node); 00047 void seekValue(indri::xml::XMLNode* node, std::string &metaTag); 00048 void close(); 00049 private: 00050 std::string _title; 00051 std::string _author; 00052 00053 }; 00054 } 00055 } 00056 00057 #endif // INDRI_PDFDOCUMENTEXTRACTOR_HPP