00001 /*========================================================================== 00002 * Copyright (c) 2003-2005 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 // 00014 // Tokenizer 00015 // 00016 // 15 September 2005 -- mwb 00017 // 00018 00019 #ifndef INDRI_TOKENIZER_HPP 00020 #define INDRI_TOKENIZER_HPP 00021 00022 #include "indri/ObjectHandler.hpp" 00023 #include "indri/UnparsedDocument.hpp" 00024 #include "indri/TokenizedDocument.hpp" 00025 // #include <map> 00026 // #include <vector> 00027 00028 namespace indri { 00029 namespace parse { 00030 00031 class Tokenizer : public ObjectHandler<UnparsedDocument> { 00032 public: 00033 Tokenizer() {} 00034 virtual ~Tokenizer() {} 00035 00036 virtual TokenizedDocument* tokenize( UnparsedDocument* document ) = 0; 00037 virtual void handle( UnparsedDocument* document ) = 0; 00038 virtual void setHandler( ObjectHandler<TokenizedDocument>& handler ) = 0; 00039 }; 00040 } 00041 } 00042 00043 #endif // INDRI_TOKENIZER_HPP 00044 00045