00001 /*========================================================================== 00002 * Copyright (c) 2003-2005 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 // 00013 // TextTokenizer 00014 // 00015 // 15 September 2005 -- mwb 00016 // 00017 00018 #ifndef INDRI_TEXTTOKENIZER_HPP 00019 #define INDRI_TEXTTOKENIZER_HPP 00020 00021 #include <stdio.h> 00022 #include <string> 00023 #include <map> 00024 00025 #include "indri/IndriTokenizer.hpp" 00026 #include "indri/Buffer.hpp" 00027 #include "indri/TagEvent.hpp" 00028 #include "indri/UnparsedDocument.hpp" 00029 #include "indri/TokenizedDocument.hpp" 00030 #include "indri/UTF8Transcoder.hpp" 00031 00032 namespace indri { 00033 namespace parse { 00034 00035 class TextTokenizer : public Tokenizer { 00036 00037 public: 00038 TextTokenizer( bool tokenize_markup = true, bool tokenize_entire_words = true ) : _handler(0) { 00039 00040 _tokenize_markup = tokenize_markup; 00041 _tokenize_entire_words = tokenize_entire_words; 00042 } 00043 00044 ~TextTokenizer() {} 00045 00046 TokenizedDocument* tokenize( UnparsedDocument* document ); 00047 00048 void handle( UnparsedDocument* document ); 00049 void setHandler( ObjectHandler<TokenizedDocument>& h ); 00050 00051 protected: 00052 void processASCIIToken(); 00053 void processUTF8Token(); 00054 void processTag(); 00055 00056 indri::utility::Buffer _termBuffer; 00057 UTF8Transcoder _transcoder; 00058 00059 bool _tokenize_markup; 00060 bool _tokenize_entire_words; 00061 00062 private: 00063 ObjectHandler<TokenizedDocument>* _handler; 00064 TokenizedDocument _document; 00065 00066 void writeToken( char* token, int token_len, int extent_begin, 00067 int extent_end ); 00068 }; 00069 } 00070 } 00071 00072 #endif // INDRI_TEXTTOKENIZER_HPP 00073