Lemur: TextTokenizer.hpp Source File

00001 /*==========================================================================
00002  * Copyright (c) 2003-2005 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 //
00013 // TextTokenizer
00014 //
00015 // 15 September 2005 -- mwb
00016 //
00017 
00018 #ifndef INDRI_TEXTTOKENIZER_HPP
00019 #define INDRI_TEXTTOKENIZER_HPP
00020 
00021 #include <stdio.h>
00022 #include <string>
00023 #include <map>
00024 
00025 #include "indri/IndriTokenizer.hpp"
00026 #include "indri/Buffer.hpp"
00027 #include "indri/TagEvent.hpp"
00028 #include "indri/UnparsedDocument.hpp"
00029 #include "indri/TokenizedDocument.hpp"
00030 #include "indri/UTF8Transcoder.hpp"
00031 
00032 namespace indri {
00033   namespace parse {
00034     
00035     class TextTokenizer : public Tokenizer {
00036       // Declaration only: apart from the inline constructor and destructor, methods are defined elsewhere.
00037     public:
00038       // Stores both tokenization flags exactly as passed and starts with a null handler pointer.
00039       TextTokenizer( bool tokenize_markup = true, bool tokenize_entire_words = true )
00040         : _tokenize_markup( tokenize_markup ),
00041           _tokenize_entire_words( tokenize_entire_words ),
00042           _handler( 0 ) {}
00043 
00044       ~TextTokenizer() {}
00045   
00046       TokenizedDocument* tokenize( UnparsedDocument* document );
00047 
00048       void handle( UnparsedDocument* document );
00049       void setHandler( ObjectHandler<TokenizedDocument>& h );
00050 
00051     protected:
00052       void processASCIIToken();
00053       void processUTF8Token();
00054       void processTag();
00055 
00056       indri::utility::Buffer _termBuffer;
00057       UTF8Transcoder _transcoder;
00058 
00059       bool _tokenize_markup;        // copy of the constructor's tokenize_markup argument
00060       bool _tokenize_entire_words;  // copy of the constructor's tokenize_entire_words argument
00061 
00062     private:
00063       ObjectHandler<TokenizedDocument>* _handler;  // set to 0 by the constructor
00064       TokenizedDocument _document;
00065 
00066       // NOTE(review): extent_begin/extent_end presumably bound the token in the source text -- confirm in the .cpp
00067       void writeToken( char* token, int token_len, int extent_begin, int extent_end );
00068     };
00069   }
00070 }
00071 
00072 #endif // INDRI_TEXTTOKENIZER_HPP
00073