#include <TextTokenizer.hpp>
Inheritance diagram for indri::parse::TextTokenizer (image not reproduced in this text version; the class implements indri::parse::Tokenizer):
Public Member Functions
TextTokenizer (bool tokenize_markup=true, bool tokenize_entire_words=true)
~TextTokenizer ()
TokenizedDocument * tokenize (UnparsedDocument *document)
void handle (UnparsedDocument *document)
void setHandler (ObjectHandler< TokenizedDocument > &h)
Protected Member Functions
void processASCIIToken ()
void processUTF8Token ()
void processTag ()
Protected Attributes
indri::utility::Buffer _termBuffer
UTF8Transcoder _transcoder
bool _tokenize_markup
bool _tokenize_entire_words
Private Member Functions
void writeToken (char *token, int token_len, int extent_begin, int extent_end)
Private Attributes
ObjectHandler< TokenizedDocument > * _handler
TokenizedDocument _document
|
|
|
|
|
Implements indri::parse::Tokenizer.
|
|
|
|
|
|
|
Implements indri::parse::Tokenizer.
|
Implements indri::parse::Tokenizer.
|
|
|
|
|
|
|
|
|
|
|
|
|
|