#include <TextTokenizer.hpp>
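Inheritance diagram for indri::parse::TextTokenizer: (diagram image not available in this text version; as noted in the member documentation below, TextTokenizer implements indri::parse::Tokenizer)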

Public Member Functions | |
| | TextTokenizer (bool tokenize_markup=true, bool tokenize_entire_words=true) |
| | ~TextTokenizer () |
| TokenizedDocument * | tokenize (UnparsedDocument *document) |
| void | handle (UnparsedDocument *document) |
| void | setHandler (ObjectHandler< TokenizedDocument > &h) |
Protected Member Functions | |
| void | processASCIIToken () |
| void | processUTF8Token () |
| void | processTag () |
Protected Attributes | |
| indri::utility::Buffer | _termBuffer |
| UTF8Transcoder | _transcoder |
| bool | _tokenize_markup |
| bool | _tokenize_entire_words |
Private Member Functions | |
| void | writeToken (char *token, int token_len, int extent_begin, int extent_end) |
Private Attributes | |
| ObjectHandler< TokenizedDocument > * | _handler |
| TokenizedDocument | _document |
|
||||||||||||
|
|
|
|
|
|
|
Implements indri::parse::Tokenizer. |
|
|
|
|
|
|
|
|
|
|
|
Implements indri::parse::Tokenizer. |
|
|
Implements indri::parse::Tokenizer. |
|
||||||||||||||||||||
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Generated by Doxygen 1.3.4