Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

UTF8CaseNormalizationTransformation.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2005 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 //
00013 // UTF8CaseNormalizationTransformation
00014 //
00015 // 16 September 2005 -- mwb
00016 //
00017 
00018 #ifndef INDRI_UTF8CASENORMALIZATIONTRANSFORMATION_HPP
00019 #define INDRI_UTF8CASENORMALIZATIONTRANSFORMATION_HPP
00020 
00021 #include "indri/indri-platform.h"
00022 #include "indri/Transformation.hpp"
00023 #include "indri/UTF8Transcoder.hpp"
00024 #include "indri/ParsedDocument.hpp"
00025 #include "indri/HashTable.hpp"
00026 #include <vector>
00027 #include <map>
00028 
00029 namespace indri {
00030   namespace parse {
00031     
00032     class UTF8CaseNormalizationTransformation : public Transformation {
            // A Transformation that operates on indri::api::ParsedDocument objects.
            // Judging by the class name and the _downcase table below, it presumably
            // lowercases UTF-8 text in the document; the member functions are only
            // declared here, so confirm the details against the implementation file.
            //
            // NOTE(review): the class declares a destructor and holds raw char*
            // buffers, but no copy constructor or copy assignment is declared in
            // this header -- confirm that instances are never copied.
00033 
00034     private:
            // Downstream handler; setHandler() takes a reference of the same
            // ObjectHandler type, so presumably that is where this pointer is set.
00035       ObjectHandler<indri::api::ParsedDocument>* _handler;
            // Transcoder instance owned by value; presumably used to convert
            // between UTF-8 byte sequences and code points -- verify in the .cpp.
00036       UTF8Transcoder _transcoder;
            // Raw character buffers tracked by this object. The name suggests they
            // are allocated during processing and released later (likely in the
            // destructor) -- TODO confirm ownership and release point.
00037       std::vector<char*> _buffers_allocated;
00038 
            // Table from one 64-bit value to another; by its name, presumably maps
            // an upper-case code point to its lower-case equivalent.
00039       indri::utility::HashTable<UINT64,UINT64> _downcase;
            // Presumably populates _downcase ("HT" = hash table) -- confirm in the
            // implementation file.
00040       void _initHT();
00041 
00042     public:
            // Construction and destruction are defined outside this header.
00043       UTF8CaseNormalizationTransformation();
00044       ~UTF8CaseNormalizationTransformation();
00045 
            // Accepts a parsed document; presumably transforms it and passes the
            // result on to _handler -- verify against the implementation.
00046       void handle( indri::api::ParsedDocument* document );
            // Registers the handler that is to receive documents from this stage.
            // NOTE(review): taken by reference while _handler is a pointer, so the
            // referenced handler presumably has to outlive this object -- confirm.
00047       void setHandler( ObjectHandler<indri::api::ParsedDocument>& handler );
00048 
            // Takes a parsed document and returns a parsed document pointer.
            // Whether the returned pointer is the same object modified in place or
            // a different one cannot be determined from this declaration.
00049       indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document );
00050 
00051     };
00052   }
00053 }
00054 
00055 #endif // INDRI_UTF8CASENORMALIZATIONTRANSFORMATION_HPP
00056 

Generated on Tue Jun 15 11:02:56 2010 for Lemur by doxygen 1.3.4