00001 /*========================================================================== 00002 * Copyright (c) 2003-2005 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 // 00013 // UTF8CaseNormalizationTransformation 00014 // 00015 // 16 September 2005 -- mwb 00016 // 00017 00018 #ifndef INDRI_UTF8CASENORMALIZATIONTRANSFORMATION_HPP 00019 #define INDRI_UTF8CASENORMALIZATIONTRANSFORMATION_HPP 00020 00021 #include "indri/indri-platform.h" 00022 #include "indri/Transformation.hpp" 00023 #include "indri/UTF8Transcoder.hpp" 00024 #include "indri/ParsedDocument.hpp" 00025 #include "indri/HashTable.hpp" 00026 #include <vector> 00027 #include <map> 00028 00029 namespace indri { 00030 namespace parse { 00031 00032 class UTF8CaseNormalizationTransformation : public Transformation { 00033 00034 private: 00035 ObjectHandler<indri::api::ParsedDocument>* _handler; 00036 UTF8Transcoder _transcoder; 00037 std::vector<char*> _buffers_allocated; 00038 00039 indri::utility::HashTable<UINT64,UINT64> _downcase; 00040 void _initHT(); 00041 00042 public: 00043 UTF8CaseNormalizationTransformation(); 00044 ~UTF8CaseNormalizationTransformation(); 00045 00046 void handle( indri::api::ParsedDocument* document ); 00047 void setHandler( ObjectHandler<indri::api::ParsedDocument>& handler ); 00048 00049 indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document ); 00050 00051 }; 00052 } 00053 } 00054 00055 #endif // INDRI_UTF8CASENORMALIZATIONTRANSFORMATION_HPP 00056