00001 /*========================================================================== 00002 * Copyright (c) 2004 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 // 00014 // UTF8Transcoder 00015 // 00016 // 15 September 2005 -- mwb 00017 // 00018 00019 // Simple class that converts Unicode characters to and from UTF-8 00020 // encoding. This class is used by the TextTokenizer for interpreting 00021 // strings identified as possibly being in UTF-8 encoding, and by the 00022 // UTF8CaseNormalizationTransformation. 00023 00024 #ifndef INDRI_UTF8TRANSCODER_HPP 00025 #define INDRI_UTF8TRANSCODER_HPP 00026 00027 #include "indri/indri-platform.h" 00028 #include "indri/HashTable.hpp" 00029 #include <string.h> 00030 00031 namespace indri { 00032 namespace parse { 00033 00034 namespace CharClass { 00035 00036 const int apostrophe = 1; 00037 const int percent = 2; 00038 const int control = 3; 00039 const int currency = 4; 00040 const int symbol = 5; 00041 const int letter = 6; 00042 const int digit = 7; 00043 const int punctuation = 8; 00044 const int whitespace = 9; 00045 const int decimal = 10; 00046 const int hyphen = 11; 00047 const int thousand = 12; 00048 } 00049 00050 class UTF8Transcoder { 00051 00052 private: 00053 00054 // This function checks for a sequence of bytes between and 00055 // inclusive of 0x80 and 0xBF and returns how many exist. 00056 00057 int _count_bytes( unsigned char* buf, int index, int max_index, int how_many ); 00058 00059 indri::utility::HashTable<UINT64,const int> u; 00060 void store_interval( indri::utility::HashTable<UINT64,const int>& 00061 table, UINT64 start, UINT64 end, 00062 const int cls ); 00063 00064 void _initHT() ; 00065 00066 public: 00067 UTF8Transcoder(); 00068 ~UTF8Transcoder(); 00069 00070 // This function computes the UTF-8 byte sequence for the 00071 // specified Unicode character code. The bytes are written into 00072 // the specified buffer, which must be large enough to hold the 00073 // byte sequence (always <= 6 bytes in length) as well as the 00074 // terminating null. The number of octets, which equals the 00075 // number of bytes written to the buffer not including the 00076 // terminating null, is stored in the octets integer. 00077 00078 void utf8_encode( UINT64 code, char* buf, int* octets ); 00079 00080 // This function decodes a char[] assumed to be in UTF-8 00081 // encoding. Results are stored in as unicode codes in the 00082 // supplied UINT64[] array, which must have as many elements as 00083 // the char[] array because in the worst case, it contains an 00084 // ASCII string. The number of characters decoded is stored in 00085 // the characters integer, and the number of malformed bytes 00086 // skipped is stored in the malformed integer. Offsets and 00087 // lengths are optional parameters; if not NULL, they will be 00088 // filled with byte offsets where the UTF-8 characters begin and 00089 // lengths of each encoding in bytes as they occurr in the input 00090 // buffer. Offsets and lengths must have as many elements as 00091 // the input buffer has bytes. 00092 00093 void utf8_decode( const char* buf_in, UINT64** codes, int* characters, 00094 int* malformed, int** offsets, int** lengths ); 00095 00096 indri::utility::HashTable<UINT64,const int>& unicode() { 00097 return u; 00098 } 00099 }; 00100 00101 } 00102 } 00103 00104 #endif // INDRI_UTF8TRANSCODER_HPP 00105