Lemur: UTF8Transcoder.hpp Source File

00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 //
00014 // UTF8Transcoder
00015 //
00016 // 15 September 2005 -- mwb
00017 //
00018 
00019 // Simple class that converts Unicode characters to and from UTF-8
00020 // encoding.  This class is used by the TextTokenizer for interpreting
00021 // strings identified as possibly being in UTF-8 encoding, and by the
00022 // UTF8CaseNormalizationTransformation.
00023 
00024 #ifndef INDRI_UTF8TRANSCODER_HPP
00025 #define INDRI_UTF8TRANSCODER_HPP
00026 
00027 #include "indri/indri-platform.h"
00028 #include "indri/HashTable.hpp"
00029 #include <string.h>
00030 
00031 namespace indri {
00032   namespace parse {
00033     
00034     namespace CharClass {  // integer codes naming classes of characters
00035       // NOTE(review): presumably the values stored in UTF8Transcoder's table -- TODO confirm
00036       const int apostrophe = 1;
00037       const int percent = 2;
00038       const int control = 3;
00039       const int currency = 4;
00040       const int symbol = 5;
00041       const int letter = 6;
00042       const int digit = 7;
00043       const int punctuation = 8;
00044       const int whitespace = 9;
00045       const int decimal = 10;   // presumably a decimal separator -- TODO confirm
00046       const int hyphen = 11;
00047       const int thousand = 12;  // presumably a thousands separator -- TODO confirm
00048     }  // namespace CharClass
00049 
00050     class UTF8Transcoder {
00051       // Converts Unicode characters to and from UTF-8 encoding.
00052     private:
00053 
00054       // Checks buf for a sequence of bytes in the range 0x80-0xBF
00055       // (inclusive; the UTF-8 continuation bytes) and returns how many exist.
00056       // NOTE(review): index/max_index/how_many presumably bound the scan -- confirm in the .cpp
00057       int _count_bytes( unsigned char* buf, int index, int max_index, int how_many );
00058       // Table returned by unicode(); presumably maps code points to CharClass values -- TODO confirm
00059       indri::utility::HashTable<UINT64,const int> u;
00060       void store_interval( indri::utility::HashTable<UINT64,const int>&
00061                                   table, UINT64 start, UINT64 end, 
00062                                   const int cls );  // NOTE(review): presumably tags start..end in table with cls -- confirm
00063       // NOTE(review): presumably populates u (name suggests "init hash table") -- confirm in the .cpp
00064       void _initHT() ;
00065 
00066     public:
00067       UTF8Transcoder();
00068       ~UTF8Transcoder();
00069 
00070       // Computes the UTF-8 byte sequence for the specified Unicode
00071       // character code.  The bytes are written into the supplied
00072       // buffer, which must be large enough to hold the byte sequence
00073       // (always <= 6 bytes in length) as well as the terminating
00074       // null.  The number of octets, i.e. the number of bytes written
00075       // to the buffer not including the terminating null, is stored
00076       // in the integer that octets points to.
00077 
00078       void utf8_encode( UINT64 code, char* buf, int* octets );
00079 
00080       // Decodes a char[] assumed to be in UTF-8 encoding.  Results
00081       // are stored as Unicode codes in the supplied UINT64[] array,
00082       // which must have as many elements as the char[] array because,
00083       // in the worst case, the input is an ASCII string.  The number
00084       // of characters decoded is stored in the characters integer,
00085       // and the number of malformed bytes skipped is stored in the
00086       // malformed integer.  Offsets and lengths are optional
00087       // parameters; if not NULL, they will be filled with the byte
00088       // offsets where the UTF-8 characters begin and the length of
00089       // each encoding in bytes, as they occur in the input buffer.
00090       // Offsets and lengths must have as many elements as the input
00091       // buffer has bytes.
00092       // NOTE(review): codes/offsets/lengths are pointer-to-pointer parameters -- check the .cpp for how the arrays are accessed
00093       void utf8_decode( const char* buf_in, UINT64** codes, int* characters,
00094                         int* malformed, int** offsets, int** lengths );
00095       // Returns a mutable reference to the internal table u.
00096       indri::utility::HashTable<UINT64,const int>& unicode() {
00097         return u;
00098       }      
00099     };
00100 
00101   }
00102 }
00103 
00104 #endif // INDRI_UTF8TRANSCODER_HPP
00105