libs/convert.h

   1 /*
   2    Copyright (C) 2001-2006, William Joseph.
   3    All Rights Reserved.
   4
   5    This file is part of GtkRadiant.
   6
   7    GtkRadiant is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 2 of the License, or
  10    (at your option) any later version.
  11
  12    GtkRadiant is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GtkRadiant; if not, write to the Free Software
  19    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  20  */
  21
  22 #if !defined( INCLUDED_CONVERT_H )
  23 #define INCLUDED_CONVERT_H
  24
  25 /// \file
  26 /// \brief Character encoding conversion.
  27
  28 #include "debugging/debugging.h"
  29 #include <algorithm>
  30 #include <glib/gunicode.h>
  31 #include <glib/gconvert.h>
  32
  33 #include "character.h"
  34
  35 /// \brief Returns the number of bytes required to represent \p character in UTF-8 encoding.
  36 inline std::size_t utf8_character_length( const char* character ){
  37         if ( ( *character & 0xE0 ) == 0xC0 ) { // 110xxxxx
  38                 return 2;
  39         }
  40         else if ( ( *character & 0xF0 ) == 0xE0 ) { // 1110xxxx
  41                 return 3;
  42         }
  43         else if ( ( *character & 0xF8 ) == 0xF0 ) { // 11110xxx
  44                 return 4;
  45         }
  46         else if ( ( *character & 0xFC ) == 0xF8 ) { // 111110xx
  47                 return 5;
  48         }
  49         else if ( ( *character & 0xFE ) == 0xFC ) { // 1111110x
  50                 return 6;
  51         }
  52         ERROR_MESSAGE( "" );
  53         return 0;
  54 }
  55
  56 struct UTF8Character
  57 {
  58         const char* buffer;
  59         std::size_t length;
  60         UTF8Character() : buffer( 0 ), length( 0 ){
  61         }
  62         UTF8Character( const char* bytes ) : buffer( bytes ), length( utf8_character_length( bytes ) ){
  63         }
  64 };
  65
  66 inline bool operator<( const UTF8Character& self, const UTF8Character& other ){
  67         return std::lexicographical_compare( self.buffer, self.buffer + self.length, other.buffer, other.buffer + other.length );
  68 }
  69
  70 /// \brief Writes \p c to \p ostream in Hex form. Useful for debugging.
  71 template<typename TextOutputStreamType>
  72 inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const UTF8Character& c ){
  73         for ( const char* p = c.buffer; p != c.buffer + c.length; ++p )
  74         {
  75                 ostream << HexChar( *p );
  76         }
  77         return ostream;
  78 }
  79
  80
  81
  82 /// \brief The character-set encoding for the current C locale.
  83 ///
  84 /// Obtain the global instance with globalCharacterSet().
  85 class CharacterSet
  86 {
  87 const char* m_charSet;
  88 public:
  89 CharacterSet(){
  90         if ( g_get_charset( &m_charSet ) != FALSE ) {
  91                 m_charSet = 0;
  92         }
  93 }
  94 bool isUTF8() const {
  95         return m_charSet == 0;
  96 }
  97 const char* get() const {
  98         return m_charSet;
  99 }
 100 };
 101
 102 typedef LazyStatic<CharacterSet> GlobalCharacterSet;
 103
 104 /// \brief Returns the global instance of CharacterSet.
 105 inline CharacterSet& globalCharacterSet(){
 106         return GlobalCharacterSet::instance();
 107 }
 108
 109
 110 class UTF8CharacterToExtendedASCII
 111 {
 112 public:
 113 UTF8Character m_utf8;
 114 char m_c;
 115 UTF8CharacterToExtendedASCII() : m_c( '\0' ){
 116 }
 117 UTF8CharacterToExtendedASCII( const UTF8Character& utf8, char c ) : m_utf8( utf8 ), m_c( c ){
 118 }
 119 };
 120
 121 inline bool operator<( const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other ){
 122         return self.m_utf8 < other.m_utf8;
 123 }
 124
 125 inline std::size_t extended_ascii_to_index( char c ){
 126         return static_cast<std::size_t>( c & 0x7F );
 127 }
 128
 129 inline char extended_ascii_for_index( std::size_t i ){
 130         return static_cast<char>( i | 0x80 );
 131 }
 132
 133 /// \brief The active extended-ascii character set encoding.
 134 /// Performs UTF-8 encoding and decoding of extended-ascii characters.
 135 ///
 136 /// Obtain the global instance with globalExtendedASCIICharacterSet().
 137 class ExtendedASCIICharacterSet
 138 {
 139 typedef char UTF8CharBuffer[6];
 140 UTF8CharBuffer m_converted[128];
 141 UTF8Character m_decodeMap[128];
 142 UTF8CharacterToExtendedASCII m_encodeMap[128];
 143 public:
 144 ExtendedASCIICharacterSet(){
 145         if ( !globalCharacterSet().isUTF8() ) {
 146                 GIConv descriptor = g_iconv_open( "UTF-8", globalCharacterSet().get() );
 147                 for ( std::size_t i = 1; i < 128; ++i )
 148                 {
 149                         char c = extended_ascii_for_index( i );
 150                         char* inbuf = &c;
 151                         std::size_t inbytesleft = 1;
 152                         char* outbuf = m_converted[i];
 153                         std::size_t outbytesleft = 6;
 154                         if ( g_iconv( descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft ) != (size_t)( -1 ) ) {
 155                                 UTF8Character utf8( m_converted[i] );
 156                                 m_decodeMap[i] = utf8;
 157                                 m_encodeMap[i] = UTF8CharacterToExtendedASCII( utf8, c );
 158                         }
 159                 }
 160                 g_iconv_close( descriptor );
 161                 std::sort( m_encodeMap, m_encodeMap + 128 );
 162         }
 163 }
 164 /// \brief Prints the (up to) 128 characters in the current extended-ascii character set.
 165 /// Useful for debugging.
 166 void print() const {
 167         globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n";
 168         for ( std::size_t i = 1; i < 128; ++i )
 169         {
 170                 if ( m_decodeMap[i].buffer != 0 ) {
 171                         globalOutputStream() << extended_ascii_for_index( i ) << " = " << m_decodeMap[i] << "\n";
 172                 }
 173         }
 174 }
 175 /// \brief Returns \p c decoded from extended-ascii to UTF-8.
 176 /// \p c must be an extended-ascii character.
 177 const UTF8Character& decode( char c ) const {
 178         ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" );
 179         ASSERT_MESSAGE( !char_is_ascii( c ), "decode: ascii character" );
 180         ASSERT_MESSAGE( m_decodeMap[extended_ascii_to_index( c )].buffer != 0, "decode: invalid character: " << HexChar( c ) );
 181         return m_decodeMap[extended_ascii_to_index( c )];
 182 }
 183 /// \brief Returns \p c encoded to extended-ascii from UTF-8.
 184 /// \p c must map to an extended-ascii character.
 185 char encode( const UTF8Character& c ) const {
 186         ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" );
 187         ASSERT_MESSAGE( !char_is_ascii( *c.buffer ), "encode: ascii character" );
 188         std::pair<const UTF8CharacterToExtendedASCII*, const UTF8CharacterToExtendedASCII*> range
 189                 = std::equal_range( m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII( c, 0 ) );
 190         ASSERT_MESSAGE( range.first != range.second, "encode: invalid character: " << c );
 191         return ( *range.first ).m_c;
 192 }
 193 };
 194
 195 typedef LazyStatic<ExtendedASCIICharacterSet> GlobalExtendedASCIICharacterSet;
 196
 197 /// \brief Returns the global instance of ExtendedASCIICharacterSet.
 198 inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet(){
 199         return GlobalExtendedASCIICharacterSet::instance();
 200 }
 201
 202 class ConvertUTF8ToLocale
 203 {
 204 public:
 205 StringRange m_range;
 206 ConvertUTF8ToLocale( const char* string ) : m_range( StringRange( string, string + strlen( string ) ) ){
 207 }
 208 ConvertUTF8ToLocale( const StringRange& range ) : m_range( range ){
 209 }
 210 };
 211
 212 /// \brief Writes \p convert to \p ostream after encoding each character to extended-ascii from UTF-8.
 213 template<typename TextOutputStreamType>
 214 inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert ){
 215         if ( globalCharacterSet().isUTF8() ) {
 216                 return ostream << convert.m_range;
 217         }
 218
 219         for ( const char* p = convert.m_range.first; p != convert.m_range.last; )
 220         {
 221                 if ( !char_is_ascii( *p ) ) {
 222                         UTF8Character c( p );
 223                         ostream << globalExtendedASCIICharacterSet().encode( c );
 224                         p += c.length;
 225                 }
 226                 else
 227                 {
 228                         ostream << *p++;
 229                 }
 230         }
 231         return ostream;
 232 }
 233
 234
 235 class ConvertLocaleToUTF8
 236 {
 237 public:
 238 StringRange m_range;
 239 ConvertLocaleToUTF8( const char* string ) : m_range( StringRange( string, string + strlen( string ) ) ){
 240 }
 241 ConvertLocaleToUTF8( const StringRange& range ) : m_range( range ){
 242 }
 243 };
 244
 245 /// \brief Writes \p convert to \p ostream after decoding each character from extended-ascii to UTF-8.
 246 template<typename TextOutputStreamType>
 247 inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert ){
 248         if ( globalCharacterSet().isUTF8() ) {
 249                 return ostream << convert.m_range;
 250         }
 251
 252         for ( const char* p = convert.m_range.first; p != convert.m_range.last; ++p )
 253         {
 254                 if ( !char_is_ascii( *p ) ) {
 255                         UTF8Character c( globalExtendedASCIICharacterSet().decode( *p ) );
 256                         ostream.write( c.buffer, c.length );
 257                 }
 258                 else
 259                 {
 260                         ostream << *p;
 261                 }
 262         }
 263         return ostream;
 264 }
 265
 266
 267 #endif