libs/convert.h

   1 /*
   2 Copyright (C) 2001-2006, William Joseph.
   3 All Rights Reserved.
   4
   5 This file is part of GtkRadiant.
   6
   7 GtkRadiant is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2 of the License, or
  10 (at your option) any later version.
  11
  12 GtkRadiant is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GtkRadiant; if not, write to the Free Software
  19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  20 */
  21
  22 #if !defined(INCLUDED_CONVERT_H)
  23 #define INCLUDED_CONVERT_H
  24
  25 /// \file
  26 /// \brief Character encoding conversion.
  27
  28 #include "debugging/debugging.h"
  29 #include <algorithm>
  30 #include <glib/gunicode.h>
  31 #include <glib/gconvert.h>
  32
  33 #include "character.h"
  34
  35 /// \brief Returns the number of bytes required to represent \p character in UTF-8 encoding.
  36 inline std::size_t utf8_character_length(const char* character)
  37 {
  38   if((*character & 0xE0) == 0xC0) // 110xxxxx
  39   {
  40     return 2;
  41   }
  42   else if((*character & 0xF0) == 0xE0) // 1110xxxx
  43   {
  44     return 3;
  45   }
  46   else if((*character & 0xF8) == 0xF0) // 11110xxx
  47   {
  48     return 4;
  49   }
  50   else if((*character & 0xFC) == 0xF8) // 111110xx
  51   {
  52     return 5;
  53   }
  54   else if((*character & 0xFE) == 0xFC) // 1111110x
  55   {
  56     return 6;
  57   }
  58   ERROR_MESSAGE("");
  59   return 0;
  60 }
  61
  62 struct UTF8Character
  63 {
  64   const char* buffer;
  65   std::size_t length;
  66   UTF8Character() : buffer(0), length(0)
  67   {
  68   }
  69   UTF8Character(const char* bytes) : buffer(bytes), length(utf8_character_length(bytes))
  70   {
  71   }
  72 };
  73
  74 inline bool operator<(const UTF8Character& self, const UTF8Character& other)
  75 {
  76   return std::lexicographical_compare(self.buffer, self.buffer + self.length, other.buffer, other.buffer + other.length);
  77 }
  78
  79 /// \brief Writes \p c to \p ostream in Hex form. Useful for debugging.
  80 template<typename TextOutputStreamType>
  81 inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const UTF8Character& c)
  82 {
  83   for(const char* p = c.buffer; p != c.buffer + c.length; ++p)
  84   {
  85     ostream << HexChar(*p);
  86   }
  87   return ostream;
  88 }
  89
  90
  91
  92 /// \brief The character-set encoding for the current C locale.
  93 ///
  94 /// Obtain the global instance with globalCharacterSet().
  95 class CharacterSet
  96 {
  97   const char* m_charSet;
  98 public:
  99   CharacterSet()
 100   {
 101     if(g_get_charset(&m_charSet) != FALSE)
 102     {
 103       m_charSet = 0;
 104     }
 105   }
 106   bool isUTF8() const
 107   {
 108     return m_charSet == 0;
 109   }
 110   const char* get() const
 111   {
 112     return m_charSet;
 113   }
 114 };
 115
 116 typedef LazyStatic<CharacterSet> GlobalCharacterSet;
 117
 118 /// \brief Returns the global instance of CharacterSet.
 119 inline CharacterSet& globalCharacterSet()
 120 {
 121   return GlobalCharacterSet::instance();
 122 }
 123
 124
 125 class UTF8CharacterToExtendedASCII
 126 {
 127 public:
 128   UTF8Character m_utf8;
 129   char m_c;
 130   UTF8CharacterToExtendedASCII() : m_c('\0')
 131   {
 132   }
 133   UTF8CharacterToExtendedASCII(const UTF8Character& utf8, char c) : m_utf8(utf8), m_c(c)
 134   {
 135   }
 136 };
 137
 138 inline bool operator<(const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other)
 139 {
 140   return self.m_utf8 < other.m_utf8;
 141 }
 142
 143 inline std::size_t extended_ascii_to_index(char c)
 144 {
 145   return static_cast<std::size_t>(c & 0x7F);
 146 }
 147
 148 inline char extended_ascii_for_index(std::size_t i)
 149 {
 150   return static_cast<char>(i | 0x80);
 151 }
 152
 153 /// \brief The active extended-ascii character set encoding.
 154 /// Performs UTF-8 encoding and decoding of extended-ascii characters.
 155 ///
 156 /// Obtain the global instance with globalExtendedASCIICharacterSet().
 157 class ExtendedASCIICharacterSet
 158 {
 159   typedef char UTF8CharBuffer[6];
 160   UTF8CharBuffer m_converted[128];
 161   UTF8Character m_decodeMap[128];
 162   UTF8CharacterToExtendedASCII m_encodeMap[128];
 163 public:
 164   ExtendedASCIICharacterSet()
 165   {
 166     if(!globalCharacterSet().isUTF8())
 167     {
 168       GIConv descriptor = g_iconv_open("UTF-8", globalCharacterSet().get());
 169       for(std::size_t i = 1; i < 128; ++i)
 170       {
 171         char c = extended_ascii_for_index(i);
 172         char* inbuf = &c;
 173         std::size_t inbytesleft = 1;
 174         char* outbuf = m_converted[i];
 175         std::size_t outbytesleft = 6;
 176         if(g_iconv(descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft) != (size_t)(-1))
 177         {
 178           UTF8Character utf8(m_converted[i]);
 179           m_decodeMap[i] = utf8;
 180           m_encodeMap[i] = UTF8CharacterToExtendedASCII(utf8, c);
 181         }
 182       }
 183       g_iconv_close(descriptor);
 184       std::sort(m_encodeMap, m_encodeMap + 128);
 185     }
 186   }
 187   /// \brief Prints the (up to) 128 characters in the current extended-ascii character set.
 188   /// Useful for debugging.
 189   void print() const
 190   {
 191     globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n";
 192     for(std::size_t i = 1; i < 128; ++i)
 193     {
 194       if(m_decodeMap[i].buffer != 0)
 195       {
 196         globalOutputStream() << extended_ascii_for_index(i) << " = " << m_decodeMap[i] << "\n";
 197       }
 198     }
 199   }
 200   /// \brief Returns \p c decoded from extended-ascii to UTF-8.
 201   /// \p c must be an extended-ascii character.
 202   const UTF8Character& decode(char c) const
 203   {
 204     ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");
 205     ASSERT_MESSAGE(!char_is_ascii(c), "decode: ascii character");
 206     ASSERT_MESSAGE(m_decodeMap[extended_ascii_to_index(c)].buffer != 0, "decode: invalid character: " << HexChar(c));
 207     return m_decodeMap[extended_ascii_to_index(c)];
 208   }
 209   /// \brief Returns \p c encoded to extended-ascii from UTF-8.
 210   /// \p c must map to an extended-ascii character.
 211   char encode(const UTF8Character& c) const
 212   {
 213     ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");
 214     ASSERT_MESSAGE(!char_is_ascii(*c.buffer), "encode: ascii character");
 215     std::pair<const UTF8CharacterToExtendedASCII*, const UTF8CharacterToExtendedASCII*> range
 216       = std::equal_range(m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII(c, 0));
 217     ASSERT_MESSAGE(range.first != range.second, "encode: invalid character: " << c);
 218     return (*range.first).m_c;
 219   }
 220 };
 221
 222 typedef LazyStatic<ExtendedASCIICharacterSet> GlobalExtendedASCIICharacterSet;
 223
 224 /// \brief Returns the global instance of ExtendedASCIICharacterSet.
 225 inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet()
 226 {
 227   return GlobalExtendedASCIICharacterSet::instance();
 228 }
 229
 230 class ConvertUTF8ToLocale
 231 {
 232 public:
 233   StringRange m_range;
 234   ConvertUTF8ToLocale(const char* string) : m_range(StringRange(string, string + strlen(string)))
 235   {
 236   }
 237   ConvertUTF8ToLocale(const StringRange& range) : m_range(range)
 238   {
 239   }
 240 };
 241
 242 /// \brief Writes \p convert to \p ostream after encoding each character to extended-ascii from UTF-8.
 243 template<typename TextOutputStreamType>
 244 inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert)
 245 {
 246   if(globalCharacterSet().isUTF8())
 247   {
 248     return ostream << convert.m_range;
 249   }
 250
 251   for(const char* p = convert.m_range.first; p != convert.m_range.last;)
 252   {
 253     if(!char_is_ascii(*p))
 254     {
 255       UTF8Character c(p);
 256       ostream << globalExtendedASCIICharacterSet().encode(c);
 257       p += c.length;
 258     }
 259     else
 260     {
 261       ostream << *p++;
 262     }
 263   }
 264   return ostream;
 265 }
 266
 267
 268 class ConvertLocaleToUTF8
 269 {
 270 public:
 271   StringRange m_range;
 272   ConvertLocaleToUTF8(const char* string) : m_range(StringRange(string, string + strlen(string)))
 273   {
 274   }
 275   ConvertLocaleToUTF8(const StringRange& range) : m_range(range)
 276   {
 277   }
 278 };
 279
 280 /// \brief Writes \p convert to \p ostream after decoding each character from extended-ascii to UTF-8.
 281 template<typename TextOutputStreamType>
 282 inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert)
 283 {
 284   if(globalCharacterSet().isUTF8())
 285   {
 286     return ostream << convert.m_range;
 287   }
 288
 289   for(const char* p = convert.m_range.first; p != convert.m_range.last; ++p)
 290   {
 291     if(!char_is_ascii(*p))
 292     {
 293       UTF8Character c(globalExtendedASCIICharacterSet().decode(*p));
 294       ostream.write(c.buffer, c.length);
 295     }
 296     else
 297     {
 298       ostream << *p;
 299     }
 300   }
 301   return ostream;
 302 }
 303
 304
 305 #endif