2 Copyright (C) 2001-2006, William Joseph.
5 This file is part of GtkRadiant.
7 GtkRadiant is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or
10 (at your option) any later version.
12 GtkRadiant is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GtkRadiant; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #if !defined(INCLUDED_CONVERT_H)
23 #define INCLUDED_CONVERT_H
26 /// \brief Character encoding conversion.
28 #include "debugging/debugging.h"
30 #include <glib/gunicode.h>
31 #include <glib/gconvert.h>
33 #include "character.h"
35 /// \brief Returns the number of bytes required to represent \p character in UTF-8 encoding.
/// \param character Pointer to the first (lead) byte of a UTF-8 sequence; only that byte is inspected.
/// Each visible branch masks off the payload bits of the lead byte and compares the
/// remaining marker bits against the prefix that announces a given sequence length.
/// The 5- and 6-byte prefixes belong to the original UTF-8 definition (RFC 2279); RFC 3629
/// restricts UTF-8 to at most 4 bytes.
/// NOTE(review): the branch bodies and the fall-through path are not visible in this excerpt, so
/// the result for an ASCII (0xxxxxxx) or continuation (10xxxxxx) byte must be confirmed against
/// the full source.
36 inline std::size_t utf8_character_length(const char* character)
38 if((*character & 0xE0) == 0xC0) // 110xxxxx: lead byte of a 2-byte sequence
42 else if((*character & 0xF0) == 0xE0) // 1110xxxx: lead byte of a 3-byte sequence
46 else if((*character & 0xF8) == 0xF0) // 11110xxx: lead byte of a 4-byte sequence
50 else if((*character & 0xFC) == 0xF8) // 111110xx: lead byte of a 5-byte sequence (RFC 2279 only)
54 else if((*character & 0xFE) == 0xFC) // 1111110x: lead byte of a 6-byte sequence (RFC 2279 only)
/// \brief Constructs an empty character: null buffer, zero length.
66 UTF8Character() : buffer(0), length(0)
/// \brief Refers to the UTF-8 character starting at \p bytes.
/// The pointer is stored as-is (nothing is copied), so the bytes must stay valid for as long as
/// this object is used. The length is derived from the lead byte by utf8_character_length().
69 UTF8Character(const char* bytes) : buffer(bytes), length(utf8_character_length(bytes))
/// \brief Orders two UTF-8 characters by lexicographical comparison of their byte sequences.
/// A sequence that is a proper prefix of another orders before it, so an empty
/// (default-constructed) character orders before every non-empty one.
74 inline bool operator<(const UTF8Character& self, const UTF8Character& other)
// Bytes are compared as plain char values over [buffer, buffer + length) of each operand.
76 return std::lexicographical_compare(self.buffer, self.buffer + self.length, other.buffer, other.buffer + other.length);
79 /// \brief Writes \p c to \p ostream in Hex form. Useful for debugging.
/// \param ostream Destination stream; it must accept a HexChar through operator<<.
/// \param c Character whose c.length bytes, starting at c.buffer, are written in order.
80 template<typename TextOutputStreamType>
81 inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const UTF8Character& c)
// Visit every byte of the encoded character; an empty character writes nothing.
83 for(const char* p = c.buffer; p != c.buffer + c.length; ++p)
// HexChar is declared outside this excerpt; presumably it formats one byte as hexadecimal - confirm.
85 ostream << HexChar(*p);
92 /// \brief The character-set encoding for the current C locale.
94 /// Obtain the global instance with globalCharacterSet().
// Charset name as filled in by g_get_charset(); compared against null further down.
97 const char* m_charSet;
// g_get_charset() stores the locale's charset name through its argument and returns TRUE when
// that charset is UTF-8.
// NOTE(review): the body of this branch is not visible in this excerpt. Because the null test
// below is what reports "UTF-8", the branch presumably resets m_charSet to null - confirm.
101 if(g_get_charset(&m_charSet) != FALSE)
// A null m_charSet is the marker checked here (presumably the body of isUTF8(), whose
// signature line is not visible in this excerpt).
108 return m_charSet == 0;
/// \brief Accessor for the charset name; its result is passed to g_iconv_open() as the source
/// encoding elsewhere in this file. The body is not visible in this excerpt.
110 const char* get() const
// Holder type for the single shared CharacterSet. LazyStatic is declared outside this excerpt;
// presumably it constructs the instance on first access - confirm.
116 typedef LazyStatic<CharacterSet> GlobalCharacterSet;
118 /// \brief Returns the global instance of CharacterSet.
/// \return A reference to the instance owned by GlobalCharacterSet.
119 inline CharacterSet& globalCharacterSet()
121 return GlobalCharacterSet::instance();
/// \brief Associates one UTF-8 character with its single-byte extended-ascii equivalent.
/// Instances are the elements of the encode table in ExtendedASCIICharacterSet.
125 class UTF8CharacterToExtendedASCII
// The UTF-8 side of the mapping; this is the only member that operator< compares.
128 UTF8Character m_utf8;
/// \brief Constructs an unmapped entry: default (empty) UTF-8 character and the NUL byte.
130 UTF8CharacterToExtendedASCII() : m_c('\0')
/// \brief Constructs a mapping from \p utf8 to the extended-ascii byte \p c.
133 UTF8CharacterToExtendedASCII(const UTF8Character& utf8, char c) : m_utf8(utf8), m_c(c)
/// \brief Orders mappings by their UTF-8 character only; the extended-ascii byte is ignored.
/// This is the ordering used when the encode table is sorted with std::sort and searched with
/// std::equal_range, so two entries with the same UTF-8 bytes are equivalent under it.
138 inline bool operator<(const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other)
140 return self.m_utf8 < other.m_utf8;
/// \brief Maps an extended-ascii character to a table index by clearing its high bit.
/// \param c The character to map.
/// \return A value in the range [0, 127].
143 inline std::size_t extended_ascii_to_index(char c)
// Masking with 0x7F keeps only the low seven bits, so the result is non-negative whether or
// not plain char is signed.
145 return static_cast<std::size_t>(c & 0x7F);
/// \brief Inverse of extended_ascii_to_index(): sets the high bit of \p i and narrows to char.
/// \param i Table index; for i in [0, 127] the resulting byte value is in 0x80..0xFF.
148 inline char extended_ascii_for_index(std::size_t i)
150 return static_cast<char>(i | 0x80);
153 /// \brief The active extended-ascii character set encoding.
154 /// Performs UTF-8 encoding and decoding of extended-ascii characters.
156 /// Obtain the global instance with globalExtendedASCIICharacterSet().
157 class ExtendedASCIICharacterSet
// Storage for one converted character: six bytes, the longest sequence that
// utf8_character_length() recognises.
159 typedef char UTF8CharBuffer[6];
// Backing bytes for each converted character; the UTF8Character entries below point into it.
160 UTF8CharBuffer m_converted[128];
// Decode table indexed by extended_ascii_to_index(c). An entry whose buffer is null means no
// conversion was recorded for that byte.
161 UTF8Character m_decodeMap[128];
// Encode table: the same mappings keyed by UTF-8 character, sorted so that encode() can
// binary-search it.
162 UTF8CharacterToExtendedASCII m_encodeMap[128];
/// \brief Builds both tables by converting high-bit bytes from the locale charset to UTF-8.
164 ExtendedASCIICharacterSet()
// The tables are only needed when the locale charset is not already UTF-8.
166 if(!globalCharacterSet().isUTF8())
// Converter from the locale charset to UTF-8.
// NOTE(review): g_iconv_open() returns (GIConv)-1 when the conversion is unsupported; no check
// of the descriptor is visible in this excerpt - confirm against the full source.
168 GIConv descriptor = g_iconv_open("UTF-8", globalCharacterSet().get());
// NOTE(review): the loop starts at 1, so index 0 (byte 0x80) never receives a mapping -
// confirm this is intentional.
169 for(std::size_t i = 1; i < 128; ++i)
171 char c = extended_ascii_for_index(i);
// One input byte is converted at a time (inbuf is declared on a line not shown in this
// excerpt); the output goes into this index's slot of m_converted, at most six bytes.
173 std::size_t inbytesleft = 1;
174 char* outbuf = m_converted[i];
175 std::size_t outbytesleft = 6;
// g_iconv() reports failure as (size_t)-1; only successful conversions are recorded.
176 if(g_iconv(descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft) != (size_t)(-1))
// The UTF8Character refers to the bytes just written into m_converted[i]; its length is
// derived from their lead byte.
178 UTF8Character utf8(m_converted[i]);
179 m_decodeMap[i] = utf8;
180 m_encodeMap[i] = UTF8CharacterToExtendedASCII(utf8, c);
183 g_iconv_close(descriptor);
// Order the encode table by UTF-8 byte sequence, as required by std::equal_range in encode().
184 std::sort(m_encodeMap, m_encodeMap + 128);
187 /// \brief Prints the (up to) 128 characters in the current extended-ascii character set.
188 /// Useful for debugging.
191 globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n";
// Same index range as the table-building loop in the constructor.
192 for(std::size_t i = 1; i < 128; ++i)
// Skip bytes for which no conversion was recorded.
194 if(m_decodeMap[i].buffer != 0)
196 globalOutputStream() << extended_ascii_for_index(i) << " = " << m_decodeMap[i] << "\n";
200 /// \brief Returns \p c decoded from extended-ascii to UTF-8.
201 /// \p c must be an extended-ascii character.
/// The returned reference refers to an entry of this object's decode table.
202 const UTF8Character& decode(char c) const
// Preconditions checked through ASSERT_MESSAGE: the locale is not UTF-8, \p c is not ASCII,
// and a conversion was recorded for \p c.
204 ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");
205 ASSERT_MESSAGE(!char_is_ascii(c), "decode: ascii character");
206 ASSERT_MESSAGE(m_decodeMap[extended_ascii_to_index(c)].buffer != 0, "decode: invalid character: " << HexChar(c));
207 return m_decodeMap[extended_ascii_to_index(c)];
209 /// \brief Returns \p c encoded to extended-ascii from UTF-8.
210 /// \p c must map to an extended-ascii character.
211 char encode(const UTF8Character& c) const
213 ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");
214 ASSERT_MESSAGE(!char_is_ascii(*c.buffer), "encode: ascii character");
// Binary search of the sorted encode table. The probe's char value (0) is irrelevant because
// operator< on UTF8CharacterToExtendedASCII compares only the UTF-8 key.
215 std::pair<const UTF8CharacterToExtendedASCII*, const UTF8CharacterToExtendedASCII*> range
216 = std::equal_range(m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII(c, 0));
// An empty range means no table entry has this UTF-8 byte sequence.
// NOTE(review): if ASSERT_MESSAGE does not stop execution, range.first may equal
// m_encodeMap + 128 at the dereference below - confirm how the macro behaves in release builds.
217 ASSERT_MESSAGE(range.first != range.second, "encode: invalid character: " << c);
218 return (*range.first).m_c;
// Holder type for the single shared ExtendedASCIICharacterSet. LazyStatic is declared outside
// this excerpt; presumably it constructs the instance on first access - confirm.
222 typedef LazyStatic<ExtendedASCIICharacterSet> GlobalExtendedASCIICharacterSet;
224 /// \brief Returns the global instance of ExtendedASCIICharacterSet.
/// \return A reference to the instance owned by GlobalExtendedASCIICharacterSet.
225 inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet()
227 return GlobalExtendedASCIICharacterSet::instance();
/// \brief Wraps a range of UTF-8 text so that writing it to a stream with ostream_write()
/// converts it to the locale's encoding.
/// The range is stored as given; no copy of the characters is made in the visible constructors.
230 class ConvertUTF8ToLocale
/// \brief Wraps the NUL-terminated string \p string; its end is located with strlen().
234 ConvertUTF8ToLocale(const char* string) : m_range(StringRange(string, string + strlen(string)))
/// \brief Wraps an existing character range.
237 ConvertUTF8ToLocale(const StringRange& range) : m_range(range)
242 /// \brief Writes \p convert to \p ostream after encoding each character to extended-ascii from UTF-8.
/// \param ostream Destination stream.
/// \param convert Wrapper holding the UTF-8 range to write.
243 template<typename TextOutputStreamType>
244 inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert)
// When the locale is already UTF-8 the range is written through unchanged.
246 if(globalCharacterSet().isUTF8())
248 return ostream << convert.m_range;
// Otherwise walk the range. The for-header has no increment expression, so p must be advanced
// inside the loop body (those lines are not visible in this excerpt).
251 for(const char* p = convert.m_range.first; p != convert.m_range.last;)
// Bytes rejected by char_is_ascii() are handed to the extended-ascii encoder.
253 if(!char_is_ascii(*p))
// c is declared on a line not shown here; presumably a UTF8Character starting at p - confirm.
256 ostream << globalExtendedASCIICharacterSet().encode(c);
/// \brief Wraps a range of locale-encoded text so that writing it to a stream with
/// ostream_write() converts it to UTF-8.
/// The range is stored as given; no copy of the characters is made in the visible constructors.
268 class ConvertLocaleToUTF8
/// \brief Wraps the NUL-terminated string \p string; its end is located with strlen().
272 ConvertLocaleToUTF8(const char* string) : m_range(StringRange(string, string + strlen(string)))
/// \brief Wraps an existing character range.
275 ConvertLocaleToUTF8(const StringRange& range) : m_range(range)
280 /// \brief Writes \p convert to \p ostream after decoding each character from extended-ascii to UTF-8.
281 template<typename TextOutputStreamType>
282 inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert)
284 if(globalCharacterSet().isUTF8())
286 return ostream << convert.m_range;
289 for(const char* p = convert.m_range.first; p != convert.m_range.last; ++p)
291 if(!char_is_ascii(*p))
293 UTF8Character c(globalExtendedASCIICharacterSet().decode(*p));
294 ostream.write(c.buffer, c.length);