]> git.xonotic.org Git - xonotic/netradiant.git/blob - libs/convert.h
Merge commit '830125fad042fad35dc029b6eb57c8156ad7e176'
[xonotic/netradiant.git] / libs / convert.h
1 /*
2    Copyright (C) 2001-2006, William Joseph.
3    All Rights Reserved.
4
5    This file is part of GtkRadiant.
6
7    GtkRadiant is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 2 of the License, or
10    (at your option) any later version.
11
12    GtkRadiant is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16
17    You should have received a copy of the GNU General Public License
18    along with GtkRadiant; if not, write to the Free Software
19    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20  */
21
22 #if !defined( INCLUDED_CONVERT_H )
23 #define INCLUDED_CONVERT_H
24
25 /// \file
26 /// \brief Character encoding conversion.
27
28 #include "debugging/debugging.h"
29 #include <algorithm>
30 #include <glib/gunicode.h>
31 #include <glib/gconvert.h>
32
33 #include "character.h"
34
35 /// \brief Returns the number of bytes required to represent \p character in UTF-8 encoding.
36 inline std::size_t utf8_character_length( const char* character ){
37         if ( ( *character & 0xE0 ) == 0xC0 ) { // 110xxxxx
38                 return 2;
39         }
40         else if ( ( *character & 0xF0 ) == 0xE0 ) { // 1110xxxx
41                 return 3;
42         }
43         else if ( ( *character & 0xF8 ) == 0xF0 ) { // 11110xxx
44                 return 4;
45         }
46         else if ( ( *character & 0xFC ) == 0xF8 ) { // 111110xx
47                 return 5;
48         }
49         else if ( ( *character & 0xFE ) == 0xFC ) { // 1111110x
50                 return 6;
51         }
52         ERROR_MESSAGE( "" );
53         return 0;
54 }
55
56 struct UTF8Character
57 {
58         const char* buffer;
59         std::size_t length;
60         UTF8Character() : buffer( 0 ), length( 0 ){
61         }
62         UTF8Character( const char* bytes ) : buffer( bytes ), length( utf8_character_length( bytes ) ){
63         }
64 };
65
66 inline bool operator<( const UTF8Character& self, const UTF8Character& other ){
67         return std::lexicographical_compare( self.buffer, self.buffer + self.length, other.buffer, other.buffer + other.length );
68 }
69
70 /// \brief Writes \p c to \p ostream in Hex form. Useful for debugging.
71 template<typename TextOutputStreamType>
72 inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const UTF8Character& c ){
73         for ( const char* p = c.buffer; p != c.buffer + c.length; ++p )
74         {
75                 ostream << HexChar( *p );
76         }
77         return ostream;
78 }
79
80
81
82 /// \brief The character-set encoding for the current C locale.
83 ///
84 /// Obtain the global instance with globalCharacterSet().
85 class CharacterSet
86 {
87 const char* m_charSet;
88 public:
89 CharacterSet(){
90         if ( g_get_charset( &m_charSet ) != FALSE ) {
91                 m_charSet = 0;
92         }
93 }
94 bool isUTF8() const {
95         return m_charSet == 0;
96 }
97 const char* get() const {
98         return m_charSet;
99 }
100 };
101
102 typedef LazyStatic<CharacterSet> GlobalCharacterSet;
103
104 /// \brief Returns the global instance of CharacterSet.
105 inline CharacterSet& globalCharacterSet(){
106         return GlobalCharacterSet::instance();
107 }
108
109
110 class UTF8CharacterToExtendedASCII
111 {
112 public:
113 UTF8Character m_utf8;
114 char m_c;
115 UTF8CharacterToExtendedASCII() : m_c( '\0' ){
116 }
117 UTF8CharacterToExtendedASCII( const UTF8Character& utf8, char c ) : m_utf8( utf8 ), m_c( c ){
118 }
119 };
120
121 inline bool operator<( const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other ){
122         return self.m_utf8 < other.m_utf8;
123 }
124
/// \brief Maps \p c to a table index in [0, 127] by keeping its low seven bits.
inline std::size_t extended_ascii_to_index( char c ){
	const int lowSevenBits = c & 0x7F;
	return static_cast<std::size_t>( lowSevenBits );
}
128
/// \brief Maps table index \p i back to a character by setting bit 7 (0x80).
inline char extended_ascii_for_index( std::size_t i ){
	const std::size_t withHighBit = i | 0x80;
	return static_cast<char>( withHighBit );
}
132
133 /// \brief The active extended-ascii character set encoding.
134 /// Performs UTF-8 encoding and decoding of extended-ascii characters.
135 ///
136 /// Obtain the global instance with globalExtendedASCIICharacterSet().
137 class ExtendedASCIICharacterSet
138 {
139 typedef char UTF8CharBuffer[6];
140 UTF8CharBuffer m_converted[128];
141 UTF8Character m_decodeMap[128];
142 UTF8CharacterToExtendedASCII m_encodeMap[128];
143 public:
144 ExtendedASCIICharacterSet(){
145         if ( !globalCharacterSet().isUTF8() ) {
146                 GIConv descriptor = g_iconv_open( "UTF-8", globalCharacterSet().get() );
147                 for ( std::size_t i = 1; i < 128; ++i )
148                 {
149                         char c = extended_ascii_for_index( i );
150                         char* inbuf = &c;
151                         std::size_t inbytesleft = 1;
152                         char* outbuf = m_converted[i];
153                         std::size_t outbytesleft = 6;
154                         if ( g_iconv( descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft ) != (size_t)( -1 ) ) {
155                                 UTF8Character utf8( m_converted[i] );
156                                 m_decodeMap[i] = utf8;
157                                 m_encodeMap[i] = UTF8CharacterToExtendedASCII( utf8, c );
158                         }
159                 }
160                 g_iconv_close( descriptor );
161                 std::sort( m_encodeMap, m_encodeMap + 128 );
162         }
163 }
164 /// \brief Prints the (up to) 128 characters in the current extended-ascii character set.
165 /// Useful for debugging.
166 void print() const {
167         globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n";
168         for ( std::size_t i = 1; i < 128; ++i )
169         {
170                 if ( m_decodeMap[i].buffer != 0 ) {
171                         globalOutputStream() << extended_ascii_for_index( i ) << " = " << m_decodeMap[i] << "\n";
172                 }
173         }
174 }
175 /// \brief Returns \p c decoded from extended-ascii to UTF-8.
176 /// \p c must be an extended-ascii character.
177 const UTF8Character& decode( char c ) const {
178         ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" );
179         ASSERT_MESSAGE( !char_is_ascii( c ), "decode: ascii character" );
180         ASSERT_MESSAGE( m_decodeMap[extended_ascii_to_index( c )].buffer != 0, "decode: invalid character: " << HexChar( c ) );
181         return m_decodeMap[extended_ascii_to_index( c )];
182 }
183 /// \brief Returns \p c encoded to extended-ascii from UTF-8.
184 /// \p c must map to an extended-ascii character.
185 char encode( const UTF8Character& c ) const {
186         ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" );
187         ASSERT_MESSAGE( !char_is_ascii( *c.buffer ), "encode: ascii character" );
188         std::pair<const UTF8CharacterToExtendedASCII*, const UTF8CharacterToExtendedASCII*> range
189                 = std::equal_range( m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII( c, 0 ) );
190         ASSERT_MESSAGE( range.first != range.second, "encode: invalid character: " << c );
191         return ( *range.first ).m_c;
192 }
193 };
194
195 typedef LazyStatic<ExtendedASCIICharacterSet> GlobalExtendedASCIICharacterSet;
196
197 /// \brief Returns the global instance of ExtendedASCIICharacterSet.
198 inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet(){
199         return GlobalExtendedASCIICharacterSet::instance();
200 }
201
202 class ConvertUTF8ToLocale
203 {
204 public:
205 StringRange m_range;
206 ConvertUTF8ToLocale( const char* string ) : m_range( StringRange( string, string + strlen( string ) ) ){
207 }
208 ConvertUTF8ToLocale( const StringRange& range ) : m_range( range ){
209 }
210 };
211
212 /// \brief Writes \p convert to \p ostream after encoding each character to extended-ascii from UTF-8.
213 template<typename TextOutputStreamType>
214 inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert ){
215         if ( globalCharacterSet().isUTF8() ) {
216                 return ostream << convert.m_range;
217         }
218
219         for ( const char* p = convert.m_range.first; p != convert.m_range.last; )
220         {
221                 if ( !char_is_ascii( *p ) ) {
222                         UTF8Character c( p );
223                         ostream << globalExtendedASCIICharacterSet().encode( c );
224                         p += c.length;
225                 }
226                 else
227                 {
228                         ostream << *p++;
229                 }
230         }
231         return ostream;
232 }
233
234
235 class ConvertLocaleToUTF8
236 {
237 public:
238 StringRange m_range;
239 ConvertLocaleToUTF8( const char* string ) : m_range( StringRange( string, string + strlen( string ) ) ){
240 }
241 ConvertLocaleToUTF8( const StringRange& range ) : m_range( range ){
242 }
243 };
244
245 /// \brief Writes \p convert to \p ostream after decoding each character from extended-ascii to UTF-8.
246 template<typename TextOutputStreamType>
247 inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert ){
248         if ( globalCharacterSet().isUTF8() ) {
249                 return ostream << convert.m_range;
250         }
251
252         for ( const char* p = convert.m_range.first; p != convert.m_range.last; ++p )
253         {
254                 if ( !char_is_ascii( *p ) ) {
255                         UTF8Character c( globalExtendedASCIICharacterSet().decode( *p ) );
256                         ostream.write( c.buffer, c.length );
257                 }
258                 else
259                 {
260                         ostream << *p;
261                 }
262         }
263         return ostream;
264 }
265
266
267 #endif