2 Copyright (C) 2001-2006, William Joseph.
5 This file is part of GtkRadiant.
7 GtkRadiant is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or
10 (at your option) any later version.
12 GtkRadiant is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GtkRadiant; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #if !defined( INCLUDED_CONVERT_H )
23 #define INCLUDED_CONVERT_H
26 /// \brief Character encoding conversion.
28 #include "debugging/debugging.h"
30 #include <glib/gunicode.h>
31 #include <glib/gconvert.h>
33 #include "character.h"
35 /// \brief Returns the number of bytes required to represent \p character in UTF-8 encoding.
/// Only the first byte at \p character is inspected: its high-order bits are matched against
/// the multi-byte lead-byte patterns noted beside each test below.
/// A byte of the form 0xxxxxxx or 10xxxxxx matches none of the tests.
/// NOTE(review): the value returned by each branch, and the fallback taken when no pattern
/// matches, are not visible in this excerpt - confirm against the complete file.
36 inline std::size_t utf8_character_length( const char* character ){
37 if ( ( *character & 0xE0 ) == 0xC0 ) { // 110xxxxx
40 else if ( ( *character & 0xF0 ) == 0xE0 ) { // 1110xxxx
43 else if ( ( *character & 0xF8 ) == 0xF0 ) { // 11110xxx
46 else if ( ( *character & 0xFC ) == 0xF8 ) { // 111110xx
49 else if ( ( *character & 0xFE ) == 0xFC ) { // 1111110x
/// \brief Constructs an empty character: null buffer, zero length.
60 UTF8Character() : buffer( 0 ), length( 0 ){
/// \brief Constructs a character that refers to \p bytes.
/// The pointer itself is stored in buffer; length is derived from the first byte via
/// utf8_character_length().
/// NOTE(review): presumably non-owning, so the storage behind \p bytes must outlive this
/// object - confirm against the member declarations, which are not visible here.
62 UTF8Character( const char* bytes ) : buffer( bytes ), length( utf8_character_length( bytes ) ){
/// \brief Orders two UTF-8 characters by lexicographical comparison of their byte ranges
/// [buffer, buffer + length).
/// A character with zero length compares less than any character with a non-zero length.
66 inline bool operator<( const UTF8Character& self, const UTF8Character& other ){
67 return std::lexicographical_compare( self.buffer, self.buffer + self.length, other.buffer, other.buffer + other.length );
70 /// \brief Writes \p c to \p ostream in Hex form. Useful for debugging.
/// Each byte in [c.buffer, c.buffer + c.length) is streamed, in order, wrapped in HexChar.
/// Nothing is written for a character whose length is zero.
71 template<typename TextOutputStreamType>
72 inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const UTF8Character& c ){
73 for ( const char* p = c.buffer; p != c.buffer + c.length; ++p )
75 ostream << HexChar( *p );
82 /// \brief The character-set encoding for the current C locale.
84 /// Obtain the global instance with globalCharacterSet().
// Charset name pointer filled in by g_get_charset() below.
87 const char* m_charSet;
// g_get_charset() stores the locale charset name through its argument; per the GLib
// documentation it returns TRUE when that charset is UTF-8.
// NOTE(review): the body of this branch is not visible here - presumably it resets m_charSet
// to 0 so that the null test below reports a UTF-8 locale. Confirm against the complete file.
90 if ( g_get_charset( &m_charSet ) != FALSE ) {
// A null m_charSet is the marker being tested here (callers in this file use isUTF8()).
95 return m_charSet == 0;
// NOTE(review): body not visible - presumably returns m_charSet; its result is passed to
// g_iconv_open() as the source codeset elsewhere in this file.
97 const char* get() const {
// Holder type for the single shared CharacterSet.
// NOTE(review): LazyStatic is declared elsewhere; presumably it constructs the instance on
// first use - confirm.
102 typedef LazyStatic<CharacterSet> GlobalCharacterSet;
104 /// \brief Returns the global instance of CharacterSet.
/// The reference returned is whatever GlobalCharacterSet::instance() yields.
105 inline CharacterSet& globalCharacterSet(){
106 return GlobalCharacterSet::instance();
/// \brief Associates a UTF-8 character (m_utf8) with a single extended-ascii char (m_c).
/// Used as the element type of ExtendedASCIICharacterSet::m_encodeMap.
110 class UTF8CharacterToExtendedASCII
// The UTF-8 side of the pair; this is the only member compared by operator<.
113 UTF8Character m_utf8;
/// \brief Constructs an empty entry: default (empty) m_utf8 and m_c set to '\0'.
115 UTF8CharacterToExtendedASCII() : m_c( '\0' ){
/// \brief Constructs an entry mapping \p utf8 to the extended-ascii char \p c.
117 UTF8CharacterToExtendedASCII( const UTF8Character& utf8, char c ) : m_utf8( utf8 ), m_c( c ){
/// \brief Orders entries by their UTF-8 character only; m_c takes no part in the comparison.
/// This is what lets a lookup probe carry an arbitrary m_c value.
121 inline bool operator<( const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other ){
122 return self.m_utf8 < other.m_utf8;
/// \brief Converts the extended-ascii char \p c to a table index by keeping only its low
/// seven bits, giving a result in the range [0, 127].
125 inline std::size_t extended_ascii_to_index( char c ){
126 return static_cast<std::size_t>( c & 0x7F );
/// \brief Converts the table index \p i back to a char by setting bit 7 (i | 0x80).
/// For \p i in [0, 127] this is the inverse of extended_ascii_to_index().
129 inline char extended_ascii_for_index( std::size_t i ){
130 return static_cast<char>( i | 0x80 );
133 /// \brief The active extended-ascii character set encoding.
134 /// Performs UTF-8 encoding and decoding of extended-ascii characters.
136 /// Obtain the global instance with globalExtendedASCIICharacterSet().
137 class ExtendedASCIICharacterSet
// Fixed six-byte output buffer for one converted character (matches outbytesleft below).
139 typedef char UTF8CharBuffer[6];
// Backing storage for the converted bytes; m_decodeMap / m_encodeMap entries point into it.
140 UTF8CharBuffer m_converted[128];
// Indexed by extended_ascii_to_index( c ). An entry whose buffer is 0 means "no mapping".
141 UTF8Character m_decodeMap[128];
// Sorted by UTF-8 bytes in the constructor so that encode() can use std::equal_range.
142 UTF8CharacterToExtendedASCII m_encodeMap[128];
/// \brief Builds the decode and encode tables by converting each extended-ascii char from
/// the locale charset to UTF-8. The conversion is only attempted when the locale is not UTF-8.
144 ExtendedASCIICharacterSet(){
145 if ( !globalCharacterSet().isUTF8() ) {
// g_iconv_open( to_codeset, from_codeset ): convert from the locale charset to UTF-8.
// NOTE(review): no check of the descriptor against (GIConv)-1 is visible in this excerpt -
// confirm whether an open failure is handled.
146 GIConv descriptor = g_iconv_open( "UTF-8", globalCharacterSet().get() );
// NOTE(review): the loop starts at 1, so index 0 (the char 0x80) is never converted and its
// decode entry stays empty - confirm this is intentional.
147 for ( std::size_t i = 1; i < 128; ++i )
149 char c = extended_ascii_for_index( i );
// NOTE(review): the declaration of inbuf is on a line not shown here; presumably it points
// at c, since exactly one input byte is offered - confirm.
151 std::size_t inbytesleft = 1;
152 char* outbuf = m_converted[i];
153 std::size_t outbytesleft = 6;
// Only a conversion that did not return (size_t)-1 populates the tables for this index.
154 if ( g_iconv( descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft ) != (size_t)( -1 ) ) {
// The length is derived from the first converted byte by utf8_character_length(), not
// from the number of bytes g_iconv reports having written.
155 UTF8Character utf8( m_converted[i] );
156 m_decodeMap[i] = utf8;
157 m_encodeMap[i] = UTF8CharacterToExtendedASCII( utf8, c );
160 g_iconv_close( descriptor );
// Sorts with operator< on the UTF-8 bytes. Entries never assigned above are
// default-constructed (zero length) and therefore order before every populated entry.
161 std::sort( m_encodeMap, m_encodeMap + 128 );
164 /// \brief Prints the (up to) 128 characters in the current extended-ascii character set.
165 /// Useful for debugging.
// NOTE(review): the signature of this member is on a line not shown in this excerpt.
167 globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n";
168 for ( std::size_t i = 1; i < 128; ++i )
// Skip indices for which no conversion was recorded.
170 if ( m_decodeMap[i].buffer != 0 ) {
171 globalOutputStream() << extended_ascii_for_index( i ) << " = " << m_decodeMap[i] << "\n";
175 /// \brief Returns \p c decoded from extended-ascii to UTF-8.
176 /// \p c must be an extended-ascii character.
/// The three preconditions are checked with ASSERT_MESSAGE: the locale is not UTF-8, \p c is
/// not ascii, and a mapping exists for \p c. The reference returned refers to an element of
/// m_decodeMap.
177 const UTF8Character& decode( char c ) const {
178 ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" );
179 ASSERT_MESSAGE( !char_is_ascii( c ), "decode: ascii character" );
180 ASSERT_MESSAGE( m_decodeMap[extended_ascii_to_index( c )].buffer != 0, "decode: invalid character: " << HexChar( c ) );
181 return m_decodeMap[extended_ascii_to_index( c )];
183 /// \brief Returns \p c encoded to extended-ascii from UTF-8.
184 /// \p c must map to an extended-ascii character.
/// Looks \p c up in the sorted m_encodeMap and returns the m_c of the first matching entry.
185 char encode( const UTF8Character& c ) const {
186 ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" );
187 ASSERT_MESSAGE( !char_is_ascii( *c.buffer ), "encode: ascii character" );
// The probe's m_c (0) is irrelevant: entries are ordered by their UTF-8 bytes only.
188 std::pair<const UTF8CharacterToExtendedASCII*, const UTF8CharacterToExtendedASCII*> range
189 = std::equal_range( m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII( c, 0 ) );
// An empty range means no table entry has the same bytes as c.
190 ASSERT_MESSAGE( range.first != range.second, "encode: invalid character: " << c );
191 return ( *range.first ).m_c;
// Holder type for the single shared ExtendedASCIICharacterSet.
// NOTE(review): LazyStatic is declared elsewhere; presumably the conversion tables are built
// on first access rather than at program start - confirm.
195 typedef LazyStatic<ExtendedASCIICharacterSet> GlobalExtendedASCIICharacterSet;
197 /// \brief Returns the global instance of ExtendedASCIICharacterSet.
/// The reference returned is whatever GlobalExtendedASCIICharacterSet::instance() yields.
198 inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet(){
199 return GlobalExtendedASCIICharacterSet::instance();
/// \brief Wraps a range of UTF-8 text that is to be written in the locale's encoding.
/// The conversion itself is performed by the ostream_write overload taking this type.
202 class ConvertUTF8ToLocale
/// \brief Wraps the C string \p string; its extent is found with strlen, so it must be
/// non-null and null-terminated.
206 ConvertUTF8ToLocale( const char* string ) : m_range( StringRange( string, string + strlen( string ) ) ){
/// \brief Wraps an explicit character range.
208 ConvertUTF8ToLocale( const StringRange& range ) : m_range( range ){
212 /// \brief Writes \p convert to \p ostream after encoding each character to extended-ascii from UTF-8.
/// When the locale charset is already UTF-8 the range is written through unchanged.
213 template<typename TextOutputStreamType>
214 inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert ){
215 if ( globalCharacterSet().isUTF8() ) {
216 return ostream << convert.m_range;
// The loop header has no increment, so p must be advanced inside the body.
// NOTE(review): the advancing statements and the ascii branch are on lines not shown in
// this excerpt - presumably p moves by the character's byte length. Confirm.
219 for ( const char* p = convert.m_range.first; p != convert.m_range.last; )
221 if ( !char_is_ascii( *p ) ) {
// Interpret the bytes at p as one UTF-8 character and emit its extended-ascii equivalent.
222 UTF8Character c( p );
223 ostream << globalExtendedASCIICharacterSet().encode( c );
/// \brief Wraps a range of locale-encoded text that is to be written as UTF-8.
/// The conversion itself is performed by the ostream_write overload taking this type.
235 class ConvertLocaleToUTF8
/// \brief Wraps the C string \p string; its extent is found with strlen, so it must be
/// non-null and null-terminated.
239 ConvertLocaleToUTF8( const char* string ) : m_range( StringRange( string, string + strlen( string ) ) ){
/// \brief Wraps an explicit character range.
241 ConvertLocaleToUTF8( const StringRange& range ) : m_range( range ){
245 /// \brief Writes \p convert to \p ostream after decoding each character from extended-ascii to UTF-8.
/// When the locale charset is already UTF-8 the range is written through unchanged.
246 template<typename TextOutputStreamType>
247 inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert ){
248 if ( globalCharacterSet().isUTF8() ) {
249 return ostream << convert.m_range;
// The input is consumed one byte at a time.
252 for ( const char* p = convert.m_range.first; p != convert.m_range.last; ++p )
254 if ( !char_is_ascii( *p ) ) {
// Copy the table entry for this byte and write its UTF-8 bytes (c.length of them).
// NOTE(review): the handling of ascii bytes and the function's return statement are on
// lines not shown in this excerpt.
255 UTF8Character c( globalExtendedASCIICharacterSet().decode( *p ) );
256 ostream.write( c.buffer, c.length );