2 Copyright (C) 2001-2006, William Joseph.
5 This file is part of GtkRadiant.
7 GtkRadiant is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2 of the License, or
10 (at your option) any later version.
12 GtkRadiant is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GtkRadiant; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #if !defined( INCLUDED_CONVERT_H )
23 #define INCLUDED_CONVERT_H
26 /// \brief Character encoding conversion.
28 #include "debugging/debugging.h"
32 #include "character.h"
34 /// \brief Returns the number of bytes required to represent \p character in UTF-8 encoding.
/// \param character Pointer to the first byte of an encoded character; it is dereferenced, so it must not be null.
/// Each test below masks the first byte and compares it against the multi-byte prefix pattern shown in its trailing comment.
/// NOTE(review): the value returned by each branch and the fall-through case are not visible in this extract.
35 inline std::size_t utf8_character_length( const char* character ){
36 if ( ( *character & 0xE0 ) == 0xC0 ) { // 110xxxxx
39 else if ( ( *character & 0xF0 ) == 0xE0 ) { // 1110xxxx
42 else if ( ( *character & 0xF8 ) == 0xF0 ) { // 11110xxx
45 else if ( ( *character & 0xFC ) == 0xF8 ) { // 111110xx
48 else if ( ( *character & 0xFE ) == 0xFC ) { // 1111110x
/// \brief Constructs an empty character: null buffer and zero length.
59 UTF8Character() : buffer( 0 ), length( 0 ){
/// \brief Refers to the encoded character starting at \p bytes.
/// Stores the pointer \p bytes as the buffer and takes the length from utf8_character_length( bytes ),
/// which dereferences \p bytes, so it must not be null.
61 UTF8Character( const char* bytes ) : buffer( bytes ), length( utf8_character_length( bytes ) ){
/// \brief Orders two UTF8Characters by lexicographical comparison of their byte ranges [buffer, buffer + length).
65 inline bool operator<( const UTF8Character& self, const UTF8Character& other ){
66 return std::lexicographical_compare( self.buffer, self.buffer + self.length, other.buffer, other.buffer + other.length );
69 /// \brief Writes \p c to \p ostream in Hex form. Useful for debugging.
/// Every byte in [c.buffer, c.buffer + c.length) is streamed through HexChar, in order.
70 template<typename TextOutputStreamType>
71 inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const UTF8Character& c ){
72 for ( const char* p = c.buffer; p != c.buffer + c.length; ++p )
74 ostream << HexChar( *p );
81 /// \brief The character-set encoding for the current C locale.
83 /// Obtain the global instance with globalCharacterSet().
// Character-set name pointer; its address is handed to g_get_charset() below to be filled in.
86 const char* m_charSet;
// NOTE(review): the body of this branch is not visible in this extract. The comparison against 0
// further down suggests the pointer is cleared here when g_get_charset() reports UTF-8 — confirm.
89 if ( g_get_charset( &m_charSet ) != FALSE ) {
// True when m_charSet is null. Presumably this is the body of isUTF8(), which other code in this
// file calls on the global instance — the method header is not visible here.
94 return m_charSet == 0;
/// Accessor with a const char* result; presumably returns m_charSet (body not visible here).
96 const char* get() const {
/// Holder type for the single shared CharacterSet, accessed through its static instance() function.
/// LazyStatic is declared elsewhere; presumably it constructs the object on first use — confirm.
101 typedef LazyStatic<CharacterSet> GlobalCharacterSet;
103 /// \brief Returns the global instance of CharacterSet.
/// Forwards to GlobalCharacterSet::instance() and returns the result by reference.
104 inline CharacterSet& globalCharacterSet(){
105 return GlobalCharacterSet::instance();
/// \brief Pairs a UTF-8 character (m_utf8) with the single char it maps to (m_c).
109 class UTF8CharacterToExtendedASCII
// The UTF-8 side of the pair. The declaration of m_c is not visible in this extract.
112 UTF8Character m_utf8;
/// Default entry: m_c is '\0' and m_utf8 is default-constructed.
114 UTF8CharacterToExtendedASCII() : m_c( '\0' ){
/// Entry mapping \p utf8 to \p c.
116 UTF8CharacterToExtendedASCII( const UTF8Character& utf8, char c ) : m_utf8( utf8 ), m_c( c ){
/// \brief Orders entries by their UTF-8 character only; m_c takes no part in the comparison.
120 inline bool operator<( const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other ){
121 return self.m_utf8 < other.m_utf8;
/// \brief Converts \p c to a table index by keeping only its low seven bits (c & 0x7F), giving a value in 0..127.
124 inline std::size_t extended_ascii_to_index( char c ){
125 return static_cast<std::size_t>( c & 0x7F );
/// \brief Converts table index \p i to a character by setting bit 0x80 and narrowing to char.
/// For indices 0..127 this undoes extended_ascii_to_index().
128 inline char extended_ascii_for_index( std::size_t i ){
129 return static_cast<char>( i | 0x80 );
132 /// \brief The active extended-ascii character set encoding.
133 /// Performs UTF-8 encoding and decoding of extended-ascii characters.
135 /// Obtain the global instance with globalExtendedASCIICharacterSet().
136 class ExtendedASCIICharacterSet
// Per-entry output storage for g_iconv: 128 slots of 6 bytes each.
138 typedef char UTF8CharBuffer[6];
139 UTF8CharBuffer m_converted[128];
// Index -> UTF-8 character. An entry keeps its default null buffer unless its conversion succeeded.
140 UTF8Character m_decodeMap[128];
// UTF-8 character -> extended-ascii char. Sorted in the constructor so encode() can search it with equal_range.
141 UTF8CharacterToExtendedASCII m_encodeMap[128];
/// Fills both maps by converting extended-ascii characters to UTF-8 with g_iconv.
/// Nothing is converted when the locale character set is already UTF-8.
143 ExtendedASCIICharacterSet(){
144 if ( !globalCharacterSet().isUTF8() ) {
// NOTE(review): no validity check of the descriptor returned by g_iconv_open is visible before it is used.
145 GIConv descriptor = g_iconv_open( "UTF-8", globalCharacterSet().get() );
// NOTE(review): iteration starts at 1, so index 0 (the character extended_ascii_for_index( 0 ), i.e. 0x80)
// is never converted — confirm this is intended.
146 for ( std::size_t i = 1; i < 128; ++i )
148 char c = extended_ascii_for_index( i );
// One input byte; output goes to this entry's 6-byte slot. The declaration of inbuf is not visible in this extract.
150 std::size_t inbytesleft = 1;
151 char* outbuf = m_converted[i];
152 std::size_t outbytesleft = 6;
// Record the entry only when g_iconv does not return (size_t)-1.
153 if ( g_iconv( descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft ) != (size_t)( -1 ) ) {
// The UTF8Character points into m_converted[i]; its length is derived from the first converted byte.
154 UTF8Character utf8( m_converted[i] );
155 m_decodeMap[i] = utf8;
156 m_encodeMap[i] = UTF8CharacterToExtendedASCII( utf8, c );
159 g_iconv_close( descriptor );
// Order by UTF-8 character (operator< on the entries) for the equal_range lookup in encode().
160 std::sort( m_encodeMap, m_encodeMap + 128 );
163 /// \brief Prints the (up to) 128 characters in the current extended-ascii character set.
164 /// Useful for debugging.
// NOTE(review): the header of this member function is not visible in this extract.
166 globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n";
// Same 1..127 index range as the constructor; entries whose buffer is null are skipped.
167 for ( std::size_t i = 1; i < 128; ++i )
169 if ( m_decodeMap[i].buffer != 0 ) {
170 globalOutputStream() << extended_ascii_for_index( i ) << " = " << m_decodeMap[i] << "\n";
174 /// \brief Returns \p c decoded from extended-ascii to UTF-8.
175 /// \p c must be an extended-ascii character.
/// The result is a reference to the entry stored in m_decodeMap.
/// The preconditions (non-UTF-8 locale, non-ascii \p c, entry with a non-null buffer) are checked with ASSERT_MESSAGE only.
176 const UTF8Character& decode( char c ) const {
177 ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" );
178 ASSERT_MESSAGE( !char_is_ascii( c ), "decode: ascii character" );
179 ASSERT_MESSAGE( m_decodeMap[extended_ascii_to_index( c )].buffer != 0, "decode: invalid character: " << HexChar( c ) );
180 return m_decodeMap[extended_ascii_to_index( c )];
182 /// \brief Returns \p c encoded to extended-ascii from UTF-8.
183 /// \p c must map to an extended-ascii character.
/// The preconditions (non-UTF-8 locale, non-ascii first byte, a matching entry) are checked with ASSERT_MESSAGE only.
184 char encode( const UTF8Character& c ) const {
185 ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" );
186 ASSERT_MESSAGE( !char_is_ascii( *c.buffer ), "encode: ascii character" );
// Search the sorted map with a probe entry; the probe's char (0) is irrelevant because ordering uses only the UTF-8 character.
187 std::pair<const UTF8CharacterToExtendedASCII*, const UTF8CharacterToExtendedASCII*> range
188 = std::equal_range( m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII( c, 0 ) );
// An empty range means no entry compares equivalent to c.
189 ASSERT_MESSAGE( range.first != range.second, "encode: invalid character: " << c );
190 return ( *range.first ).m_c;
/// Holder type for the single shared ExtendedASCIICharacterSet, accessed through its static instance() function.
/// LazyStatic is declared elsewhere; presumably it constructs the object on first use — confirm.
194 typedef LazyStatic<ExtendedASCIICharacterSet> GlobalExtendedASCIICharacterSet;
196 /// \brief Returns the global instance of ExtendedASCIICharacterSet.
/// Forwards to GlobalExtendedASCIICharacterSet::instance() and returns the result by reference.
197 inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet(){
198 return GlobalExtendedASCIICharacterSet::instance();
/// \brief Wraps a character range so that the matching ostream_write overload can stream it with UTF-8 to extended-ascii encoding applied.
/// The range is held in m_range, whose declaration is not visible in this extract.
201 class ConvertUTF8ToLocale
/// Wraps the C string \p string: the range runs from \p string to \p string + strlen( string ).
205 ConvertUTF8ToLocale( const char* string ) : m_range( StringRange( string, string + strlen( string ) ) ){
/// Wraps an existing \p range.
207 ConvertUTF8ToLocale( const StringRange& range ) : m_range( range ){
211 /// \brief Writes \p convert to \p ostream after encoding each character to extended-ascii from UTF-8.
212 template<typename TextOutputStreamType>
213 inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert ){
// When the locale character set is UTF-8 the range is written through unchanged.
214 if ( globalCharacterSet().isUTF8() ) {
215 return ostream << convert.m_range;
// The loop header has no increment expression.
// NOTE(review): the statements that advance p, and the handling of ascii bytes, are not visible in this extract.
218 for ( const char* p = convert.m_range.first; p != convert.m_range.last; )
// A non-ascii byte is taken as the start of a UTF-8 character, which is encoded to a single char via the global table.
220 if ( !char_is_ascii( *p ) ) {
221 UTF8Character c( p );
222 ostream << globalExtendedASCIICharacterSet().encode( c );
/// \brief Wraps a character range so that the matching ostream_write overload can stream it with extended-ascii to UTF-8 decoding applied.
/// The range is held in m_range, whose declaration is not visible in this extract.
234 class ConvertLocaleToUTF8
/// Wraps the C string \p string: the range runs from \p string to \p string + strlen( string ).
238 ConvertLocaleToUTF8( const char* string ) : m_range( StringRange( string, string + strlen( string ) ) ){
/// Wraps an existing \p range.
240 ConvertLocaleToUTF8( const StringRange& range ) : m_range( range ){
244 /// \brief Writes \p convert to \p ostream after decoding each character from extended-ascii to UTF-8.
245 template<typename TextOutputStreamType>
246 inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert ){
// When the locale character set is UTF-8 the range is written through unchanged.
247 if ( globalCharacterSet().isUTF8() ) {
248 return ostream << convert.m_range;
// The input is walked one byte at a time.
251 for ( const char* p = convert.m_range.first; p != convert.m_range.last; ++p )
// A non-ascii byte is looked up in the global decode table and its c.length UTF-8 bytes are written out.
// NOTE(review): the handling of ascii bytes is not visible in this extract.
253 if ( !char_is_ascii( *p ) ) {
254 UTF8Character c( globalExtendedASCIICharacterSet().decode( *p ) );
255 ostream.write( c.buffer, c.length );