Fix MSYS2 issues
[xonotic/netradiant.git] / libs / convert.h
1 /*
2    Copyright (C) 2001-2006, William Joseph.
3    All Rights Reserved.
4
5    This file is part of GtkRadiant.
6
7    GtkRadiant is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 2 of the License, or
10    (at your option) any later version.
11
12    GtkRadiant is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16
17    You should have received a copy of the GNU General Public License
18    along with GtkRadiant; if not, write to the Free Software
19    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20  */
21
22 #if !defined( INCLUDED_CONVERT_H )
23 #define INCLUDED_CONVERT_H
24
25 /// \file
26 /// \brief Character encoding conversion.
27
28 #include "debugging/debugging.h"
29 #include <algorithm>
30 #include <glib.h>
31
32 #include "character.h"
33
34 /// \brief Returns the number of bytes required to represent \p character in UTF-8 encoding.
35 inline std::size_t utf8_character_length( const char* character ){
36         if ( ( *character & 0xE0 ) == 0xC0 ) { // 110xxxxx
37                 return 2;
38         }
39         else if ( ( *character & 0xF0 ) == 0xE0 ) { // 1110xxxx
40                 return 3;
41         }
42         else if ( ( *character & 0xF8 ) == 0xF0 ) { // 11110xxx
43                 return 4;
44         }
45         else if ( ( *character & 0xFC ) == 0xF8 ) { // 111110xx
46                 return 5;
47         }
48         else if ( ( *character & 0xFE ) == 0xFC ) { // 1111110x
49                 return 6;
50         }
51         ERROR_MESSAGE( "" );
52         return 0;
53 }
54
55 struct UTF8Character
56 {
57         const char* buffer;
58         std::size_t length;
59         UTF8Character() : buffer( 0 ), length( 0 ){
60         }
61         UTF8Character( const char* bytes ) : buffer( bytes ), length( utf8_character_length( bytes ) ){
62         }
63 };
64
65 inline bool operator<( const UTF8Character& self, const UTF8Character& other ){
66         return std::lexicographical_compare( self.buffer, self.buffer + self.length, other.buffer, other.buffer + other.length );
67 }
68
69 /// \brief Writes \p c to \p ostream in Hex form. Useful for debugging.
70 template<typename TextOutputStreamType>
71 inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const UTF8Character& c ){
72         for ( const char* p = c.buffer; p != c.buffer + c.length; ++p )
73         {
74                 ostream << HexChar( *p );
75         }
76         return ostream;
77 }
78
79
80
81 /// \brief The character-set encoding for the current C locale.
82 ///
83 /// Obtain the global instance with globalCharacterSet().
84 class CharacterSet
85 {
86 const char* m_charSet;
87 public:
88 CharacterSet(){
89         if ( g_get_charset( &m_charSet ) != FALSE ) {
90                 m_charSet = 0;
91         }
92 }
93 bool isUTF8() const {
94         return m_charSet == 0;
95 }
96 const char* get() const {
97         return m_charSet;
98 }
99 };
100
101 typedef LazyStatic<CharacterSet> GlobalCharacterSet;
102
103 /// \brief Returns the global instance of CharacterSet.
104 inline CharacterSet& globalCharacterSet(){
105         return GlobalCharacterSet::instance();
106 }
107
108
109 class UTF8CharacterToExtendedASCII
110 {
111 public:
112 UTF8Character m_utf8;
113 char m_c;
114 UTF8CharacterToExtendedASCII() : m_c( '\0' ){
115 }
116 UTF8CharacterToExtendedASCII( const UTF8Character& utf8, char c ) : m_utf8( utf8 ), m_c( c ){
117 }
118 };
119
120 inline bool operator<( const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other ){
121         return self.m_utf8 < other.m_utf8;
122 }
123
/// \brief Maps \p c to a table index in [0, 127] by keeping only its low seven bits.
inline std::size_t extended_ascii_to_index( char c ){
        const int lowSevenBits = c & 0x7F;
        return static_cast<std::size_t>( lowSevenBits );
}
127
/// \brief Returns the byte for table index \p i: the index with the high bit (0x80) set.
inline char extended_ascii_for_index( std::size_t i ){
        const std::size_t withHighBit = i | 0x80;
        return static_cast<char>( withHighBit );
}
131
132 /// \brief The active extended-ascii character set encoding.
133 /// Performs UTF-8 encoding and decoding of extended-ascii characters.
134 ///
135 /// Obtain the global instance with globalExtendedASCIICharacterSet().
136 class ExtendedASCIICharacterSet
137 {
138 typedef char UTF8CharBuffer[6];
139 UTF8CharBuffer m_converted[128];
140 UTF8Character m_decodeMap[128];
141 UTF8CharacterToExtendedASCII m_encodeMap[128];
142 public:
143 ExtendedASCIICharacterSet(){
144         if ( !globalCharacterSet().isUTF8() ) {
145                 GIConv descriptor = g_iconv_open( "UTF-8", globalCharacterSet().get() );
146                 for ( std::size_t i = 1; i < 128; ++i )
147                 {
148                         char c = extended_ascii_for_index( i );
149                         char* inbuf = &c;
150                         std::size_t inbytesleft = 1;
151                         char* outbuf = m_converted[i];
152                         std::size_t outbytesleft = 6;
153                         if ( g_iconv( descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft ) != (size_t)( -1 ) ) {
154                                 UTF8Character utf8( m_converted[i] );
155                                 m_decodeMap[i] = utf8;
156                                 m_encodeMap[i] = UTF8CharacterToExtendedASCII( utf8, c );
157                         }
158                 }
159                 g_iconv_close( descriptor );
160                 std::sort( m_encodeMap, m_encodeMap + 128 );
161         }
162 }
163 /// \brief Prints the (up to) 128 characters in the current extended-ascii character set.
164 /// Useful for debugging.
165 void print() const {
166         globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n";
167         for ( std::size_t i = 1; i < 128; ++i )
168         {
169                 if ( m_decodeMap[i].buffer != 0 ) {
170                         globalOutputStream() << extended_ascii_for_index( i ) << " = " << m_decodeMap[i] << "\n";
171                 }
172         }
173 }
174 /// \brief Returns \p c decoded from extended-ascii to UTF-8.
175 /// \p c must be an extended-ascii character.
176 const UTF8Character& decode( char c ) const {
177         ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" );
178         ASSERT_MESSAGE( !char_is_ascii( c ), "decode: ascii character" );
179         ASSERT_MESSAGE( m_decodeMap[extended_ascii_to_index( c )].buffer != 0, "decode: invalid character: " << HexChar( c ) );
180         return m_decodeMap[extended_ascii_to_index( c )];
181 }
182 /// \brief Returns \p c encoded to extended-ascii from UTF-8.
183 /// \p c must map to an extended-ascii character.
184 char encode( const UTF8Character& c ) const {
185         ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" );
186         ASSERT_MESSAGE( !char_is_ascii( *c.buffer ), "encode: ascii character" );
187         std::pair<const UTF8CharacterToExtendedASCII*, const UTF8CharacterToExtendedASCII*> range
188                 = std::equal_range( m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII( c, 0 ) );
189         ASSERT_MESSAGE( range.first != range.second, "encode: invalid character: " << c );
190         return ( *range.first ).m_c;
191 }
192 };
193
194 typedef LazyStatic<ExtendedASCIICharacterSet> GlobalExtendedASCIICharacterSet;
195
196 /// \brief Returns the global instance of ExtendedASCIICharacterSet.
197 inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet(){
198         return GlobalExtendedASCIICharacterSet::instance();
199 }
200
201 class ConvertUTF8ToLocale
202 {
203 public:
204 StringRange m_range;
205 ConvertUTF8ToLocale( const char* string ) : m_range( StringRange( string, string + strlen( string ) ) ){
206 }
207 ConvertUTF8ToLocale( const StringRange& range ) : m_range( range ){
208 }
209 };
210
211 /// \brief Writes \p convert to \p ostream after encoding each character to extended-ascii from UTF-8.
212 template<typename TextOutputStreamType>
213 inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert ){
214         if ( globalCharacterSet().isUTF8() ) {
215                 return ostream << convert.m_range;
216         }
217
218         for ( const char* p = convert.m_range.first; p != convert.m_range.last; )
219         {
220                 if ( !char_is_ascii( *p ) ) {
221                         UTF8Character c( p );
222                         ostream << globalExtendedASCIICharacterSet().encode( c );
223                         p += c.length;
224                 }
225                 else
226                 {
227                         ostream << *p++;
228                 }
229         }
230         return ostream;
231 }
232
233
234 class ConvertLocaleToUTF8
235 {
236 public:
237 StringRange m_range;
238 ConvertLocaleToUTF8( const char* string ) : m_range( StringRange( string, string + strlen( string ) ) ){
239 }
240 ConvertLocaleToUTF8( const StringRange& range ) : m_range( range ){
241 }
242 };
243
244 /// \brief Writes \p convert to \p ostream after decoding each character from extended-ascii to UTF-8.
245 template<typename TextOutputStreamType>
246 inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert ){
247         if ( globalCharacterSet().isUTF8() ) {
248                 return ostream << convert.m_range;
249         }
250
251         for ( const char* p = convert.m_range.first; p != convert.m_range.last; ++p )
252         {
253                 if ( !char_is_ascii( *p ) ) {
254                         UTF8Character c( globalExtendedASCIICharacterSet().decode( *p ) );
255                         ostream.write( c.buffer, c.length );
256                 }
257                 else
258                 {
259                         ostream << *p;
260                 }
261         }
262         return ostream;
263 }
264
265
266 #endif