+++ /dev/null
-/*
-Copyright (C) 2001-2006, William Joseph.
-All Rights Reserved.
-
-This file is part of GtkRadiant.
-
-GtkRadiant is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-GtkRadiant is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with GtkRadiant; if not, write to the Free Software
-Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-*/
-
-#if !defined(INCLUDED_CONVERT_H)
-#define INCLUDED_CONVERT_H
-
-/// \file
-/// \brief Character encoding conversion.
-
-#include "debugging/debugging.h"
-#include <algorithm>
-#include <glib/gunicode.h>
-#include <glib/gconvert.h>
-
-#include "character.h"
-
-/// \brief Returns the number of bytes required to represent \p character in UTF-8 encoding.
-inline std::size_t utf8_character_length(const char* character)
-{
- if((*character & 0xE0) == 0xC0) // 110xxxxx
- {
- return 2;
- }
- else if((*character & 0xF0) == 0xE0) // 1110xxxx
- {
- return 3;
- }
- else if((*character & 0xF8) == 0xF0) // 11110xxx
- {
- return 4;
- }
- else if((*character & 0xFC) == 0xF8) // 111110xx
- {
- return 5;
- }
- else if((*character & 0xFE) == 0xFC) // 1111110x
- {
- return 6;
- }
- ERROR_MESSAGE("");
- return 0;
-}
-
-struct UTF8Character
-{
- const char* buffer;
- std::size_t length;
- UTF8Character() : buffer(0), length(0)
- {
- }
- UTF8Character(const char* bytes) : buffer(bytes), length(utf8_character_length(bytes))
- {
- }
-};
-
-inline bool operator<(const UTF8Character& self, const UTF8Character& other)
-{
- return std::lexicographical_compare(self.buffer, self.buffer + self.length, other.buffer, other.buffer + other.length);
-}
-
-/// \brief Writes \p c to \p ostream in Hex form. Useful for debugging.
-template<typename TextOutputStreamType>
-inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const UTF8Character& c)
-{
- for(const char* p = c.buffer; p != c.buffer + c.length; ++p)
- {
- ostream << HexChar(*p);
- }
- return ostream;
-}
-
-
-
-/// \brief The character-set encoding for the current C locale.
-///
-/// Obtain the global instance with globalCharacterSet().
-class CharacterSet
-{
- const char* m_charSet;
-public:
- CharacterSet()
- {
- if(g_get_charset(&m_charSet) != FALSE)
- {
- m_charSet = 0;
- }
- }
- bool isUTF8() const
- {
- return m_charSet == 0;
- }
- const char* get() const
- {
- return m_charSet;
- }
-};
-
-typedef LazyStatic<CharacterSet> GlobalCharacterSet;
-
-/// \brief Returns the global instance of CharacterSet.
-inline CharacterSet& globalCharacterSet()
-{
- return GlobalCharacterSet::instance();
-}
-
-
-class UTF8CharacterToExtendedASCII
-{
-public:
- UTF8Character m_utf8;
- char m_c;
- UTF8CharacterToExtendedASCII() : m_c('\0')
- {
- }
- UTF8CharacterToExtendedASCII(const UTF8Character& utf8, char c) : m_utf8(utf8), m_c(c)
- {
- }
-};
-
-inline bool operator<(const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other)
-{
- return self.m_utf8 < other.m_utf8;
-}
-
-inline std::size_t extended_ascii_to_index(char c)
-{
- return static_cast<std::size_t>(c & 0x7F);
-}
-
-inline char extended_ascii_for_index(std::size_t i)
-{
- return static_cast<char>(i | 0x80);
-}
-
-/// \brief The active extended-ascii character set encoding.
-/// Performs UTF-8 encoding and decoding of extended-ascii characters.
-///
-/// Obtain the global instance with globalExtendedASCIICharacterSet().
-class ExtendedASCIICharacterSet
-{
- typedef char UTF8CharBuffer[6];
- UTF8CharBuffer m_converted[128];
- UTF8Character m_decodeMap[128];
- UTF8CharacterToExtendedASCII m_encodeMap[128];
-public:
- ExtendedASCIICharacterSet()
- {
- if(!globalCharacterSet().isUTF8())
- {
- GIConv descriptor = g_iconv_open("UTF-8", globalCharacterSet().get());
- for(std::size_t i = 1; i < 128; ++i)
- {
- char c = extended_ascii_for_index(i);
- char* inbuf = &c;
- std::size_t inbytesleft = 1;
- char* outbuf = m_converted[i];
- std::size_t outbytesleft = 6;
- if(g_iconv(descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft) != (size_t)(-1))
- {
- UTF8Character utf8(m_converted[i]);
- m_decodeMap[i] = utf8;
- m_encodeMap[i] = UTF8CharacterToExtendedASCII(utf8, c);
- }
- }
- g_iconv_close(descriptor);
- std::sort(m_encodeMap, m_encodeMap + 128);
- }
- }
- /// \brief Prints the (up to) 128 characters in the current extended-ascii character set.
- /// Useful for debugging.
- void print() const
- {
- globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n";
- for(std::size_t i = 1; i < 128; ++i)
- {
- if(m_decodeMap[i].buffer != 0)
- {
- globalOutputStream() << extended_ascii_for_index(i) << " = " << m_decodeMap[i] << "\n";
- }
- }
- }
- /// \brief Returns \p c decoded from extended-ascii to UTF-8.
- /// \p c must be an extended-ascii character.
- const UTF8Character& decode(char c) const
- {
- ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");
- ASSERT_MESSAGE(!char_is_ascii(c), "decode: ascii character");
- ASSERT_MESSAGE(m_decodeMap[extended_ascii_to_index(c)].buffer != 0, "decode: invalid character: " << HexChar(c));
- return m_decodeMap[extended_ascii_to_index(c)];
- }
- /// \brief Returns \p c encoded to extended-ascii from UTF-8.
- /// \p c must map to an extended-ascii character.
- char encode(const UTF8Character& c) const
- {
- ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");
- ASSERT_MESSAGE(!char_is_ascii(*c.buffer), "encode: ascii character");
- std::pair<const UTF8CharacterToExtendedASCII*, const UTF8CharacterToExtendedASCII*> range
- = std::equal_range(m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII(c, 0));
- ASSERT_MESSAGE(range.first != range.second, "encode: invalid character: " << c);
- return (*range.first).m_c;
- }
-};
-
-typedef LazyStatic<ExtendedASCIICharacterSet> GlobalExtendedASCIICharacterSet;
-
-/// \brief Returns the global instance of ExtendedASCIICharacterSet.
-inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet()
-{
- return GlobalExtendedASCIICharacterSet::instance();
-}
-
-class ConvertUTF8ToLocale
-{
-public:
- StringRange m_range;
- ConvertUTF8ToLocale(const char* string) : m_range(StringRange(string, string + strlen(string)))
- {
- }
- ConvertUTF8ToLocale(const StringRange& range) : m_range(range)
- {
- }
-};
-
-/// \brief Writes \p convert to \p ostream after encoding each character to extended-ascii from UTF-8.
-template<typename TextOutputStreamType>
-inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert)
-{
- if(globalCharacterSet().isUTF8())
- {
- return ostream << convert.m_range;
- }
-
- for(const char* p = convert.m_range.first; p != convert.m_range.last;)
- {
- if(!char_is_ascii(*p))
- {
- UTF8Character c(p);
- ostream << globalExtendedASCIICharacterSet().encode(c);
- p += c.length;
- }
- else
- {
- ostream << *p++;
- }
- }
- return ostream;
-}
-
-
-class ConvertLocaleToUTF8
-{
-public:
- StringRange m_range;
- ConvertLocaleToUTF8(const char* string) : m_range(StringRange(string, string + strlen(string)))
- {
- }
- ConvertLocaleToUTF8(const StringRange& range) : m_range(range)
- {
- }
-};
-
-/// \brief Writes \p convert to \p ostream after decoding each character from extended-ascii to UTF-8.
-template<typename TextOutputStreamType>
-inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert)
-{
- if(globalCharacterSet().isUTF8())
- {
- return ostream << convert.m_range;
- }
-
- for(const char* p = convert.m_range.first; p != convert.m_range.last; ++p)
- {
- if(!char_is_ascii(*p))
- {
- UTF8Character c(globalExtendedASCIICharacterSet().decode(*p));
- ostream.write(c.buffer, c.length);
- }
- else
- {
- ostream << *p;
- }
- }
- return ostream;
-}
-
-
-#endif