5 ================================================================================
6 Initialization of UTF-8 support and new cvars.
7 ================================================================================
9 // for compatibility this defaults to 0
10 cvar_t utf8_enable = {CVAR_SAVE, "utf8_enable", "0", "Enable UTF-8 support. For compatibility, this is disabled by default in most games."};
14 Cvar_RegisterVariable(&utf8_enable);
18 ================================================================================
19 UTF-8 encoding and decoding functions follow.
20 ================================================================================
23 unsigned char utf8_lengths[256] = { // 0 = invalid
24 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // ascii characters
25 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
26 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
27 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
28 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
29 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
30 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
31 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
32 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0xBF are within multibyte sequences
33 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // they could be interpreted as 2-byte starts but
34 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // the codepoint would be < 127
35 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
36 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 and C1 would also result in overlong encodings
37 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
38 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
39 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
40 // with F5 the codepoint is above 0x10FFFF,
41 // F8-FB would start 5-byte sequences
42 // FC-FD would start 6-byte sequences
45 Uchar utf8_range[5] = {
46 1, // invalid - let's not allow the creation of 0-bytes :P
48 0x80, // 2-byte minimum
49 0x800, // 3-byte minimum
50 0x10000, // 4-byte minimum
53 /** Analyze the next character and return various information if requested.
54 * @param _s An utf-8 string.
55 * @param _start Filled with the start byte-offset of the next valid character
56 * @param _len Filled with the length of the next valid character
57 * @param _ch Filled with the unicode value of the next character
58 * @param _maxlen Maximum number of bytes to read from _s
59 * @return Whether or not another valid character is in the string
61 #define U8_ANALYZE_INFINITY 7
62 static qboolean u8_analyze(const char *_s, size_t *_start, size_t *_len, Uchar *_ch, size_t _maxlen)
64 const unsigned char *s = (const unsigned char*)_s;
71 while (i < _maxlen && s[i] && (bits = utf8_lengths[s[i]]) == 0)
74 if (i >= _maxlen || !s[i]) {
75 if (_start) *_start = i;
80 if (bits == 1) { // ascii
81 if (_start) *_start = i;
83 if (_ch) *_ch = (Uchar)s[i];
87 ch = (s[i] & (0xFF >> bits));
88 for (j = 1; j < bits; ++j)
90 if ( (s[i+j] & 0xC0) != 0x80 )
95 ch = (ch << 6) | (s[i+j] & 0x3F);
97 if (ch < utf8_range[bits] || ch >= 0x10FFFF)
103 // <0xC2 is always an overlong encoding, they're invalid, thus skipped
104 while (i < _maxlen && s[i] && s[i] >= 0x80 && s[i] < 0xC2) {
105 //fprintf(stderr, "skipping\n");
109 // If we hit the end, well, we're out and invalid
110 if(i >= _maxlen || !s[i]) {
111 if (_start) *_start = i;
116 // I'll leave that in - if you remove it, also change the part below
117 // to support 1-byte chars correctly
120 if (_start) *_start = i;
122 if (_ch) *_ch = (Uchar)s[i];
123 //fprintf(stderr, "valid ascii\n");
127 // Figure out the next char's length
130 // count the 1 bits, they're the # of bytes
131 for (bt = 0x40; bt && (bc & bt); bt >>= 1, ++bits);
134 //fprintf(stderr, "superlong\n");
138 if(i + bits > _maxlen) {
140 if (_start) *_start = i;
147 // turn bt into a mask and give ch a starting value
150 // check the byte sequence for invalid bytes
151 for (j = 1; j < bits; ++j)
153 // valid bit value: 10xx xxxx
154 //if (s[i+j] < 0x80 || s[i+j] >= 0xC0)
155 if ( (s[i+j] & 0xC0) != 0x80 )
157 //fprintf(stderr, "sequence of %i f'd at %i by %x\n", bits, j, (unsigned int)s[i+j]);
158 // this byte sequence is invalid, skip it
160 // find a character after it
163 // at the same time, decode the character
164 ch = (ch << 6) | (s[i+j] & 0x3F);
167 // Now check the decoded character for an overlong encoding
168 if ( (bits >= 2 && ch < 0x80) ||
169 (bits >= 3 && ch < 0x800) ||
170 (bits >= 4 && ch < 0x10000) ||
171 ch >= 0x10FFFF // RFC 3629
175 //fprintf(stderr, "overlong: %i bytes for %x\n", bits, ch);
186 //fprintf(stderr, "valid utf8\n");
190 /** Get the number of characters in an UTF-8 string.
191 * @param _s An utf-8 encoded null-terminated string.
192 * @return The number of unicode characters in the string.
194 size_t u8_strlen(const char *_s)
198 const unsigned char *s = (const unsigned char*)_s;
200 if (!utf8_enable.integer)
205 // ascii char, skip u8_analyze
213 // invalid, skip u8_analyze
220 if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY))
222 // valid character, skip after it
229 /** Get the number of characters in a part of an UTF-8 string.
230 * @param _s An utf-8 encoded null-terminated string.
231 * @param n The maximum number of bytes.
232 * @return The number of unicode characters in the string.
234 size_t u8_strnlen(const char *_s, size_t n)
238 const unsigned char *s = (const unsigned char*)_s;
240 if (!utf8_enable.integer)
243 return (len < n) ? len : n;
248 // ascii char, skip u8_analyze
257 // invalid, skip u8_analyze
265 if (!u8_analyze((const char*)s, &st, &ln, NULL, n))
267 // valid character, see if it's still inside the range specified by n:
277 /** Get the number of bytes used in a string to represent a given number of characters.
278 * @param _s An utf-8 encoded null-terminated string.
279 * @param n The number of characters we want to know the byte-size for.
280 * @return The number of bytes used to represent n characters.
282 size_t u8_bytelen(const char *_s, size_t n)
286 const unsigned char *s = (const unsigned char*)_s;
288 if (!utf8_enable.integer) {
290 return (len < n) ? len : n;
295 // ascii char, skip u8_analyze
304 // invalid, skip u8_analyze
312 if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY))
321 /** Get the byte-index for a character-index.
322 * @param _s An utf-8 encoded string.
323 * @param i The character-index for which you want the byte offset.
324 * @param len If not null, the character's length is stored here.
325 * @return The byte-index at which the character begins, or -1 if the string is too short.
327 int u8_byteofs(const char *_s, size_t i, size_t *len)
331 const unsigned char *s = (const unsigned char*)_s;
333 if (!utf8_enable.integer)
349 if (!u8_analyze((const char*)s + ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
358 /** Get the char-index for a byte-index.
359 * @param _s An utf-8 encoded string.
360 * @param i The byte offset for which you want the character index.
361 * @param len If not null, the offset within the character is stored here.
362 * @return The character-index, or -1 if the string is too short.
364 int u8_charidx(const char *_s, size_t i, size_t *len)
370 const unsigned char *s = (const unsigned char*)_s;
372 if (!utf8_enable.integer)
378 while (ofs < i && s[ofs])
380 // ascii character, skip u8_analyze
389 // invalid, skip u8_analyze
396 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
398 // see if next char is after the bytemark
408 // see if bytemark is within the char
420 /** Get the byte offset of the previous character.
422 * prevchar_pos = u8_byteofs(text, u8_charidx(text, thischar_pos, NULL) - 1, NULL)
423 * @param _s An utf-8 encoded string.
424 * @param i The current byte offset.
425 * @return The byte offset of the previous character
427 size_t u8_prevbyte(const char *_s, size_t i)
430 const unsigned char *s = (const unsigned char*)_s;
434 if (!utf8_enable.integer)
441 while (ofs < i && s[ofs])
443 // ascii character, skip u8_analyze
450 // invalid, skip u8_analyze
457 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
461 if (ofs + st + ln >= i)
470 Uchar u8_quake2utf8map[256] = {
471 0xE000, 0xE001, 0xE002, 0xE003, 0xE004, 0xE005, 0xE006, 0xE007, 0xE008, 0xE009, 0xE00A, 0xE00B, 0xE00C, 0xE00D, 0xE00E, 0xE00F, // specials
472 0xE010, 0xE011, 0xE012, 0xE013, 0xE014, 0xE015, 0xE016, 0xE017, 0xE018, 0xE019, 0xE01A, 0xE01B, 0xE01C, 0xE01D, 0xE01E, 0xE01F, // specials
473 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, // shift+digit line
474 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // digits
475 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // caps
476 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, // caps
477 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, // small
478 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, // small
479 0xE080, 0xE081, 0xE082, 0xE083, 0xE084, 0xE085, 0xE086, 0xE087, 0xE088, 0xE089, 0xE08A, 0xE08B, 0xE08C, 0xE08D, 0xE08E, 0xE08F, // specials
480 0xE090, 0xE091, 0xE092, 0xE093, 0xE094, 0xE095, 0xE096, 0xE097, 0xE098, 0xE099, 0xE09A, 0xE09B, 0xE09C, 0xE09D, 0xE09E, 0xE09F, // faces
481 0xE0A0, 0xE0A1, 0xE0A2, 0xE0A3, 0xE0A4, 0xE0A5, 0xE0A6, 0xE0A7, 0xE0A8, 0xE0A9, 0xE0AA, 0xE0AB, 0xE0AC, 0xE0AD, 0xE0AE, 0xE0AF,
482 0xE0B0, 0xE0B1, 0xE0B2, 0xE0B3, 0xE0B4, 0xE0B5, 0xE0B6, 0xE0B7, 0xE0B8, 0xE0B9, 0xE0BA, 0xE0BB, 0xE0BC, 0xE0BD, 0xE0BE, 0xE0BF,
483 0xE0C0, 0xE0C1, 0xE0C2, 0xE0C3, 0xE0C4, 0xE0C5, 0xE0C6, 0xE0C7, 0xE0C8, 0xE0C9, 0xE0CA, 0xE0CB, 0xE0CC, 0xE0CD, 0xE0CE, 0xE0CF,
484 0xE0D0, 0xE0D1, 0xE0D2, 0xE0D3, 0xE0D4, 0xE0D5, 0xE0D6, 0xE0D7, 0xE0D8, 0xE0D9, 0xE0DA, 0xE0DB, 0xE0DC, 0xE0DD, 0xE0DE, 0xE0DF,
485 0xE0E0, 0xE0E1, 0xE0E2, 0xE0E3, 0xE0E4, 0xE0E5, 0xE0E6, 0xE0E7, 0xE0E8, 0xE0E9, 0xE0EA, 0xE0EB, 0xE0EC, 0xE0ED, 0xE0EE, 0xE0EF,
486 0xE0F0, 0xE0F1, 0xE0F2, 0xE0F3, 0xE0F4, 0xE0F5, 0xE0F6, 0xE0F7, 0xE0F8, 0xE0F9, 0xE0FA, 0xE0FB, 0xE0FC, 0xE0FD, 0xE0FE, 0xE0FF,
489 /** Fetch a character from an utf-8 encoded string.
490 * @param _s The start of an utf-8 encoded multi-byte character.
491 * @param _end Will point to after the first multi-byte character.
492 * @return The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
494 Uchar u8_getchar_utf8_enabled(const char *_s, const char **_end)
499 if (!u8_analyze(_s, &st, &ln, &ch, U8_ANALYZE_INFINITY))
502 *_end = _s + st + ln;
506 /** Fetch a character from an utf-8 encoded string, reading at most _maxlen bytes.
507 * @param _s The start of an utf-8 encoded multi-byte character.
508 * @param _end Will point to after the first multi-byte character.
509 * @return The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
511 Uchar u8_getnchar_utf8_enabled(const char *_s, const char **_end, size_t _maxlen)
516 if (!u8_analyze(_s, &st, &ln, &ch, _maxlen))
519 *_end = _s + st + ln;
523 /** Encode a wide-character into utf-8.
524 * @param w The wide character to encode.
525 * @param to The target buffer the utf-8 encoded string is stored to.
526 * @param maxlen The maximum number of bytes that fit into the target buffer.
527 * @return Number of bytes written to the buffer not including the terminating null.
528 * Less than or equal to 0 if the buffer is too small.
530 int u8_fromchar(Uchar w, char *to, size_t maxlen)
538 if (w >= 0xE000 && !utf8_enable.integer)
541 if (w < 0x80 || !utf8_enable.integer)
549 // for a little speedup
558 to[1] = 0x80 | (w & 0x3F); w >>= 6;
570 to[2] = 0x80 | (w & 0x3F); w >>= 6;
571 to[1] = 0x80 | (w & 0x3F); w >>= 6;
585 to[3] = 0x80 | (w & 0x3F); w >>= 6;
586 to[2] = 0x80 | (w & 0x3F); w >>= 6;
587 to[1] = 0x80 | (w & 0x3F); w >>= 6;
594 /** uses u8_fromchar on a static buffer
595 * @param ch The unicode character to encode
596 * @param l The number of bytes without the terminating null.
597 * @return A statically allocated buffer containing the character's utf8 representation, or NULL if it fails.
599 char *u8_encodech(Uchar ch, size_t *l)
603 len = u8_fromchar(ch, buf, sizeof(buf));
612 /** Convert a utf-8 multibyte string to a wide character string.
613 * @param wcs The target wide-character buffer.
614 * @param mb The utf-8 encoded multibyte string to convert.
615 * @param maxlen The maximum number of wide-characters that fit into the target buffer.
616 * @return The number of characters written to the target buffer.
618 size_t u8_mbstowcs(Uchar *wcs, const char *mb, size_t maxlen)
624 for (i = 0; *mb && i < maxlen-1; ++i)
626 ch = u8_getchar(mb, &mb);
635 /** Convert a wide-character string to a utf-8 multibyte string.
636 * @param mb The target buffer the utf-8 string is written to.
637 * @param wcs The wide-character string to convert.
638 * @param maxlen The number of bytes that fit into the multibyte target buffer.
639 * @return The number of bytes written, not including the terminating \0
641 size_t u8_wcstombs(char *mb, const Uchar *wcs, size_t maxlen)
644 const char *start = mb;
647 for (i = 0; wcs[i] && i < maxlen-1; ++i)
651 if ( (len = u8_fromchar(wcs[i], mb, maxlen - i)) < 0)
655 mb += u8_fromchar(wcs[i], mb, maxlen - i);
663 UTF-8 aware COM_StringLengthNoColors
665 calculates the visible width of a color coded string.
667 *valid is filled with TRUE if the string is a valid colored string (that is, if
668 it does not end with an unfinished color code). If it gets filled with FALSE, a
669 fix would be adding a STRING_COLOR_TAG at the end of the string.
671 valid can be set to NULL if the caller doesn't care.
673 For size_s, specify the maximum number of characters from s to use, or 0 to use
674 all characters until the zero terminator.
678 COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid);
680 u8_COM_StringLengthNoColors(const char *_s, size_t size_s, qboolean *valid)
682 const unsigned char *s = (const unsigned char*)_s;
683 const unsigned char *end;
687 if (!utf8_enable.integer)
688 return COM_StringLengthNoColors(_s, size_s, valid);
690 end = size_s ? (s + size_s) : NULL;
694 switch((s >= end) ? 0 : *s)
700 case STRING_COLOR_TAG:
702 switch((s == end) ? 0 : *s)
704 case STRING_COLOR_RGB_TAG_CHAR:
705 if (s+1 != end && isxdigit(s[1]) &&
706 s+2 != end && isxdigit(s[2]) &&
707 s+3 != end && isxdigit(s[3]) )
712 ++len; // STRING_COLOR_TAG
713 ++len; // STRING_COLOR_RGB_TAG_CHAR
715 case 0: // ends with unfinished color code!
720 case STRING_COLOR_TAG: // escaped ^
723 case '0': case '1': case '2': case '3': case '4':
724 case '5': case '6': case '7': case '8': case '9': // color code
726 default: // not a color code
727 ++len; // STRING_COLOR_TAG
728 ++len; // the character
736 // ascii char, skip u8_analyze
744 // invalid, skip u8_analyze
751 if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY))
753 // we CAN end up here, if an invalid char is between this one and the end of the string
759 if(s + st + ln >= end)
761 // string length exceeded by new character
767 // valid character, skip after it
774 /** Pads a utf-8 string
775 * @param out The target buffer the utf-8 string is written to.
776 * @param outsize The size of the target buffer, including the final NUL
777 * @param in The input utf-8 buffer
778 * @param leftalign Left align the output string (by default right alignment is done)
779 * @param minwidth The minimum output width
780 * @param maxwidth The maximum output width
781 * @return The number of bytes written, not including the terminating \0
783 size_t u8_strpad(char *out, size_t outsize, const char *in, qboolean leftalign, size_t minwidth, size_t maxwidth)
785 if(!utf8_enable.integer)
787 return dpsnprintf(out, outsize, "%*.*s", leftalign ? -(int) minwidth : (int) minwidth, (int) maxwidth, in);
791 size_t l = u8_bytelen(in, maxwidth);
792 size_t actual_width = u8_strnlen(in, l);
793 int pad = (actual_width >= minwidth) ? 0 : (minwidth - actual_width);
795 int lpad = leftalign ? 0 : pad;
796 int rpad = leftalign ? pad : 0;
797 return dpsnprintf(out, outsize, "%*s%.*s%*s", lpad, "", prec, in, rpad, "");