5 ================================================================================
6 Initialization of UTF-8 support and new cvars.
7 ================================================================================
9 // for compatibility this defaults to 0
10 cvar_t utf8_enable = {CVAR_SAVE, "utf8_enable", "0", "Enable UTF-8 support. For compatibility, this is disabled by default in most games."};
14 Cvar_RegisterVariable(&utf8_enable);
18 ================================================================================
19 UTF-8 encoding and decoding functions follow.
20 ================================================================================
23 /** Analyze the next character and return various information if requested.
24 * @param _s An utf-8 string.
25 * @param _start Filled with the start byte-offset of the next valid character
26 * @param _len Filled with the length of the next valid character
27 * @param _ch Filled with the unicode value of the next character
28 * @param _maxlen Maximum number of bytes to read from _s
29 * @return Whether or not another valid character is in the string
31 #define U8_ANALYZE_INFINITY 7
32 static qboolean u8_analyze(const char *_s, size_t *_start, size_t *_len, Uchar *_ch, size_t _maxlen)
34 const unsigned char *s = (const unsigned char*)_s;
43 // bytes 0x80..0xC1 cannot start a valid character: 0x80..0xBF are continuation bytes and 0xC0/0xC1 would only begin overlong encodings, so they are skipped
44 while (i < _maxlen && s[i] && s[i] >= 0x80 && s[i] < 0xC2) {
45 //fprintf(stderr, "skipping\n");
49 //fprintf(stderr, "checking\n");
50 // If we hit the end of the input (byte limit or terminating null), there is no valid character left
51 if(i >= _maxlen || !s[i]) {
52 if (_start) *_start = i;
57 //fprintf(stderr, "checking ascii\n");
61 if (_start) *_start = i;
63 if (_ch) *_ch = (Uchar)s[i];
64 //fprintf(stderr, "valid ascii\n");
67 //fprintf(stderr, "checking length\n");
69 // Figure out the next char's length
72 // count the consecutive 1 bits at the top of the lead byte, they're the # of bytes in the sequence
73 for (bt = 0x40; bt && (bc & bt); bt >>= 1, ++bits);
76 //fprintf(stderr, "superlong\n");
80 if(i + bits > _maxlen) {
81 if (_start) *_start = i;
85 // turn bt into a mask and give ch a starting value
88 // check the byte sequence for invalid bytes
89 for (j = 1; j < bits; ++j)
91 // valid bit value: 10xx xxxx
92 //if (s[i+j] < 0x80 || s[i+j] >= 0xC0)
93 if ( (s[i+j] & 0xC0) != 0x80 )
95 //fprintf(stderr, "sequence of %i f'd at %i by %x\n", bits, j, (unsigned int)s[i+j]);
96 // this byte sequence is invalid, skip it
98 // find a character after it
101 // at the same time, decode the character
102 ch = (ch << 6) | (s[i+j] & 0x3F);
105 // Now check the decoded byte for an overlong encoding
106 if ( (bits >= 2 && ch < 0x80) ||
107 (bits >= 3 && ch < 0x800) ||
108 (bits >= 4 && ch < 0x10000) ||
109 ch >= 0x10FFFF // RFC 3629
113 //fprintf(stderr, "overlong: %i bytes for %x\n", bits, ch);
123 //fprintf(stderr, "valid utf8\n");
127 /** Get the number of characters in an UTF-8 string.
128 * @param _s An utf-8 encoded null-terminated string.
129 * @return The number of unicode characters in the string.
131 size_t u8_strlen(const char *_s)
135 const unsigned char *s = (const unsigned char*)_s;
137 if (!utf8_enable.integer)
142 // ascii char, skip u8_analyze
150 // invalid, skip u8_analyze
157 if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY))
159 // valid character, skip after it
166 /** Get the number of characters in a part of an UTF-8 string.
167 * @param _s An utf-8 encoded null-terminated string.
168 * @param n The maximum number of bytes.
169 * @return The number of unicode characters within the first n bytes of the string.
171 size_t u8_strnlen(const char *_s, size_t n)
175 const unsigned char *s = (const unsigned char*)_s;
177 if (!utf8_enable.integer)
180 return (len < n) ? len : n;
185 // ascii char, skip u8_analyze
194 // invalid, skip u8_analyze
202 if (!u8_analyze((const char*)s, &st, &ln, NULL, n))
204 // valid character, see if it's still inside the range specified by n:
214 /** Get the number of bytes used in a string to represent an amount of characters.
215 * @param _s An utf-8 encoded null-terminated string.
216 * @param n The number of characters we want to know the byte-size for.
217 * @return The number of bytes used to represent n characters.
219 size_t u8_bytelen(const char *_s, size_t n)
223 const unsigned char *s = (const unsigned char*)_s;
225 if (!utf8_enable.integer) {
227 return (len < n) ? len : n;
232 // ascii char, skip u8_analyze
241 // invalid, skip u8_analyze
249 if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY))
258 /** Get the byte-index for a character-index.
259 * @param _s An utf-8 encoded string.
260 * @param i The character-index for which you want the byte offset.
261 * @param len If not NULL, the character's length is stored there.
262 * @return The byte-index at which the character begins, or -1 if the string is too short.
264 int u8_byteofs(const char *_s, size_t i, size_t *len)
268 const unsigned char *s = (const unsigned char*)_s;
270 if (!utf8_enable.integer)
286 if (!u8_analyze((const char*)s + ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
295 /** Get the char-index for a byte-index.
296 * @param _s An utf-8 encoded string.
297 * @param i The byte offset for which you want the character index.
298 * @param len If not null, the offset within the character is stored here.
299 * @return The character-index, or -1 if the string is too short.
301 int u8_charidx(const char *_s, size_t i, size_t *len)
307 const unsigned char *s = (const unsigned char*)_s;
309 if (!utf8_enable.integer)
315 while (ofs < i && s[ofs])
317 // ascii character, skip u8_analyze
326 // invalid, skip u8_analyze
333 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
335 // see if next char is after the bytemark
345 // see if bytemark is within the char
357 /** Get the byte offset of the previous character.
359 * prevchar_pos = u8_byteofs(text, u8_charidx(text, thischar_pos, NULL) - 1, NULL)
360 * @param _s An utf-8 encoded string.
361 * @param i The current byte offset.
362 * @return The byte offset of the previous character
364 size_t u8_prevbyte(const char *_s, size_t i)
367 const unsigned char *s = (const unsigned char*)_s;
371 if (!utf8_enable.integer)
378 while (ofs < i && s[ofs])
380 // ascii character, skip u8_analyze
387 // invalid, skip u8_analyze
394 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
398 if (ofs + st + ln >= i)
407 static int char_usefont[256] = {
408 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // specials
409 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // specials
410 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // shift+digit line
411 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // digits
412 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // caps
413 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // caps
414 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // small
415 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // small
416 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // specials
417 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // faces
418 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
419 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
420 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
421 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
422 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
423 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
427 /** Fetch a character from an utf-8 encoded string.
428 * @param _s The start of an utf-8 encoded multi-byte character.
429 * @param _end Will point to after the first multi-byte character.
430 * @return The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
432 Uchar u8_getchar(const char *_s, const char **_end)
437 if (!utf8_enable.integer)
441 /* Careful: if we disable utf8 but not freetype, we wish to see freetype chars
442 * for normal letters. So use E000+x for special chars, but leave the freetype stuff for the
445 if (!char_usefont[(unsigned int)*(const unsigned char*)_s])
446 return 0xE000 + (Uchar)*(const unsigned char*)_s;
447 return (Uchar)*(const unsigned char*)_s;
450 if (!u8_analyze(_s, &st, &ln, &ch, U8_ANALYZE_INFINITY))
453 *_end = _s + st + ln;
457 /** Fetch a character from an utf-8 encoded string, reading at most _maxlen bytes.
458 * @param _s The start of an utf-8 encoded multi-byte character.
459 * @param _end Will point to after the first multi-byte character.
460 * @return The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
462 Uchar u8_getnchar(const char *_s, const char **_end, size_t _maxlen)
467 if (!utf8_enable.integer)
471 /* Careful: if we disable utf8 but not freetype, we wish to see freetype chars
472 * for normal letters. So use E000+x for special chars, but leave the freetype stuff for the
475 if (!char_usefont[(unsigned int)*(const unsigned char*)_s])
476 return 0xE000 + (Uchar)*(const unsigned char*)_s;
477 return (Uchar)*(const unsigned char*)_s;
480 if (!u8_analyze(_s, &st, &ln, &ch, _maxlen))
483 *_end = _s + st + ln;
487 /** Encode a wide-character into utf-8.
488 * @param w The wide character to encode.
489 * @param to The target buffer the utf-8 encoded string is stored to.
490 * @param maxlen The maximum number of bytes that fit into the target buffer.
491 * @return Number of bytes written to the buffer not including the terminating null.
492 * Less than or equal to 0 if the buffer is too small.
494 int u8_fromchar(Uchar w, char *to, size_t maxlen)
502 if (w >= 0xE000 && !utf8_enable.integer)
505 if (w < 0x80 || !utf8_enable.integer)
513 // for a little speedup
522 to[1] = 0x80 | (w & 0x3F); w >>= 6;
534 to[2] = 0x80 | (w & 0x3F); w >>= 6;
535 to[1] = 0x80 | (w & 0x3F); w >>= 6;
549 to[3] = 0x80 | (w & 0x3F); w >>= 6;
550 to[2] = 0x80 | (w & 0x3F); w >>= 6;
551 to[1] = 0x80 | (w & 0x3F); w >>= 6;
558 /** uses u8_fromchar on a static buffer
559 * @param ch The unicode character to encode
560 * @param l The number of bytes without the terminating null.
561 * @return A statically allocated buffer containing the character's utf8 representation, or NULL if it fails.
563 char *u8_encodech(Uchar ch, size_t *l)
567 len = u8_fromchar(ch, buf, sizeof(buf));
576 /** Convert a utf-8 multibyte string to a wide character string.
577 * @param wcs The target wide-character buffer.
578 * @param mb The utf-8 encoded multibyte string to convert.
579 * @param maxlen The maximum number of wide-characters that fit into the target buffer.
580 * @return The number of characters written to the target buffer.
582 size_t u8_mbstowcs(Uchar *wcs, const char *mb, size_t maxlen)
588 for (i = 0; *mb && i < maxlen-1; ++i)
590 ch = u8_getchar(mb, &mb);
599 /** Convert a wide-character string to a utf-8 multibyte string.
600 * @param mb The target buffer the utf-8 string is written to.
601 * @param wcs The wide-character string to convert.
602 * @param maxlen The number of bytes that fit into the multibyte target buffer.
603 * @return The number of bytes written, not including the terminating \0
605 size_t u8_wcstombs(char *mb, const Uchar *wcs, size_t maxlen)
608 const char *start = mb;
611 for (i = 0; wcs[i] && i < maxlen-1; ++i)
615 if ( (len = u8_fromchar(wcs[i], mb, maxlen - i)) < 0)
619 mb += u8_fromchar(wcs[i], mb, maxlen - i);
627 UTF-8 aware COM_StringLengthNoColors
629 calculates the visible width of a color coded string.
631 *valid is filled with TRUE if the string is a valid colored string (that is, if
632 it does not end with an unfinished color code). If it gets filled with FALSE, a
633 fix would be adding a STRING_COLOR_TAG at the end of the string.
635 valid can be set to NULL if the caller doesn't care.
637 For size_s, specify the maximum number of characters from s to use, or 0 to use
638 all characters until the zero terminator.
642 COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid);
644 u8_COM_StringLengthNoColors(const char *_s, size_t size_s, qboolean *valid)
646 const unsigned char *s = (const unsigned char*)_s;
647 const unsigned char *end;
650 if (!utf8_enable.integer)
651 return COM_StringLengthNoColors(_s, size_s, valid);
653 end = size_s ? (s + size_s) : NULL;
657 switch((s == end) ? 0 : *s)
663 case STRING_COLOR_TAG:
665 switch((s == end) ? 0 : *s)
667 case STRING_COLOR_RGB_TAG_CHAR:
668 if (s+1 != end && isxdigit(s[1]) &&
669 s+2 != end && isxdigit(s[2]) &&
670 s+3 != end && isxdigit(s[3]) )
675 ++len; // STRING_COLOR_TAG
676 ++len; // STRING_COLOR_RGB_TAG_CHAR
678 case 0: // ends with unfinished color code!
683 case STRING_COLOR_TAG: // escaped ^
686 case '0': case '1': case '2': case '3': case '4':
687 case '5': case '6': case '7': case '8': case '9': // color code
689 default: // not a color code
690 ++len; // STRING_COLOR_TAG
691 ++len; // the character
700 // start of a wide character
703 for (++s; *s >= 0x80 && *s <= 0xC0; ++s);
706 // part of a wide character, we ignore that one
714 /** Pads a utf-8 string
715 * @param out The target buffer the utf-8 string is written to.
716 * @param outsize The size of the target buffer, including the final NUL
717 * @param in The input utf-8 buffer
718 * @param leftalign Left align the output string (by default right alignment is done)
719 * @param minwidth The minimum output width
720 * @param maxwidth The maximum output width
721 * @return The number of bytes written, not including the terminating \0
723 size_t u8_strpad(char *out, size_t outsize, const char *in, qboolean leftalign, size_t minwidth, size_t maxwidth)
725 if(!utf8_enable.integer)
727 return dpsnprintf(out, outsize, "%*.*s", leftalign ? -(int) minwidth : (int) minwidth, (int) maxwidth, in);
731 size_t l = u8_bytelen(in, maxwidth);
732 size_t actual_width = u8_strnlen(in, l);
733 int pad = (actual_width >= minwidth) ? 0 : (minwidth - actual_width);
735 int lpad = leftalign ? 0 : pad;
736 int rpad = leftalign ? pad : 0;
737 return dpsnprintf(out, outsize, "%*s%.*s%*s", lpad, "", prec, in, rpad, "");