utf8lib.c

   1 #include "quakedef.h"
   2 #include "utf8lib.h"
   3
   4 /*
   5 ================================================================================
   6 Initialization of UTF-8 support and new cvars.
   7 ================================================================================
   8 */
   9 // for compatibility this defaults to 0
  10 cvar_t    utf8_enable = {CVAR_SAVE, "utf8_enable", "0", "Enable UTF-8 support. For compatibility, this is disabled by default in most games."};
  11
  12 void   u8_Init(void)
  13 {
  14         Cvar_RegisterVariable(&utf8_enable);
  15 }
  16
  17 /*
  18 ================================================================================
  19 UTF-8 encoding and decoding functions follow.
  20 ================================================================================
  21 */
  22
  23 /** Analyze the next character and return various information if requested.
  24  * @param _s      An utf-8 string.
  25  * @param _start  Filled with the start byte-offset of the next valid character
  26  * @param _len    Fileed with the length of the next valid character
  27  * @param _ch     Filled with the unicode value of the next character
  28  * @param _maxlen Maximum number of bytes to read from _s
  29  * @return        Whether or not another valid character is in the string
  30  */
  31 #define U8_ANALYZE_INFINITY 7
  32 static qboolean u8_analyze(const char *_s, size_t *_start, size_t *_len, Uchar *_ch, size_t _maxlen)
  33 {
  34         const unsigned char *s = (const unsigned char*)_s;
  35         unsigned char bt, bc;
  36         size_t i;
  37         size_t bits, j;
  38         Uchar ch;
  39
  40         i = 0;
  41 findchar:
  42
  43         // <0xC2 is always an overlong encoding, they're invalid, thus skipped
  44         while (i < _maxlen && s[i] && s[i] >= 0x80 && s[i] <= 0xC2) {
  45                 //fprintf(stderr, "skipping\n");
  46                 ++i;
  47         }
  48         if(i >= _maxlen)
  49                 return false;
  50         //fprintf(stderr, "checking\n");
  51
  52         // If we hit the end, well, we're out and invalid
  53         if (!s[i])
  54                 return false;
  55         //fprintf(stderr, "checking ascii\n");
  56
  57         // ascii characters
  58         if (s[i] < 0x80)
  59         {
  60                 if (_start) *_start = i;
  61                 if (_len) *_len = 1;
  62                 if (_ch) *_ch = (Uchar)s[i];
  63                 //fprintf(stderr, "valid ascii\n");
  64                 return true;
  65         }
  66         //fprintf(stderr, "checking length\n");
  67
  68         // Figure out the next char's length
  69         bc = s[i];
  70         bits = 1;
  71         // count the 1 bits, they're the # of bytes
  72         for (bt = 0x40; bt && (bc & bt); bt >>= 1, ++bits);
  73         if (!bt)
  74         {
  75                 //fprintf(stderr, "superlong\n");
  76                 ++i;
  77                 goto findchar;
  78         }
  79         if(i + bits > _maxlen)
  80                 return false;
  81         // turn bt into a mask and give ch a starting value
  82         --bt;
  83         ch = (s[i] & bt);
  84         // check the byte sequence for invalid bytes
  85         for (j = 1; j < bits; ++j)
  86         {
  87                 // valid bit value: 10xx xxxx
  88                 //if (s[i+j] < 0x80 || s[i+j] >= 0xC0)
  89                 if ( (s[i+j] & 0xC0) != 0x80 )
  90                 {
  91                         //fprintf(stderr, "sequence of %i f'd at %i by %x\n", bits, j, (unsigned int)s[i+j]);
  92                         // this byte sequence is invalid, skip it
  93                         i += j;
  94                         // find a character after it
  95                         goto findchar;
  96                 }
  97                 // at the same time, decode the character
  98                 ch = (ch << 6) | (s[i+j] & 0x3F);
  99         }
 100
 101         // Now check the decoded byte for an overlong encoding
 102         if ( (bits >= 2 && ch < 0x80) ||
 103              (bits >= 3 && ch < 0x800) ||
 104              (bits >= 4 && ch < 0x10000) ||
 105              ch >= 0x10FFFF // RFC 3629
 106                 )
 107         {
 108                 i += bits;
 109                 //fprintf(stderr, "overlong: %i bytes for %x\n", bits, ch);
 110                 goto findchar;
 111         }
 112
 113         if (_start)
 114                 *_start = i;
 115         if (_len)
 116                 *_len = bits;
 117         if (_ch)
 118                 *_ch = ch;
 119         //fprintf(stderr, "valid utf8\n");
 120         return true;
 121 }
 122
 123 /** Get the number of characters in an UTF-8 string.
 124  * @param _s    An utf-8 encoded null-terminated string.
 125  * @return      The number of unicode characters in the string.
 126  */
 127 size_t u8_strlen(const char *_s)
 128 {
 129         size_t st, ln;
 130         size_t len = 0;
 131         const unsigned char *s = (const unsigned char*)_s;
 132
 133         if (!utf8_enable.integer)
 134                 return strlen(_s);
 135
 136         while (*s)
 137         {
 138                 // ascii char, skip u8_analyze
 139                 if (*s < 0x80)
 140                 {
 141                         ++len;
 142                         ++s;
 143                         continue;
 144                 }
 145
 146                 // invalid, skip u8_analyze
 147                 if (*s <= 0xC2)
 148                 {
 149                         ++s;
 150                         continue;
 151                 }
 152
 153                 if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 154                         break;
 155                 // valid character, skip after it
 156                 s += st + ln;
 157                 ++len;
 158         }
 159         return len;
 160 }
 161
 162 /** Get the number of characters in a part of an UTF-8 string.
 163  * @param _s    An utf-8 encoded null-terminated string.
 164  * @param n     The maximum number of bytes.
 165  * @return      The number of unicode characters in the string.
 166  */
 167 size_t u8_strnlen(const char *_s, size_t n)
 168 {
 169         size_t st, ln;
 170         size_t len = 0;
 171         const unsigned char *s = (const unsigned char*)_s;
 172
 173         if (!utf8_enable.integer)
 174         {
 175                 len = strlen(_s);
 176                 return (len < n) ? len : n;
 177         }
 178
 179         while (*s && n)
 180         {
 181                 // ascii char, skip u8_analyze
 182                 if (*s < 0x80)
 183                 {
 184                         ++len;
 185                         ++s;
 186                         --n;
 187                         continue;
 188                 }
 189
 190                 // invalid, skip u8_analyze
 191                 if (*s <= 0xC2)
 192                 {
 193                         ++s;
 194                         --n;
 195                         continue;
 196                 }
 197
 198                 if (!u8_analyze((const char*)s, &st, &ln, NULL, n))
 199                         break;
 200                 // valid character, see if it's still inside the range specified by n:
 201                 if (n < st + ln)
 202                         return len;
 203                 ++len;
 204                 n -= st + ln;
 205                 s += st + ln;
 206         }
 207         return len;
 208 }
 209
 210 /** Get the number of bytes used in a string to represent an amount of characters.
 211  * @param _s    An utf-8 encoded null-terminated string.
 212  * @param n     The number of characters we want to know the byte-size for.
 213  * @return      The number of bytes used to represent n characters.
 214  */
 215 size_t u8_bytelen(const char *_s, size_t n)
 216 {
 217         size_t st, ln;
 218         size_t len = 0;
 219         const unsigned char *s = (const unsigned char*)_s;
 220
 221         if (!utf8_enable.integer)
 222                 return n;
 223
 224         while (*s && n)
 225         {
 226                 // ascii char, skip u8_analyze
 227                 if (*s < 0x80)
 228                 {
 229                         ++len;
 230                         ++s;
 231                         --n;
 232                         continue;
 233                 }
 234
 235                 // invalid, skip u8_analyze
 236                 if (*s <= 0xC2)
 237                 {
 238                         ++s;
 239                         ++len;
 240                         continue;
 241                 }
 242
 243                 if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 244                         break;
 245                 --n;
 246                 s += st + ln;
 247                 len += st + ln;
 248         }
 249         return len;
 250 }
 251
 252 /** Get the byte-index for a character-index.
 253  * @param _s      An utf-8 encoded string.
 254  * @param i       The character-index for which you want the byte offset.
 255  * @param len     If not null, character's length will be stored in there.
 256  * @return        The byte-index at which the character begins, or -1 if the string is too short.
 257  */
 258 int u8_byteofs(const char *_s, size_t i, size_t *len)
 259 {
 260         size_t st, ln;
 261         size_t ofs = 0;
 262         const unsigned char *s = (const unsigned char*)_s;
 263
 264         if (!utf8_enable.integer)
 265         {
 266                 if (len) *len = 1;
 267                 return i;
 268         }
 269
 270         st = ln = 0;
 271         do
 272         {
 273                 ofs += ln;
 274                 if (!u8_analyze((const char*)s + ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 275                         return -1;
 276                 ofs += st;
 277         } while(i-- > 0);
 278         if (len)
 279                 *len = ln;
 280         return ofs;
 281 }
 282
 283 /** Get the char-index for a byte-index.
 284  * @param _s      An utf-8 encoded string.
 285  * @param i       The byte offset for which you want the character index.
 286  * @param len     If not null, the offset within the character is stored here.
 287  * @return        The character-index, or -1 if the string is too short.
 288  */
 289 int u8_charidx(const char *_s, size_t i, size_t *len)
 290 {
 291         size_t st, ln;
 292         size_t ofs = 0;
 293         size_t pofs = 0;
 294         int idx = 0;
 295         const unsigned char *s = (const unsigned char*)_s;
 296
 297         if (!utf8_enable.integer)
 298         {
 299                 if (len) *len = 0;
 300                 return i;
 301         }
 302
 303         while (ofs < i && s[ofs])
 304         {
 305                 // ascii character, skip u8_analyze
 306                 if (s[ofs] < 0x80)
 307                 {
 308                         pofs = ofs;
 309                         ++idx;
 310                         ++ofs;
 311                         continue;
 312                 }
 313
 314                 // invalid, skip u8_analyze
 315                 if (s[ofs] <= 0xC2)
 316                 {
 317                         ++ofs;
 318                         continue;
 319                 }
 320
 321                 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 322                         return -1;
 323                 // see if next char is after the bytemark
 324                 if (ofs + st > i)
 325                 {
 326                         if (len)
 327                                 *len = i - pofs;
 328                         return idx;
 329                 }
 330                 ++idx;
 331                 pofs = ofs + st;
 332                 ofs += st + ln;
 333                 // see if bytemark is within the char
 334                 if (ofs > i)
 335                 {
 336                         if (len)
 337                                 *len = i - pofs;
 338                         return idx;
 339                 }
 340         }
 341         if (len) *len = 0;
 342         return idx;
 343 }
 344
 345 /** Get the byte offset of the previous byte.
 346  * The result equals:
 347  * prevchar_pos = u8_byteofs(text, u8_charidx(text, thischar_pos, NULL) - 1, NULL)
 348  * @param _s      An utf-8 encoded string.
 349  * @param i       The current byte offset.
 350  * @return        The byte offset of the previous character
 351  */
 352 size_t u8_prevbyte(const char *_s, size_t i)
 353 {
 354         size_t st, ln;
 355         const unsigned char *s = (const unsigned char*)_s;
 356         size_t lastofs = 0;
 357         size_t ofs = 0;
 358
 359         if (!utf8_enable.integer)
 360         {
 361                 if (i > 0)
 362                         return i-1;
 363                 return 0;
 364         }
 365
 366         while (ofs < i && s[ofs])
 367         {
 368                 // ascii character, skip u8_analyze
 369                 if (s[ofs] < 0x80)
 370                 {
 371                         lastofs = ofs++;
 372                         continue;
 373                 }
 374
 375                 // invalid, skip u8_analyze
 376                 if (s[ofs] <= 0xC2)
 377                 {
 378                         ++ofs;
 379                         continue;
 380                 }
 381
 382                 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 383                         return lastofs;
 384                 if (ofs + st > i)
 385                         return lastofs;
 386                 if (ofs + st + ln >= i)
 387                         return ofs + st;
 388
 389                 lastofs = ofs;
 390                 ofs += st + ln;
 391         }
 392         return lastofs;
 393 }
 394
 395 static int char_usefont[256] = {
 396         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // specials
 397         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // specials
 398         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // shift+digit line
 399         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // digits
 400         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // caps
 401         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // caps
 402         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // small
 403         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // small
 404         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // specials
 405         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // faces
 406         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 407         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 408         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 409         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 410         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 411         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 412 };
 413
 414
 415 /** Fetch a character from an utf-8 encoded string.
 416  * @param _s      The start of an utf-8 encoded multi-byte character.
 417  * @param _end    Will point to after the first multi-byte character.
 418  * @return        The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
 419  */
 420 Uchar u8_getchar(const char *_s, const char **_end)
 421 {
 422         size_t st, ln;
 423         Uchar ch;
 424
 425         if (!utf8_enable.integer)
 426         {
 427                 if (_end)
 428                         *_end = _s + 1;
 429                 /* Careful: if we disable utf8 but not freetype, we wish to see freetype chars
 430                  * for normal letters. So use E000+x for special chars, but leave the freetype stuff for the
 431                  * rest:
 432                  */
 433                 if (!char_usefont[(unsigned int)*(const unsigned char*)_s])
 434                         return 0xE000 + (Uchar)*(const unsigned char*)_s;
 435                 return (Uchar)*(const unsigned char*)_s;
 436         }
 437
 438         if (!u8_analyze(_s, &st, &ln, &ch, U8_ANALYZE_INFINITY))
 439                 return 0;
 440         if (_end)
 441                 *_end = _s + st + ln;
 442         return ch;
 443 }
 444
 445 /** Fetch a character from an utf-8 encoded string.
 446  * @param _s      The start of an utf-8 encoded multi-byte character.
 447  * @param _end    Will point to after the first multi-byte character.
 448  * @return        The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
 449  */
 450 Uchar u8_getnchar(const char *_s, const char **_end, size_t _maxlen)
 451 {
 452         size_t st, ln;
 453         Uchar ch;
 454
 455         if (!utf8_enable.integer)
 456         {
 457                 if (_end)
 458                         *_end = _s + 1;
 459                 /* Careful: if we disable utf8 but not freetype, we wish to see freetype chars
 460                  * for normal letters. So use E000+x for special chars, but leave the freetype stuff for the
 461                  * rest:
 462                  */
 463                 if (!char_usefont[(unsigned int)*(const unsigned char*)_s])
 464                         return 0xE000 + (Uchar)*(const unsigned char*)_s;
 465                 return (Uchar)*(const unsigned char*)_s;
 466         }
 467
 468         if (!u8_analyze(_s, &st, &ln, &ch, _maxlen))
 469                 return 0;
 470         if (_end)
 471                 *_end = _s + st + ln;
 472         return ch;
 473 }
 474
 475 /** Encode a wide-character into utf-8.
 476  * @param w        The wide character to encode.
 477  * @param to       The target buffer the utf-8 encoded string is stored to.
 478  * @param maxlen   The maximum number of bytes that fit into the target buffer.
 479  * @return         Number of bytes written to the buffer not including the terminating null.
 480  *                 Less or equal to 0 if the buffer is too small.
 481  */
 482 int u8_fromchar(Uchar w, char *to, size_t maxlen)
 483 {
 484         if (maxlen < 1)
 485                 return -2;
 486
 487         if (!w)
 488                 return -5;
 489
 490         if (w >= 0xE000 && !utf8_enable.integer)
 491                 w -= 0xE000;
 492
 493         if (w < 0x80 || !utf8_enable.integer)
 494         {
 495                 to[0] = (char)w;
 496                 if (maxlen < 2)
 497                         return -1;
 498                 to[1] = 0;
 499                 return 1;
 500         }
 501         // for a little speedup
 502         if (w < 0x800)
 503         {
 504                 if (maxlen < 3)
 505                 {
 506                         to[0] = 0;
 507                         return -1;
 508                 }
 509                 to[2] = 0;
 510                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 511                 to[0] = 0xC0 | w;
 512                 return 2;
 513         }
 514         if (w < 0x10000)
 515         {
 516                 if (maxlen < 4)
 517                 {
 518                         to[0] = 0;
 519                         return -1;
 520                 }
 521                 to[3] = 0;
 522                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 523                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 524                 to[0] = 0xE0 | w;
 525                 return 3;
 526         }
 527
 528         // RFC 3629
 529         if (w <= 0x10FFFF)
 530         {
 531                 if (maxlen < 5)
 532                 {
 533                         to[0] = 0;
 534                         return -1;
 535                 }
 536                 to[4] = 0;
 537                 to[3] = 0x80 | (w & 0x3F); w >>= 6;
 538                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 539                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 540                 to[0] = 0xE0 | w;
 541                 return 4;
 542         }
 543         return -1;
 544 }
 545
 546 /** uses u8_fromchar on a static buffer
 547  * @param ch        The unicode character to convert to encode
 548  * @param l         The number of bytes without the terminating null.
 549  * @return          A statically allocated buffer containing the character's utf8 representation, or NULL if it fails.
 550  */
 551 char *u8_encodech(Uchar ch, size_t *l)
 552 {
 553         static char buf[16];
 554         size_t len;
 555         len = u8_fromchar(ch, buf, sizeof(buf));
 556         if (len > 0)
 557         {
 558                 if (l) *l = len;
 559                 return buf;
 560         }
 561         return NULL;
 562 }
 563
 564 /** Convert a utf-8 multibyte string to a wide character string.
 565  * @param wcs       The target wide-character buffer.
 566  * @param mb        The utf-8 encoded multibyte string to convert.
 567  * @param maxlen    The maximum number of wide-characters that fit into the target buffer.
 568  * @return          The number of characters written to the target buffer.
 569  */
 570 size_t u8_mbstowcs(Uchar *wcs, const char *mb, size_t maxlen)
 571 {
 572         size_t i;
 573         Uchar ch;
 574         if (maxlen < 1)
 575                 return 0;
 576         for (i = 0; *mb && i < maxlen-1; ++i)
 577         {
 578                 ch = u8_getchar(mb, &mb);
 579                 if (!ch)
 580                         break;
 581                 wcs[i] = ch;
 582         }
 583         wcs[i] = 0;
 584         return i;
 585 }
 586
 587 /** Convert a wide-character string to a utf-8 multibyte string.
 588  * @param mb      The target buffer the utf-8 string is written to.
 589  * @param wcs     The wide-character string to convert.
 590  * @param maxlen  The number bytes that fit into the multibyte target buffer.
 591  * @return        The number of bytes written, not including the terminating \0
 592  */
 593 size_t u8_wcstombs(char *mb, const Uchar *wcs, size_t maxlen)
 594 {
 595         size_t i;
 596         const char *start = mb;
 597         if (maxlen < 2)
 598                 return 0;
 599         for (i = 0; wcs[i] && i < maxlen-1; ++i)
 600         {
 601                 int len;
 602                 if ( (len = u8_fromchar(wcs[i], mb, maxlen - i)) < 0)
 603                         return (mb - start);
 604                 mb += len;
 605         }
 606         *mb = 0;
 607         return (mb - start);
 608 }
 609
 610 /*
 611 ============
 612 UTF-8 aware COM_StringLengthNoColors
 613
 614 calculates the visible width of a color coded string.
 615
 616 *valid is filled with TRUE if the string is a valid colored string (that is, if
 617 it does not end with an unfinished color code). If it gets filled with FALSE, a
 618 fix would be adding a STRING_COLOR_TAG at the end of the string.
 619
 620 valid can be set to NULL if the caller doesn't care.
 621
 622 For size_s, specify the maximum number of characters from s to use, or 0 to use
 623 all characters until the zero terminator.
 624 ============
 625 */
 626 size_t
 627 COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid);
 628 size_t
 629 u8_COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid)
 630 {
 631         const char *end;
 632         size_t len = 0;
 633
 634         if (!utf8_enable.integer)
 635                 return COM_StringLengthNoColors(s, size_s, valid);
 636
 637         end = size_s ? (s + size_s) : NULL;
 638
 639         for(;;)
 640         {
 641                 switch((s == end) ? 0 : *s)
 642                 {
 643                         case 0:
 644                                 if(valid)
 645                                         *valid = TRUE;
 646                                 return len;
 647                         case STRING_COLOR_TAG:
 648                                 ++s;
 649                                 switch((s == end) ? 0 : *s)
 650                                 {
 651                                         case STRING_COLOR_RGB_TAG_CHAR:
 652                                                 if (s+1 != end && isxdigit(s[1]) &&
 653                                                         s+2 != end && isxdigit(s[2]) &&
 654                                                         s+3 != end && isxdigit(s[3]) )
 655                                                 {
 656                                                         s+=3;
 657                                                         break;
 658                                                 }
 659                                                 ++len; // STRING_COLOR_TAG
 660                                                 ++len; // STRING_COLOR_RGB_TAG_CHAR
 661                                                 break;
 662                                         case 0: // ends with unfinished color code!
 663                                                 ++len;
 664                                                 if(valid)
 665                                                         *valid = FALSE;
 666                                                 return len;
 667                                         case STRING_COLOR_TAG: // escaped ^
 668                                                 ++len;
 669                                                 break;
 670                                         case '0': case '1': case '2': case '3': case '4':
 671                                         case '5': case '6': case '7': case '8': case '9': // color code
 672                                                 break;
 673                                         default: // not a color code
 674                                                 ++len; // STRING_COLOR_TAG
 675                                                 ++len; // the character
 676                                                 break;
 677                                 }
 678                                 break;
 679                         default:
 680                                 ++len;
 681                                 break;
 682                 }
 683
 684                 // start of a wide character
 685                 if (*s & 0xC0)
 686                 {
 687                         for (++s; *s >= 0x80 && *s <= 0xC0; ++s);
 688                         continue;
 689                 }
 690                 // part of a wide character, we ignore that one
 691                 if (*s <= 0xBF)
 692                         --len;
 693                 ++s;
 694         }
 695         // never get here
 696 }