utf8lib.c

   1 #include "quakedef.h"
   2 #include "utf8lib.h"
   3
   4 /*
   5 ================================================================================
   6 Initialization of UTF-8 support and new cvars.
   7 ================================================================================
   8 */
   9 // for compatibility this defaults to 0
  10 cvar_t    utf8_enable = {CVAR_SAVE, "utf8_enable", "0", "Enable UTF-8 support. For compatibility, this is disabled by default in most games."};
  11
  12 void   u8_Init(void)
  13 {
  14         Cvar_RegisterVariable(&utf8_enable);
  15 }
  16
  17 /*
  18 ================================================================================
  19 UTF-8 encoding and decoding functions follow.
  20 ================================================================================
  21 */
  22
  23 unsigned char utf8_lengths[256] = { // 0 = invalid
  24         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // ascii characters
  25         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  26         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  27         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  28         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  29         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  30         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  31         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  32         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0xBF are within multibyte sequences
  33         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // they could be interpreted as 2-byte starts but
  34         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // the codepoint would be < 127
  35         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  36         0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 and C1 would also result in overlong encodings
  37         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  38         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  39         4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  40         // with F5 the codepoint is above 0x10FFFF,
  41         // F8-FB would start 5-byte sequences
  42         // FC-FD would start 6-byte sequences
  43         // ...
  44 };
  45 Uchar utf8_range[5] = {
  46         1,       // invalid - let's not allow the creation of 0-bytes :P
  47         1,       // ascii minimum
  48         0x80,    // 2-byte minimum
  49         0x800,   // 3-byte minimum
  50         0x10000, // 4-byte minimum
  51 };
  52
  53 /** Analyze the next character and return various information if requested.
  54  * @param _s      An utf-8 string.
  55  * @param _start  Filled with the start byte-offset of the next valid character
  56  * @param _len    Fileed with the length of the next valid character
  57  * @param _ch     Filled with the unicode value of the next character
  58  * @param _maxlen Maximum number of bytes to read from _s
  59  * @return        Whether or not another valid character is in the string
  60  */
  61 #define U8_ANALYZE_INFINITY 7
  62 static qboolean u8_analyze(const char *_s, size_t *_start, size_t *_len, Uchar *_ch, size_t _maxlen)
  63 {
  64         const unsigned char *s = (const unsigned char*)_s;
  65         unsigned char bt;//, bc;
  66         size_t i;
  67         size_t bits, j;
  68         Uchar ch;
  69
  70         i = 0;
  71 findchar:
  72         while (i < _maxlen && s[i] && (bits = utf8_lengths[s[i]]) == 0)
  73                 ++i;
  74
  75         if (i >= _maxlen || !s[i]) {
  76                 if (_start) *_start = i;
  77                 if (_len) *_len = 0;
  78                 return false;
  79         }
  80
  81         if (bits == 1) { // ascii
  82                 if (_start) *_start = i;
  83                 if (_len) *_len = 1;
  84                 if (_ch) *_ch = (Uchar)s[i];
  85                 return true;
  86         }
  87
  88         ch = (s[i] & (0xFF >> bits));
  89         for (j = 1; j < bits; ++j)
  90         {
  91                 if ( (s[i+j] & 0xC0) != 0x80 )
  92                 {
  93                         i += j;
  94                         goto findchar;
  95                 }
  96                 ch = (ch << 6) | (s[i+j] & 0x3F);
  97         }
  98         if (ch < utf8_range[bits] || ch >= 0x10FFFF)
  99         {
 100                 i += bits;
 101                 goto findchar;
 102         }
 103 #if 0
 104         // <0xC2 is always an overlong encoding, they're invalid, thus skipped
 105         while (i < _maxlen && s[i] && s[i] >= 0x80 && s[i] < 0xC2) {
 106                 //fprintf(stderr, "skipping\n");
 107                 ++i;
 108         }
 109
 110         // If we hit the end, well, we're out and invalid
 111         if(i >= _maxlen || !s[i]) {
 112                 if (_start) *_start = i;
 113                 if (_len) *_len = 0;
 114                 return false;
 115         }
 116
 117         // I'll leave that in - if you remove it, also change the part below
 118         // to support 1-byte chars correctly
 119         if (s[i] < 0x80)
 120         {
 121                 if (_start) *_start = i;
 122                 if (_len) *_len = 1;
 123                 if (_ch) *_ch = (Uchar)s[i];
 124                 //fprintf(stderr, "valid ascii\n");
 125                 return true;
 126         }
 127
 128         // Figure out the next char's length
 129         bc = s[i];
 130         bits = 1;
 131         // count the 1 bits, they're the # of bytes
 132         for (bt = 0x40; bt && (bc & bt); bt >>= 1, ++bits);
 133         if (!bt)
 134         {
 135                 //fprintf(stderr, "superlong\n");
 136                 ++i;
 137                 goto findchar;
 138         }
 139         if(i + bits > _maxlen) {
 140                 /*
 141                 if (_start) *_start = i;
 142                 if (_len) *_len = 0;
 143                 return false;
 144                 */
 145                 ++i;
 146                 goto findchar;
 147         }
 148         // turn bt into a mask and give ch a starting value
 149         --bt;
 150         ch = (s[i] & bt);
 151         // check the byte sequence for invalid bytes
 152         for (j = 1; j < bits; ++j)
 153         {
 154                 // valid bit value: 10xx xxxx
 155                 //if (s[i+j] < 0x80 || s[i+j] >= 0xC0)
 156                 if ( (s[i+j] & 0xC0) != 0x80 )
 157                 {
 158                         //fprintf(stderr, "sequence of %i f'd at %i by %x\n", bits, j, (unsigned int)s[i+j]);
 159                         // this byte sequence is invalid, skip it
 160                         i += j;
 161                         // find a character after it
 162                         goto findchar;
 163                 }
 164                 // at the same time, decode the character
 165                 ch = (ch << 6) | (s[i+j] & 0x3F);
 166         }
 167
 168         // Now check the decoded byte for an overlong encoding
 169         if ( (bits >= 2 && ch < 0x80) ||
 170              (bits >= 3 && ch < 0x800) ||
 171              (bits >= 4 && ch < 0x10000) ||
 172              ch >= 0x10FFFF // RFC 3629
 173                 )
 174         {
 175                 i += bits;
 176                 //fprintf(stderr, "overlong: %i bytes for %x\n", bits, ch);
 177                 goto findchar;
 178         }
 179 #endif
 180
 181         if (_start)
 182                 *_start = i;
 183         if (_len)
 184                 *_len = bits;
 185         if (_ch)
 186                 *_ch = ch;
 187         //fprintf(stderr, "valid utf8\n");
 188         return true;
 189 }
 190
 191 /** Get the number of characters in an UTF-8 string.
 192  * @param _s    An utf-8 encoded null-terminated string.
 193  * @return      The number of unicode characters in the string.
 194  */
 195 size_t u8_strlen(const char *_s)
 196 {
 197         size_t st, ln;
 198         size_t len = 0;
 199         const unsigned char *s = (const unsigned char*)_s;
 200
 201         if (!utf8_enable.integer)
 202                 return strlen(_s);
 203
 204         while (*s)
 205         {
 206                 // ascii char, skip u8_analyze
 207                 if (*s < 0x80)
 208                 {
 209                         ++len;
 210                         ++s;
 211                         continue;
 212                 }
 213
 214                 // invalid, skip u8_analyze
 215                 if (*s < 0xC2)
 216                 {
 217                         ++s;
 218                         continue;
 219                 }
 220
 221                 if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 222                         break;
 223                 // valid character, skip after it
 224                 s += st + ln;
 225                 ++len;
 226         }
 227         return len;
 228 }
 229
 230 /** Get the number of characters in a part of an UTF-8 string.
 231  * @param _s    An utf-8 encoded null-terminated string.
 232  * @param n     The maximum number of bytes.
 233  * @return      The number of unicode characters in the string.
 234  */
 235 size_t u8_strnlen(const char *_s, size_t n)
 236 {
 237         size_t st, ln;
 238         size_t len = 0;
 239         const unsigned char *s = (const unsigned char*)_s;
 240
 241         if (!utf8_enable.integer)
 242         {
 243                 len = strlen(_s);
 244                 return (len < n) ? len : n;
 245         }
 246
 247         while (*s && n)
 248         {
 249                 // ascii char, skip u8_analyze
 250                 if (*s < 0x80)
 251                 {
 252                         ++len;
 253                         ++s;
 254                         --n;
 255                         continue;
 256                 }
 257
 258                 // invalid, skip u8_analyze
 259                 if (*s < 0xC2)
 260                 {
 261                         ++s;
 262                         --n;
 263                         continue;
 264                 }
 265
 266                 if (!u8_analyze((const char*)s, &st, &ln, NULL, n))
 267                         break;
 268                 // valid character, see if it's still inside the range specified by n:
 269                 if (n < st + ln)
 270                         return len;
 271                 ++len;
 272                 n -= st + ln;
 273                 s += st + ln;
 274         }
 275         return len;
 276 }
 277
 278 /** Get the number of bytes used in a string to represent an amount of characters.
 279  * @param _s    An utf-8 encoded null-terminated string.
 280  * @param n     The number of characters we want to know the byte-size for.
 281  * @return      The number of bytes used to represent n characters.
 282  */
 283 size_t u8_bytelen(const char *_s, size_t n)
 284 {
 285         size_t st, ln;
 286         size_t len = 0;
 287         const unsigned char *s = (const unsigned char*)_s;
 288
 289         if (!utf8_enable.integer) {
 290                 len = strlen(_s);
 291                 return (len < n) ? len : n;
 292         }
 293
 294         while (*s && n)
 295         {
 296                 // ascii char, skip u8_analyze
 297                 if (*s < 0x80)
 298                 {
 299                         ++len;
 300                         ++s;
 301                         --n;
 302                         continue;
 303                 }
 304
 305                 // invalid, skip u8_analyze
 306                 if (*s < 0xC2)
 307                 {
 308                         ++s;
 309                         ++len;
 310                         continue;
 311                 }
 312
 313                 if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 314                         break;
 315                 --n;
 316                 s += st + ln;
 317                 len += st + ln;
 318         }
 319         return len;
 320 }
 321
 322 /** Get the byte-index for a character-index.
 323  * @param _s      An utf-8 encoded string.
 324  * @param i       The character-index for which you want the byte offset.
 325  * @param len     If not null, character's length will be stored in there.
 326  * @return        The byte-index at which the character begins, or -1 if the string is too short.
 327  */
 328 int u8_byteofs(const char *_s, size_t i, size_t *len)
 329 {
 330         size_t st, ln;
 331         size_t ofs = 0;
 332         const unsigned char *s = (const unsigned char*)_s;
 333
 334         if (!utf8_enable.integer)
 335         {
 336                 if (strlen(_s) < i)
 337                 {
 338                         if (len) *len = 0;
 339                         return -1;
 340                 }
 341
 342                 if (len) *len = 1;
 343                 return i;
 344         }
 345
 346         st = ln = 0;
 347         do
 348         {
 349                 ofs += ln;
 350                 if (!u8_analyze((const char*)s + ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 351                         return -1;
 352                 ofs += st;
 353         } while(i-- > 0);
 354         if (len)
 355                 *len = ln;
 356         return ofs;
 357 }
 358
 359 /** Get the char-index for a byte-index.
 360  * @param _s      An utf-8 encoded string.
 361  * @param i       The byte offset for which you want the character index.
 362  * @param len     If not null, the offset within the character is stored here.
 363  * @return        The character-index, or -1 if the string is too short.
 364  */
 365 int u8_charidx(const char *_s, size_t i, size_t *len)
 366 {
 367         size_t st, ln;
 368         size_t ofs = 0;
 369         size_t pofs = 0;
 370         int idx = 0;
 371         const unsigned char *s = (const unsigned char*)_s;
 372
 373         if (!utf8_enable.integer)
 374         {
 375                 if (len) *len = 0;
 376                 return i;
 377         }
 378
 379         while (ofs < i && s[ofs])
 380         {
 381                 // ascii character, skip u8_analyze
 382                 if (s[ofs] < 0x80)
 383                 {
 384                         pofs = ofs;
 385                         ++idx;
 386                         ++ofs;
 387                         continue;
 388                 }
 389
 390                 // invalid, skip u8_analyze
 391                 if (s[ofs] < 0xC2)
 392                 {
 393                         ++ofs;
 394                         continue;
 395                 }
 396
 397                 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 398                         return -1;
 399                 // see if next char is after the bytemark
 400                 if (ofs + st > i)
 401                 {
 402                         if (len)
 403                                 *len = i - pofs;
 404                         return idx;
 405                 }
 406                 ++idx;
 407                 pofs = ofs + st;
 408                 ofs += st + ln;
 409                 // see if bytemark is within the char
 410                 if (ofs > i)
 411                 {
 412                         if (len)
 413                                 *len = i - pofs;
 414                         return idx;
 415                 }
 416         }
 417         if (len) *len = 0;
 418         return idx;
 419 }
 420
 421 /** Get the byte offset of the previous byte.
 422  * The result equals:
 423  * prevchar_pos = u8_byteofs(text, u8_charidx(text, thischar_pos, NULL) - 1, NULL)
 424  * @param _s      An utf-8 encoded string.
 425  * @param i       The current byte offset.
 426  * @return        The byte offset of the previous character
 427  */
 428 size_t u8_prevbyte(const char *_s, size_t i)
 429 {
 430         size_t st, ln;
 431         const unsigned char *s = (const unsigned char*)_s;
 432         size_t lastofs = 0;
 433         size_t ofs = 0;
 434
 435         if (!utf8_enable.integer)
 436         {
 437                 if (i > 0)
 438                         return i-1;
 439                 return 0;
 440         }
 441
 442         while (ofs < i && s[ofs])
 443         {
 444                 // ascii character, skip u8_analyze
 445                 if (s[ofs] < 0x80)
 446                 {
 447                         lastofs = ofs++;
 448                         continue;
 449                 }
 450
 451                 // invalid, skip u8_analyze
 452                 if (s[ofs] < 0xC2)
 453                 {
 454                         ++ofs;
 455                         continue;
 456                 }
 457
 458                 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 459                         return lastofs;
 460                 if (ofs + st > i)
 461                         return lastofs;
 462                 if (ofs + st + ln >= i)
 463                         return ofs + st;
 464
 465                 lastofs = ofs;
 466                 ofs += st + ln;
 467         }
 468         return lastofs;
 469 }
 470
 471 Uchar u8_quake2utf8map[256] = {
 472         0xE000, 0xE001, 0xE002, 0xE003, 0xE004, 0xE005, 0xE006, 0xE007, 0xE008, 0xE009, 0xE00A, 0xE00B, 0xE00C, 0xE00D, 0xE00E, 0xE00F, // specials
 473         0xE010, 0xE011, 0xE012, 0xE013, 0xE014, 0xE015, 0xE016, 0xE017, 0xE018, 0xE019, 0xE01A, 0xE01B, 0xE01C, 0xE01D, 0xE01E, 0xE01F, // specials
 474         0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, // shift+digit line
 475         0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // digits
 476         0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // caps
 477         0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, // caps
 478         0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, // small
 479         0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, // small
 480         0xE080, 0xE081, 0xE082, 0xE083, 0xE084, 0xE085, 0xE086, 0xE087, 0xE088, 0xE089, 0xE08A, 0xE08B, 0xE08C, 0xE08D, 0xE08E, 0xE08F, // specials
 481         0xE090, 0xE091, 0xE092, 0xE093, 0xE094, 0xE095, 0xE096, 0xE097, 0xE098, 0xE099, 0xE09A, 0xE09B, 0xE09C, 0xE09D, 0xE09E, 0xE09F, // faces
 482         0xE0A0, 0xE0A1, 0xE0A2, 0xE0A3, 0xE0A4, 0xE0A5, 0xE0A6, 0xE0A7, 0xE0A8, 0xE0A9, 0xE0AA, 0xE0AB, 0xE0AC, 0xE0AD, 0xE0AE, 0xE0AF,
 483         0xE0B0, 0xE0B1, 0xE0B2, 0xE0B3, 0xE0B4, 0xE0B5, 0xE0B6, 0xE0B7, 0xE0B8, 0xE0B9, 0xE0BA, 0xE0BB, 0xE0BC, 0xE0BD, 0xE0BE, 0xE0BF,
 484         0xE0C0, 0xE0C1, 0xE0C2, 0xE0C3, 0xE0C4, 0xE0C5, 0xE0C6, 0xE0C7, 0xE0C8, 0xE0C9, 0xE0CA, 0xE0CB, 0xE0CC, 0xE0CD, 0xE0CE, 0xE0CF,
 485         0xE0D0, 0xE0D1, 0xE0D2, 0xE0D3, 0xE0D4, 0xE0D5, 0xE0D6, 0xE0D7, 0xE0D8, 0xE0D9, 0xE0DA, 0xE0DB, 0xE0DC, 0xE0DD, 0xE0DE, 0xE0DF,
 486         0xE0E0, 0xE0E1, 0xE0E2, 0xE0E3, 0xE0E4, 0xE0E5, 0xE0E6, 0xE0E7, 0xE0E8, 0xE0E9, 0xE0EA, 0xE0EB, 0xE0EC, 0xE0ED, 0xE0EE, 0xE0EF,
 487         0xE0F0, 0xE0F1, 0xE0F2, 0xE0F3, 0xE0F4, 0xE0F5, 0xE0F6, 0xE0F7, 0xE0F8, 0xE0F9, 0xE0FA, 0xE0FB, 0xE0FC, 0xE0FD, 0xE0FE, 0xE0FF,
 488 };
 489
 490 /** Fetch a character from an utf-8 encoded string.
 491  * @param _s      The start of an utf-8 encoded multi-byte character.
 492  * @param _end    Will point to after the first multi-byte character.
 493  * @return        The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
 494  */
 495 Uchar u8_getchar_utf8_enabled(const char *_s, const char **_end)
 496 {
 497         size_t st, ln;
 498         Uchar ch;
 499
 500         if (!u8_analyze(_s, &st, &ln, &ch, U8_ANALYZE_INFINITY))
 501                 ch = 0;
 502         if (_end)
 503                 *_end = _s + st + ln;
 504         return ch;
 505 }
 506
 507 /** Fetch a character from an utf-8 encoded string.
 508  * @param _s      The start of an utf-8 encoded multi-byte character.
 509  * @param _end    Will point to after the first multi-byte character.
 510  * @return        The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
 511  */
 512 Uchar u8_getnchar_utf8_enabled(const char *_s, const char **_end, size_t _maxlen)
 513 {
 514         size_t st, ln;
 515         Uchar ch;
 516
 517         if (!u8_analyze(_s, &st, &ln, &ch, _maxlen))
 518                 ch = 0;
 519         if (_end)
 520                 *_end = _s + st + ln;
 521         return ch;
 522 }
 523
 524 /** Encode a wide-character into utf-8.
 525  * @param w        The wide character to encode.
 526  * @param to       The target buffer the utf-8 encoded string is stored to.
 527  * @param maxlen   The maximum number of bytes that fit into the target buffer.
 528  * @return         Number of bytes written to the buffer not including the terminating null.
 529  *                 Less or equal to 0 if the buffer is too small.
 530  */
 531 int u8_fromchar(Uchar w, char *to, size_t maxlen)
 532 {
 533         if (maxlen < 1)
 534                 return 0;
 535
 536         if (!w)
 537                 return 0;
 538
 539         if (w >= 0xE000 && !utf8_enable.integer)
 540                 w -= 0xE000;
 541
 542         if (w < 0x80 || !utf8_enable.integer)
 543         {
 544                 to[0] = (char)w;
 545                 if (maxlen < 2)
 546                         return -1;
 547                 to[1] = 0;
 548                 return 1;
 549         }
 550         // for a little speedup
 551         if (w < 0x800)
 552         {
 553                 if (maxlen < 3)
 554                 {
 555                         to[0] = 0;
 556                         return -1;
 557                 }
 558                 to[2] = 0;
 559                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 560                 to[0] = 0xC0 | w;
 561                 return 2;
 562         }
 563         if (w < 0x10000)
 564         {
 565                 if (maxlen < 4)
 566                 {
 567                         to[0] = 0;
 568                         return -1;
 569                 }
 570                 to[3] = 0;
 571                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 572                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 573                 to[0] = 0xE0 | w;
 574                 return 3;
 575         }
 576
 577         // RFC 3629
 578         if (w <= 0x10FFFF)
 579         {
 580                 if (maxlen < 5)
 581                 {
 582                         to[0] = 0;
 583                         return -1;
 584                 }
 585                 to[4] = 0;
 586                 to[3] = 0x80 | (w & 0x3F); w >>= 6;
 587                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 588                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 589                 to[0] = 0xE0 | w;
 590                 return 4;
 591         }
 592         return 0;
 593 }
 594
 595 /** uses u8_fromchar on a static buffer
 596  * @param ch        The unicode character to convert to encode
 597  * @param l         The number of bytes without the terminating null.
 598  * @return          A statically allocated buffer containing the character's utf8 representation, or NULL if it fails.
 599  */
 600 char *u8_encodech(Uchar ch, size_t *l)
 601 {
 602         static char buf[16];
 603         size_t len;
 604         len = u8_fromchar(ch, buf, sizeof(buf));
 605         if (len > 0)
 606         {
 607                 if (l) *l = len;
 608                 return buf;
 609         }
 610         return NULL;
 611 }
 612
 613 /** Convert a utf-8 multibyte string to a wide character string.
 614  * @param wcs       The target wide-character buffer.
 615  * @param mb        The utf-8 encoded multibyte string to convert.
 616  * @param maxlen    The maximum number of wide-characters that fit into the target buffer.
 617  * @return          The number of characters written to the target buffer.
 618  */
 619 size_t u8_mbstowcs(Uchar *wcs, const char *mb, size_t maxlen)
 620 {
 621         size_t i;
 622         Uchar ch;
 623         if (maxlen < 1)
 624                 return 0;
 625         for (i = 0; *mb && i < maxlen-1; ++i)
 626         {
 627                 ch = u8_getchar(mb, &mb);
 628                 if (!ch)
 629                         break;
 630                 wcs[i] = ch;
 631         }
 632         wcs[i] = 0;
 633         return i;
 634 }
 635
 636 /** Convert a wide-character string to a utf-8 multibyte string.
 637  * @param mb      The target buffer the utf-8 string is written to.
 638  * @param wcs     The wide-character string to convert.
 639  * @param maxlen  The number bytes that fit into the multibyte target buffer.
 640  * @return        The number of bytes written, not including the terminating \0
 641  */
 642 size_t u8_wcstombs(char *mb, const Uchar *wcs, size_t maxlen)
 643 {
 644         size_t i;
 645         const char *start = mb;
 646         if (maxlen < 2)
 647                 return 0;
 648         for (i = 0; wcs[i] && i < maxlen-1; ++i)
 649         {
 650                 /*
 651                 int len;
 652                 if ( (len = u8_fromchar(wcs[i], mb, maxlen - i)) < 0)
 653                         return (mb - start);
 654                 mb += len;
 655                 */
 656                 mb += u8_fromchar(wcs[i], mb, maxlen - i);
 657         }
 658         *mb = 0;
 659         return (mb - start);
 660 }
 661
 662 /*
 663 ============
 664 UTF-8 aware COM_StringLengthNoColors
 665
 666 calculates the visible width of a color coded string.
 667
 668 *valid is filled with TRUE if the string is a valid colored string (that is, if
 669 it does not end with an unfinished color code). If it gets filled with FALSE, a
 670 fix would be adding a STRING_COLOR_TAG at the end of the string.
 671
 672 valid can be set to NULL if the caller doesn't care.
 673
 674 For size_s, specify the maximum number of characters from s to use, or 0 to use
 675 all characters until the zero terminator.
 676 ============
 677 */
 678 size_t
 679 COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid);
 680 size_t
 681 u8_COM_StringLengthNoColors(const char *_s, size_t size_s, qboolean *valid)
 682 {
 683         const unsigned char *s = (const unsigned char*)_s;
 684         const unsigned char *end;
 685         size_t len = 0;
 686
 687         if (!utf8_enable.integer)
 688                 return COM_StringLengthNoColors(_s, size_s, valid);
 689
 690         end = size_s ? (s + size_s) : NULL;
 691
 692         for(;;)
 693         {
 694                 switch((s == end) ? 0 : *s)
 695                 {
 696                         case 0:
 697                                 if(valid)
 698                                         *valid = TRUE;
 699                                 return len;
 700                         case STRING_COLOR_TAG:
 701                                 ++s;
 702                                 switch((s == end) ? 0 : *s)
 703                                 {
 704                                         case STRING_COLOR_RGB_TAG_CHAR:
 705                                                 if (s+1 != end && isxdigit(s[1]) &&
 706                                                         s+2 != end && isxdigit(s[2]) &&
 707                                                         s+3 != end && isxdigit(s[3]) )
 708                                                 {
 709                                                         s+=3;
 710                                                         break;
 711                                                 }
 712                                                 ++len; // STRING_COLOR_TAG
 713                                                 ++len; // STRING_COLOR_RGB_TAG_CHAR
 714                                                 break;
 715                                         case 0: // ends with unfinished color code!
 716                                                 ++len;
 717                                                 if(valid)
 718                                                         *valid = FALSE;
 719                                                 return len;
 720                                         case STRING_COLOR_TAG: // escaped ^
 721                                                 ++len;
 722                                                 break;
 723                                         case '0': case '1': case '2': case '3': case '4':
 724                                         case '5': case '6': case '7': case '8': case '9': // color code
 725                                                 break;
 726                                         default: // not a color code
 727                                                 ++len; // STRING_COLOR_TAG
 728                                                 ++len; // the character
 729                                                 break;
 730                                 }
 731                                 break;
 732                         default:
 733                                 ++len;
 734                                 break;
 735                 }
 736
 737                 // start of a wide character
 738                 if (*s & 0xC0)
 739                 {
 740                         for (++s; *s >= 0x80 && *s <= 0xC0; ++s);
 741                         continue;
 742                 }
 743                 // part of a wide character, we ignore that one
 744                 if (*s <= 0xBF)
 745                         --len;
 746                 ++s;
 747         }
 748         // never get here
 749 }
 750
 751 /** Pads a utf-8 string
 752  * @param out     The target buffer the utf-8 string is written to.
 753  * @param outsize The size of the target buffer, including the final NUL
 754  * @param in      The input utf-8 buffer
 755  * @param leftalign Left align the output string (by default right alignment is done)
 756  * @param minwidth The minimum output width
 757  * @param maxwidth The maximum output width
 758  * @return        The number of bytes written, not including the terminating \0
 759  */
 760 size_t u8_strpad(char *out, size_t outsize, const char *in, qboolean leftalign, size_t minwidth, size_t maxwidth)
 761 {
 762         if(!utf8_enable.integer)
 763         {
 764                 return dpsnprintf(out, outsize, "%*.*s", leftalign ? -(int) minwidth : (int) minwidth, (int) maxwidth, in);
 765         }
 766         else
 767         {
 768                 size_t l = u8_bytelen(in, maxwidth);
 769                 size_t actual_width = u8_strnlen(in, l);
 770                 int pad = (actual_width >= minwidth) ? 0 : (minwidth - actual_width);
 771                 int prec = l;
 772                 int lpad = leftalign ? 0 : pad;
 773                 int rpad = leftalign ? pad : 0;
 774                 return dpsnprintf(out, outsize, "%*s%.*s%*s", lpad, "", prec, in, rpad, "");
 775         }
 776 }