utf8lib.c

   1 #include "quakedef.h"
   2 #include "utf8lib.h"
   3
   4 /*
   5 ================================================================================
   6 Initialization of UTF-8 support and new cvars.
   7 ================================================================================
   8 */
   9 // for compatibility this defaults to 0
  10 cvar_t    utf8_enable = {CVAR_SAVE, "utf8_enable", "0", "Enable UTF-8 support. For compatibility, this is disabled by default in most games."};
  11
  12 void   u8_Init(void)
  13 {
  14         Cvar_RegisterVariable(&utf8_enable);
  15 }
  16
  17 /*
  18 ================================================================================
  19 UTF-8 encoding and decoding functions follow.
  20 ================================================================================
  21 */
  22
  23 /** Analyze the next character and return various information if requested.
  24  * @param _s      An utf-8 string.
  25  * @param _start  Filled with the start byte-offset of the next valid character
  26  * @param _len    Fileed with the length of the next valid character
  27  * @param _ch     Filled with the unicode value of the next character
  28  * @param _maxlen Maximum number of bytes to read from _s
  29  * @return        Whether or not another valid character is in the string
  30  */
  31 #define U8_ANALYZE_INFINITY 7
  32 static qboolean u8_analyze(const char *_s, size_t *_start, size_t *_len, Uchar *_ch, size_t _maxlen)
  33 {
  34         const unsigned char *s = (const unsigned char*)_s;
  35         unsigned char bt, bc;
  36         size_t i;
  37         size_t bits, j;
  38         Uchar ch;
  39
  40         i = 0;
  41 findchar:
  42
  43         // <0xC2 is always an overlong encoding, they're invalid, thus skipped
  44         while (i < _maxlen && s[i] && s[i] >= 0x80 && s[i] < 0xC2) {
  45                 //fprintf(stderr, "skipping\n");
  46                 ++i;
  47         }
  48
  49         //fprintf(stderr, "checking\n");
  50         // If we hit the end, well, we're out and invalid
  51         if(i >= _maxlen || !s[i]) {
  52                 if (_start) *_start = i;
  53                 if (_len) *_len = 0;
  54                 return false;
  55         }
  56
  57         //fprintf(stderr, "checking ascii\n");
  58         // ascii characters
  59         if (s[i] < 0x80)
  60         {
  61                 if (_start) *_start = i;
  62                 if (_len) *_len = 1;
  63                 if (_ch) *_ch = (Uchar)s[i];
  64                 //fprintf(stderr, "valid ascii\n");
  65                 return true;
  66         }
  67         //fprintf(stderr, "checking length\n");
  68
  69         // Figure out the next char's length
  70         bc = s[i];
  71         bits = 1;
  72         // count the 1 bits, they're the # of bytes
  73         for (bt = 0x40; bt && (bc & bt); bt >>= 1, ++bits);
  74         if (!bt)
  75         {
  76                 //fprintf(stderr, "superlong\n");
  77                 ++i;
  78                 goto findchar;
  79         }
  80         if(i + bits > _maxlen) {
  81                 /*
  82                 if (_start) *_start = i;
  83                 if (_len) *_len = 0;
  84                 return false;
  85                 */
  86                 ++i;
  87                 goto findchar;
  88         }
  89         // turn bt into a mask and give ch a starting value
  90         --bt;
  91         ch = (s[i] & bt);
  92         // check the byte sequence for invalid bytes
  93         for (j = 1; j < bits; ++j)
  94         {
  95                 // valid bit value: 10xx xxxx
  96                 //if (s[i+j] < 0x80 || s[i+j] >= 0xC0)
  97                 if ( (s[i+j] & 0xC0) != 0x80 )
  98                 {
  99                         //fprintf(stderr, "sequence of %i f'd at %i by %x\n", bits, j, (unsigned int)s[i+j]);
 100                         // this byte sequence is invalid, skip it
 101                         i += j;
 102                         // find a character after it
 103                         goto findchar;
 104                 }
 105                 // at the same time, decode the character
 106                 ch = (ch << 6) | (s[i+j] & 0x3F);
 107         }
 108
 109         // Now check the decoded byte for an overlong encoding
 110         if ( (bits >= 2 && ch < 0x80) ||
 111              (bits >= 3 && ch < 0x800) ||
 112              (bits >= 4 && ch < 0x10000) ||
 113              ch >= 0x10FFFF // RFC 3629
 114                 )
 115         {
 116                 i += bits;
 117                 //fprintf(stderr, "overlong: %i bytes for %x\n", bits, ch);
 118                 goto findchar;
 119         }
 120
 121         if (_start)
 122                 *_start = i;
 123         if (_len)
 124                 *_len = bits;
 125         if (_ch)
 126                 *_ch = ch;
 127         //fprintf(stderr, "valid utf8\n");
 128         return true;
 129 }
 130
 131 /** Get the number of characters in an UTF-8 string.
 132  * @param _s    An utf-8 encoded null-terminated string.
 133  * @return      The number of unicode characters in the string.
 134  */
 135 size_t u8_strlen(const char *_s)
 136 {
 137         size_t st, ln;
 138         size_t len = 0;
 139         const unsigned char *s = (const unsigned char*)_s;
 140
 141         if (!utf8_enable.integer)
 142                 return strlen(_s);
 143
 144         while (*s)
 145         {
 146                 // ascii char, skip u8_analyze
 147                 if (*s < 0x80)
 148                 {
 149                         ++len;
 150                         ++s;
 151                         continue;
 152                 }
 153
 154                 // invalid, skip u8_analyze
 155                 if (*s < 0xC2)
 156                 {
 157                         ++s;
 158                         continue;
 159                 }
 160
 161                 if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 162                         break;
 163                 // valid character, skip after it
 164                 s += st + ln;
 165                 ++len;
 166         }
 167         return len;
 168 }
 169
 170 /** Get the number of characters in a part of an UTF-8 string.
 171  * @param _s    An utf-8 encoded null-terminated string.
 172  * @param n     The maximum number of bytes.
 173  * @return      The number of unicode characters in the string.
 174  */
 175 size_t u8_strnlen(const char *_s, size_t n)
 176 {
 177         size_t st, ln;
 178         size_t len = 0;
 179         const unsigned char *s = (const unsigned char*)_s;
 180
 181         if (!utf8_enable.integer)
 182         {
 183                 len = strlen(_s);
 184                 return (len < n) ? len : n;
 185         }
 186
 187         while (*s && n)
 188         {
 189                 // ascii char, skip u8_analyze
 190                 if (*s < 0x80)
 191                 {
 192                         ++len;
 193                         ++s;
 194                         --n;
 195                         continue;
 196                 }
 197
 198                 // invalid, skip u8_analyze
 199                 if (*s < 0xC2)
 200                 {
 201                         ++s;
 202                         --n;
 203                         continue;
 204                 }
 205
 206                 if (!u8_analyze((const char*)s, &st, &ln, NULL, n))
 207                         break;
 208                 // valid character, see if it's still inside the range specified by n:
 209                 if (n < st + ln)
 210                         return len;
 211                 ++len;
 212                 n -= st + ln;
 213                 s += st + ln;
 214         }
 215         return len;
 216 }
 217
 218 /** Get the number of bytes used in a string to represent an amount of characters.
 219  * @param _s    An utf-8 encoded null-terminated string.
 220  * @param n     The number of characters we want to know the byte-size for.
 221  * @return      The number of bytes used to represent n characters.
 222  */
 223 size_t u8_bytelen(const char *_s, size_t n)
 224 {
 225         size_t st, ln;
 226         size_t len = 0;
 227         const unsigned char *s = (const unsigned char*)_s;
 228
 229         if (!utf8_enable.integer) {
 230                 len = strlen(_s);
 231                 return (len < n) ? len : n;
 232         }
 233
 234         while (*s && n)
 235         {
 236                 // ascii char, skip u8_analyze
 237                 if (*s < 0x80)
 238                 {
 239                         ++len;
 240                         ++s;
 241                         --n;
 242                         continue;
 243                 }
 244
 245                 // invalid, skip u8_analyze
 246                 if (*s < 0xC2)
 247                 {
 248                         ++s;
 249                         ++len;
 250                         continue;
 251                 }
 252
 253                 if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 254                         break;
 255                 --n;
 256                 s += st + ln;
 257                 len += st + ln;
 258         }
 259         return len;
 260 }
 261
 262 /** Get the byte-index for a character-index.
 263  * @param _s      An utf-8 encoded string.
 264  * @param i       The character-index for which you want the byte offset.
 265  * @param len     If not null, character's length will be stored in there.
 266  * @return        The byte-index at which the character begins, or -1 if the string is too short.
 267  */
 268 int u8_byteofs(const char *_s, size_t i, size_t *len)
 269 {
 270         size_t st, ln;
 271         size_t ofs = 0;
 272         const unsigned char *s = (const unsigned char*)_s;
 273
 274         if (!utf8_enable.integer)
 275         {
 276                 if (strlen(_s) < i)
 277                 {
 278                         if (len) *len = 0;
 279                         return -1;
 280                 }
 281
 282                 if (len) *len = 1;
 283                 return i;
 284         }
 285
 286         st = ln = 0;
 287         do
 288         {
 289                 ofs += ln;
 290                 if (!u8_analyze((const char*)s + ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 291                         return -1;
 292                 ofs += st;
 293         } while(i-- > 0);
 294         if (len)
 295                 *len = ln;
 296         return ofs;
 297 }
 298
 299 /** Get the char-index for a byte-index.
 300  * @param _s      An utf-8 encoded string.
 301  * @param i       The byte offset for which you want the character index.
 302  * @param len     If not null, the offset within the character is stored here.
 303  * @return        The character-index, or -1 if the string is too short.
 304  */
 305 int u8_charidx(const char *_s, size_t i, size_t *len)
 306 {
 307         size_t st, ln;
 308         size_t ofs = 0;
 309         size_t pofs = 0;
 310         int idx = 0;
 311         const unsigned char *s = (const unsigned char*)_s;
 312
 313         if (!utf8_enable.integer)
 314         {
 315                 if (len) *len = 0;
 316                 return i;
 317         }
 318
 319         while (ofs < i && s[ofs])
 320         {
 321                 // ascii character, skip u8_analyze
 322                 if (s[ofs] < 0x80)
 323                 {
 324                         pofs = ofs;
 325                         ++idx;
 326                         ++ofs;
 327                         continue;
 328                 }
 329
 330                 // invalid, skip u8_analyze
 331                 if (s[ofs] < 0xC2)
 332                 {
 333                         ++ofs;
 334                         continue;
 335                 }
 336
 337                 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 338                         return -1;
 339                 // see if next char is after the bytemark
 340                 if (ofs + st > i)
 341                 {
 342                         if (len)
 343                                 *len = i - pofs;
 344                         return idx;
 345                 }
 346                 ++idx;
 347                 pofs = ofs + st;
 348                 ofs += st + ln;
 349                 // see if bytemark is within the char
 350                 if (ofs > i)
 351                 {
 352                         if (len)
 353                                 *len = i - pofs;
 354                         return idx;
 355                 }
 356         }
 357         if (len) *len = 0;
 358         return idx;
 359 }
 360
 361 /** Get the byte offset of the previous byte.
 362  * The result equals:
 363  * prevchar_pos = u8_byteofs(text, u8_charidx(text, thischar_pos, NULL) - 1, NULL)
 364  * @param _s      An utf-8 encoded string.
 365  * @param i       The current byte offset.
 366  * @return        The byte offset of the previous character
 367  */
 368 size_t u8_prevbyte(const char *_s, size_t i)
 369 {
 370         size_t st, ln;
 371         const unsigned char *s = (const unsigned char*)_s;
 372         size_t lastofs = 0;
 373         size_t ofs = 0;
 374
 375         if (!utf8_enable.integer)
 376         {
 377                 if (i > 0)
 378                         return i-1;
 379                 return 0;
 380         }
 381
 382         while (ofs < i && s[ofs])
 383         {
 384                 // ascii character, skip u8_analyze
 385                 if (s[ofs] < 0x80)
 386                 {
 387                         lastofs = ofs++;
 388                         continue;
 389                 }
 390
 391                 // invalid, skip u8_analyze
 392                 if (s[ofs] < 0xC2)
 393                 {
 394                         ++ofs;
 395                         continue;
 396                 }
 397
 398                 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
 399                         return lastofs;
 400                 if (ofs + st > i)
 401                         return lastofs;
 402                 if (ofs + st + ln >= i)
 403                         return ofs + st;
 404
 405                 lastofs = ofs;
 406                 ofs += st + ln;
 407         }
 408         return lastofs;
 409 }
 410
// 256-entry lookup table indexed by an 8-bit character value.
// Indices 0x20..0x7F map to themselves; every other index maps to
// 0xE000 + index. u8_fromchar() undoes that offset by subtracting 0xE000
// when utf8_enable is off.
// NOTE(review): judging by the name, this translates Quake charset bytes to
// unicode values - presumably used by the non-UTF-8 character fetch path; confirm.
Uchar u8_quake2utf8map[256] = {
	0xE000, 0xE001, 0xE002, 0xE003, 0xE004, 0xE005, 0xE006, 0xE007, 0xE008, 0xE009, 0xE00A, 0xE00B, 0xE00C, 0xE00D, 0xE00E, 0xE00F, // specials
	0xE010, 0xE011, 0xE012, 0xE013, 0xE014, 0xE015, 0xE016, 0xE017, 0xE018, 0xE019, 0xE01A, 0xE01B, 0xE01C, 0xE01D, 0xE01E, 0xE01F, // specials
	0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, // shift+digit line
	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // digits
	0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // caps
	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, // caps
	0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, // small
	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, // small
	0xE080, 0xE081, 0xE082, 0xE083, 0xE084, 0xE085, 0xE086, 0xE087, 0xE088, 0xE089, 0xE08A, 0xE08B, 0xE08C, 0xE08D, 0xE08E, 0xE08F, // specials
	0xE090, 0xE091, 0xE092, 0xE093, 0xE094, 0xE095, 0xE096, 0xE097, 0xE098, 0xE099, 0xE09A, 0xE09B, 0xE09C, 0xE09D, 0xE09E, 0xE09F, // faces
	0xE0A0, 0xE0A1, 0xE0A2, 0xE0A3, 0xE0A4, 0xE0A5, 0xE0A6, 0xE0A7, 0xE0A8, 0xE0A9, 0xE0AA, 0xE0AB, 0xE0AC, 0xE0AD, 0xE0AE, 0xE0AF,
	0xE0B0, 0xE0B1, 0xE0B2, 0xE0B3, 0xE0B4, 0xE0B5, 0xE0B6, 0xE0B7, 0xE0B8, 0xE0B9, 0xE0BA, 0xE0BB, 0xE0BC, 0xE0BD, 0xE0BE, 0xE0BF,
	0xE0C0, 0xE0C1, 0xE0C2, 0xE0C3, 0xE0C4, 0xE0C5, 0xE0C6, 0xE0C7, 0xE0C8, 0xE0C9, 0xE0CA, 0xE0CB, 0xE0CC, 0xE0CD, 0xE0CE, 0xE0CF,
	0xE0D0, 0xE0D1, 0xE0D2, 0xE0D3, 0xE0D4, 0xE0D5, 0xE0D6, 0xE0D7, 0xE0D8, 0xE0D9, 0xE0DA, 0xE0DB, 0xE0DC, 0xE0DD, 0xE0DE, 0xE0DF,
	0xE0E0, 0xE0E1, 0xE0E2, 0xE0E3, 0xE0E4, 0xE0E5, 0xE0E6, 0xE0E7, 0xE0E8, 0xE0E9, 0xE0EA, 0xE0EB, 0xE0EC, 0xE0ED, 0xE0EE, 0xE0EF,
	0xE0F0, 0xE0F1, 0xE0F2, 0xE0F3, 0xE0F4, 0xE0F5, 0xE0F6, 0xE0F7, 0xE0F8, 0xE0F9, 0xE0FA, 0xE0FB, 0xE0FC, 0xE0FD, 0xE0FE, 0xE0FF,
};
 429
 430 /** Fetch a character from an utf-8 encoded string.
 431  * @param _s      The start of an utf-8 encoded multi-byte character.
 432  * @param _end    Will point to after the first multi-byte character.
 433  * @return        The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
 434  */
 435 Uchar u8_getchar_utf8_enabled(const char *_s, const char **_end)
 436 {
 437         size_t st, ln;
 438         Uchar ch;
 439
 440         if (!u8_analyze(_s, &st, &ln, &ch, U8_ANALYZE_INFINITY))
 441                 ch = 0;
 442         if (_end)
 443                 *_end = _s + st + ln;
 444         return ch;
 445 }
 446
 447 /** Fetch a character from an utf-8 encoded string.
 448  * @param _s      The start of an utf-8 encoded multi-byte character.
 449  * @param _end    Will point to after the first multi-byte character.
 450  * @return        The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
 451  */
 452 Uchar u8_getnchar_utf8_enabled(const char *_s, const char **_end, size_t _maxlen)
 453 {
 454         size_t st, ln;
 455         Uchar ch;
 456
 457         if (!u8_analyze(_s, &st, &ln, &ch, _maxlen))
 458                 ch = 0;
 459         if (_end)
 460                 *_end = _s + st + ln;
 461         return ch;
 462 }
 463
 464 /** Encode a wide-character into utf-8.
 465  * @param w        The wide character to encode.
 466  * @param to       The target buffer the utf-8 encoded string is stored to.
 467  * @param maxlen   The maximum number of bytes that fit into the target buffer.
 468  * @return         Number of bytes written to the buffer not including the terminating null.
 469  *                 Less or equal to 0 if the buffer is too small.
 470  */
 471 int u8_fromchar(Uchar w, char *to, size_t maxlen)
 472 {
 473         if (maxlen < 1)
 474                 return 0;
 475
 476         if (!w)
 477                 return 0;
 478
 479         if (w >= 0xE000 && !utf8_enable.integer)
 480                 w -= 0xE000;
 481
 482         if (w < 0x80 || !utf8_enable.integer)
 483         {
 484                 to[0] = (char)w;
 485                 if (maxlen < 2)
 486                         return -1;
 487                 to[1] = 0;
 488                 return 1;
 489         }
 490         // for a little speedup
 491         if (w < 0x800)
 492         {
 493                 if (maxlen < 3)
 494                 {
 495                         to[0] = 0;
 496                         return -1;
 497                 }
 498                 to[2] = 0;
 499                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 500                 to[0] = 0xC0 | w;
 501                 return 2;
 502         }
 503         if (w < 0x10000)
 504         {
 505                 if (maxlen < 4)
 506                 {
 507                         to[0] = 0;
 508                         return -1;
 509                 }
 510                 to[3] = 0;
 511                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 512                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 513                 to[0] = 0xE0 | w;
 514                 return 3;
 515         }
 516
 517         // RFC 3629
 518         if (w <= 0x10FFFF)
 519         {
 520                 if (maxlen < 5)
 521                 {
 522                         to[0] = 0;
 523                         return -1;
 524                 }
 525                 to[4] = 0;
 526                 to[3] = 0x80 | (w & 0x3F); w >>= 6;
 527                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 528                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 529                 to[0] = 0xE0 | w;
 530                 return 4;
 531         }
 532         return 0;
 533 }
 534
 535 /** uses u8_fromchar on a static buffer
 536  * @param ch        The unicode character to convert to encode
 537  * @param l         The number of bytes without the terminating null.
 538  * @return          A statically allocated buffer containing the character's utf8 representation, or NULL if it fails.
 539  */
 540 char *u8_encodech(Uchar ch, size_t *l)
 541 {
 542         static char buf[16];
 543         size_t len;
 544         len = u8_fromchar(ch, buf, sizeof(buf));
 545         if (len > 0)
 546         {
 547                 if (l) *l = len;
 548                 return buf;
 549         }
 550         return NULL;
 551 }
 552
 553 /** Convert a utf-8 multibyte string to a wide character string.
 554  * @param wcs       The target wide-character buffer.
 555  * @param mb        The utf-8 encoded multibyte string to convert.
 556  * @param maxlen    The maximum number of wide-characters that fit into the target buffer.
 557  * @return          The number of characters written to the target buffer.
 558  */
 559 size_t u8_mbstowcs(Uchar *wcs, const char *mb, size_t maxlen)
 560 {
 561         size_t i;
 562         Uchar ch;
 563         if (maxlen < 1)
 564                 return 0;
 565         for (i = 0; *mb && i < maxlen-1; ++i)
 566         {
 567                 ch = u8_getchar(mb, &mb);
 568                 if (!ch)
 569                         break;
 570                 wcs[i] = ch;
 571         }
 572         wcs[i] = 0;
 573         return i;
 574 }
 575
 576 /** Convert a wide-character string to a utf-8 multibyte string.
 577  * @param mb      The target buffer the utf-8 string is written to.
 578  * @param wcs     The wide-character string to convert.
 579  * @param maxlen  The number bytes that fit into the multibyte target buffer.
 580  * @return        The number of bytes written, not including the terminating \0
 581  */
 582 size_t u8_wcstombs(char *mb, const Uchar *wcs, size_t maxlen)
 583 {
 584         size_t i;
 585         const char *start = mb;
 586         if (maxlen < 2)
 587                 return 0;
 588         for (i = 0; wcs[i] && i < maxlen-1; ++i)
 589         {
 590                 /*
 591                 int len;
 592                 if ( (len = u8_fromchar(wcs[i], mb, maxlen - i)) < 0)
 593                         return (mb - start);
 594                 mb += len;
 595                 */
 596                 mb += u8_fromchar(wcs[i], mb, maxlen - i);
 597         }
 598         *mb = 0;
 599         return (mb - start);
 600 }
 601
 602 /*
 603 ============
 604 UTF-8 aware COM_StringLengthNoColors
 605
 606 calculates the visible width of a color coded string.
 607
 608 *valid is filled with TRUE if the string is a valid colored string (that is, if
 609 it does not end with an unfinished color code). If it gets filled with FALSE, a
 610 fix would be adding a STRING_COLOR_TAG at the end of the string.
 611
 612 valid can be set to NULL if the caller doesn't care.
 613
 614 For size_s, specify the maximum number of characters from s to use, or 0 to use
 615 all characters until the zero terminator.
 616 ============
 617 */
 618 size_t
 619 COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid);
 620 size_t
 621 u8_COM_StringLengthNoColors(const char *_s, size_t size_s, qboolean *valid)
 622 {
 623         const unsigned char *s = (const unsigned char*)_s;
 624         const unsigned char *end;
 625         size_t len = 0;
 626
 627         if (!utf8_enable.integer)
 628                 return COM_StringLengthNoColors(_s, size_s, valid);
 629
 630         end = size_s ? (s + size_s) : NULL;
 631
 632         for(;;)
 633         {
 634                 switch((s == end) ? 0 : *s)
 635                 {
 636                         case 0:
 637                                 if(valid)
 638                                         *valid = TRUE;
 639                                 return len;
 640                         case STRING_COLOR_TAG:
 641                                 ++s;
 642                                 switch((s == end) ? 0 : *s)
 643                                 {
 644                                         case STRING_COLOR_RGB_TAG_CHAR:
 645                                                 if (s+1 != end && isxdigit(s[1]) &&
 646                                                         s+2 != end && isxdigit(s[2]) &&
 647                                                         s+3 != end && isxdigit(s[3]) )
 648                                                 {
 649                                                         s+=3;
 650                                                         break;
 651                                                 }
 652                                                 ++len; // STRING_COLOR_TAG
 653                                                 ++len; // STRING_COLOR_RGB_TAG_CHAR
 654                                                 break;
 655                                         case 0: // ends with unfinished color code!
 656                                                 ++len;
 657                                                 if(valid)
 658                                                         *valid = FALSE;
 659                                                 return len;
 660                                         case STRING_COLOR_TAG: // escaped ^
 661                                                 ++len;
 662                                                 break;
 663                                         case '0': case '1': case '2': case '3': case '4':
 664                                         case '5': case '6': case '7': case '8': case '9': // color code
 665                                                 break;
 666                                         default: // not a color code
 667                                                 ++len; // STRING_COLOR_TAG
 668                                                 ++len; // the character
 669                                                 break;
 670                                 }
 671                                 break;
 672                         default:
 673                                 ++len;
 674                                 break;
 675                 }
 676
 677                 // start of a wide character
 678                 if (*s & 0xC0)
 679                 {
 680                         for (++s; *s >= 0x80 && *s <= 0xC0; ++s);
 681                         continue;
 682                 }
 683                 // part of a wide character, we ignore that one
 684                 if (*s <= 0xBF)
 685                         --len;
 686                 ++s;
 687         }
 688         // never get here
 689 }
 690
 691 /** Pads a utf-8 string
 692  * @param out     The target buffer the utf-8 string is written to.
 693  * @param outsize The size of the target buffer, including the final NUL
 694  * @param in      The input utf-8 buffer
 695  * @param leftalign Left align the output string (by default right alignment is done)
 696  * @param minwidth The minimum output width
 697  * @param maxwidth The maximum output width
 698  * @return        The number of bytes written, not including the terminating \0
 699  */
 700 size_t u8_strpad(char *out, size_t outsize, const char *in, qboolean leftalign, size_t minwidth, size_t maxwidth)
 701 {
 702         if(!utf8_enable.integer)
 703         {
 704                 return dpsnprintf(out, outsize, "%*.*s", leftalign ? -(int) minwidth : (int) minwidth, (int) maxwidth, in);
 705         }
 706         else
 707         {
 708                 size_t l = u8_bytelen(in, maxwidth);
 709                 size_t actual_width = u8_strnlen(in, l);
 710                 int pad = (actual_width >= minwidth) ? 0 : (minwidth - actual_width);
 711                 int prec = l;
 712                 int lpad = leftalign ? 0 : pad;
 713                 int rpad = leftalign ? pad : 0;
 714                 return dpsnprintf(out, outsize, "%*s%.*s%*s", lpad, "", prec, in, rpad, "");
 715         }
 716 }