5 ================================================================================
6 Initialization of UTF-8 support and new cvars.
7 ================================================================================
9 // for compatibility this defaults to 0
10 cvar_t utf8_enable = {CVAR_SAVE, "utf8_enable", "0", "Enable UTF-8 support. For compatibility, this is disabled by default in most games."};
14 Cvar_RegisterVariable(&utf8_enable);
18 ================================================================================
19 UTF-8 encoding and decoding functions follow.
20 ================================================================================
23 /** Analyze the next character and return various information if requested.
24 * @param _s An utf-8 string.
25 * @param _start Filled with the start byte-offset of the next valid character
26 * @param _len Filled with the length of the next valid character
27 * @param _ch Filled with the unicode value of the next character
28 * @param _maxlen Maximum number of bytes to read from _s
29 * @return Whether or not another valid character is in the string
31 #define U8_ANALYZE_INFINITY 7
32 static qboolean u8_analyze(const char *_s, size_t *_start, size_t *_len, Uchar *_ch, size_t _maxlen)
34 const unsigned char *s = (const unsigned char*)_s;
43 // bytes 0x80..0xC1 cannot start a valid character (stray continuation bytes, or 0xC0/0xC1 which only begin overlong encodings), so they are skipped
44 while (i < _maxlen && s[i] && s[i] >= 0x80 && s[i] < 0xC2) {
45 //fprintf(stderr, "skipping\n");
49 //fprintf(stderr, "checking\n");
50 // If we hit the end, well, we're out and invalid
51 if(i >= _maxlen || !s[i]) {
52 if (_start) *_start = i;
57 //fprintf(stderr, "checking ascii\n");
61 if (_start) *_start = i;
63 if (_ch) *_ch = (Uchar)s[i];
64 //fprintf(stderr, "valid ascii\n");
67 //fprintf(stderr, "checking length\n");
69 // Figure out the next char's length
72 // count the 1 bits, they're the # of bytes
73 for (bt = 0x40; bt && (bc & bt); bt >>= 1, ++bits);
76 //fprintf(stderr, "superlong\n");
80 if(i + bits > _maxlen) {
82 if (_start) *_start = i;
89 // turn bt into a mask and give ch a starting value
92 // check the byte sequence for invalid bytes
93 for (j = 1; j < bits; ++j)
95 // valid bit value: 10xx xxxx
96 //if (s[i+j] < 0x80 || s[i+j] >= 0xC0)
97 if ( (s[i+j] & 0xC0) != 0x80 )
99 //fprintf(stderr, "sequence of %i f'd at %i by %x\n", bits, j, (unsigned int)s[i+j]);
100 // this byte sequence is invalid, skip it
102 // find a character after it
105 // at the same time, decode the character
106 ch = (ch << 6) | (s[i+j] & 0x3F);
109 // Now check the decoded character for an overlong encoding or an out-of-range value
110 if ( (bits >= 2 && ch < 0x80) ||
111 (bits >= 3 && ch < 0x800) ||
112 (bits >= 4 && ch < 0x10000) ||
113 ch >= 0x10FFFF // RFC 3629
117 //fprintf(stderr, "overlong: %i bytes for %x\n", bits, ch);
127 //fprintf(stderr, "valid utf8\n");
131 /** Get the number of characters in an UTF-8 string.
132 * @param _s An utf-8 encoded null-terminated string.
133 * @return The number of unicode characters in the string.
135 size_t u8_strlen(const char *_s)
139 const unsigned char *s = (const unsigned char*)_s;
141 if (!utf8_enable.integer)
146 // ascii char, skip u8_analyze
154 // invalid, skip u8_analyze
161 if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY))
163 // valid character, skip after it
170 /** Get the number of characters in a part of an UTF-8 string.
171 * @param _s An utf-8 encoded null-terminated string.
172 * @param n The maximum number of bytes.
173 * @return The number of unicode characters in the string.
175 size_t u8_strnlen(const char *_s, size_t n)
179 const unsigned char *s = (const unsigned char*)_s;
181 if (!utf8_enable.integer)
184 return (len < n) ? len : n;
189 // ascii char, skip u8_analyze
198 // invalid, skip u8_analyze
206 if (!u8_analyze((const char*)s, &st, &ln, NULL, n))
208 // valid character, see if it's still inside the range specified by n:
218 /** Get the number of bytes used in a string to represent an amount of characters.
219 * @param _s An utf-8 encoded null-terminated string.
220 * @param n The number of characters we want to know the byte-size for.
221 * @return The number of bytes used to represent n characters.
223 size_t u8_bytelen(const char *_s, size_t n)
227 const unsigned char *s = (const unsigned char*)_s;
229 if (!utf8_enable.integer) {
231 return (len < n) ? len : n;
236 // ascii char, skip u8_analyze
245 // invalid, skip u8_analyze
253 if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY))
262 /** Get the byte-index for a character-index.
263 * @param _s An utf-8 encoded string.
264 * @param i The character-index for which you want the byte offset.
265 * @param len If not null, character's length will be stored in there.
266 * @return The byte-index at which the character begins, or -1 if the string is too short.
268 int u8_byteofs(const char *_s, size_t i, size_t *len)
272 const unsigned char *s = (const unsigned char*)_s;
274 if (!utf8_enable.integer)
290 if (!u8_analyze((const char*)s + ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
299 /** Get the char-index for a byte-index.
300 * @param _s An utf-8 encoded string.
301 * @param i The byte offset for which you want the character index.
302 * @param len If not null, the offset within the character is stored here.
303 * @return The character-index, or -1 if the string is too short.
305 int u8_charidx(const char *_s, size_t i, size_t *len)
311 const unsigned char *s = (const unsigned char*)_s;
313 if (!utf8_enable.integer)
319 while (ofs < i && s[ofs])
321 // ascii character, skip u8_analyze
330 // invalid, skip u8_analyze
337 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
339 // see if next char is after the bytemark
349 // see if bytemark is within the char
361 /** Get the byte offset of the previous character.
363 * prevchar_pos = u8_byteofs(text, u8_charidx(text, thischar_pos, NULL) - 1, NULL)
364 * @param _s An utf-8 encoded string.
365 * @param i The current byte offset.
366 * @return The byte offset of the previous character
368 size_t u8_prevbyte(const char *_s, size_t i)
371 const unsigned char *s = (const unsigned char*)_s;
375 if (!utf8_enable.integer)
382 while (ofs < i && s[ofs])
384 // ascii character, skip u8_analyze
391 // invalid, skip u8_analyze
398 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY))
402 if (ofs + st + ln >= i)
411 Uchar u8_quake2utf8map[256] = {
412 0xE000, 0xE001, 0xE002, 0xE003, 0xE004, 0xE005, 0xE006, 0xE007, 0xE008, 0xE009, 0xE00A, 0xE00B, 0xE00C, 0xE00D, 0xE00E, 0xE00F, // specials
413 0xE010, 0xE011, 0xE012, 0xE013, 0xE014, 0xE015, 0xE016, 0xE017, 0xE018, 0xE019, 0xE01A, 0xE01B, 0xE01C, 0xE01D, 0xE01E, 0xE01F, // specials
414 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, // shift+digit line
415 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // digits
416 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // caps
417 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, // caps
418 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, // small
419 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, // small
420 0xE080, 0xE081, 0xE082, 0xE083, 0xE084, 0xE085, 0xE086, 0xE087, 0xE088, 0xE089, 0xE08A, 0xE08B, 0xE08C, 0xE08D, 0xE08E, 0xE08F, // specials
421 0xE090, 0xE091, 0xE092, 0xE093, 0xE094, 0xE095, 0xE096, 0xE097, 0xE098, 0xE099, 0xE09A, 0xE09B, 0xE09C, 0xE09D, 0xE09E, 0xE09F, // faces
422 0xE0A0, 0xE0A1, 0xE0A2, 0xE0A3, 0xE0A4, 0xE0A5, 0xE0A6, 0xE0A7, 0xE0A8, 0xE0A9, 0xE0AA, 0xE0AB, 0xE0AC, 0xE0AD, 0xE0AE, 0xE0AF,
423 0xE0B0, 0xE0B1, 0xE0B2, 0xE0B3, 0xE0B4, 0xE0B5, 0xE0B6, 0xE0B7, 0xE0B8, 0xE0B9, 0xE0BA, 0xE0BB, 0xE0BC, 0xE0BD, 0xE0BE, 0xE0BF,
424 0xE0C0, 0xE0C1, 0xE0C2, 0xE0C3, 0xE0C4, 0xE0C5, 0xE0C6, 0xE0C7, 0xE0C8, 0xE0C9, 0xE0CA, 0xE0CB, 0xE0CC, 0xE0CD, 0xE0CE, 0xE0CF,
425 0xE0D0, 0xE0D1, 0xE0D2, 0xE0D3, 0xE0D4, 0xE0D5, 0xE0D6, 0xE0D7, 0xE0D8, 0xE0D9, 0xE0DA, 0xE0DB, 0xE0DC, 0xE0DD, 0xE0DE, 0xE0DF,
426 0xE0E0, 0xE0E1, 0xE0E2, 0xE0E3, 0xE0E4, 0xE0E5, 0xE0E6, 0xE0E7, 0xE0E8, 0xE0E9, 0xE0EA, 0xE0EB, 0xE0EC, 0xE0ED, 0xE0EE, 0xE0EF,
427 0xE0F0, 0xE0F1, 0xE0F2, 0xE0F3, 0xE0F4, 0xE0F5, 0xE0F6, 0xE0F7, 0xE0F8, 0xE0F9, 0xE0FA, 0xE0FB, 0xE0FC, 0xE0FD, 0xE0FE, 0xE0FF,
430 /** Fetch a character from an utf-8 encoded string.
431 * @param _s The start of an utf-8 encoded multi-byte character.
432 * @param _end Will point to after the first multi-byte character.
433 * @return The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
435 Uchar u8_getchar_utf8_enabled(const char *_s, const char **_end)
440 if (!u8_analyze(_s, &st, &ln, &ch, U8_ANALYZE_INFINITY))
443 *_end = _s + st + ln;
447 /** Fetch a character from an utf-8 encoded string, reading at most _maxlen bytes.
448 * @param _s The start of an utf-8 encoded multi-byte character.
449 * @param _end Will point to after the first multi-byte character.
450 * @return The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
452 Uchar u8_getnchar_utf8_enabled(const char *_s, const char **_end, size_t _maxlen)
457 if (!u8_analyze(_s, &st, &ln, &ch, _maxlen))
460 *_end = _s + st + ln;
464 /** Encode a wide-character into utf-8.
465 * @param w The wide character to encode.
466 * @param to The target buffer the utf-8 encoded string is stored to.
467 * @param maxlen The maximum number of bytes that fit into the target buffer.
468 * @return Number of bytes written to the buffer not including the terminating null.
469 * Less than or equal to 0 if the buffer is too small.
471 int u8_fromchar(Uchar w, char *to, size_t maxlen)
479 if (w >= 0xE000 && !utf8_enable.integer)
482 if (w < 0x80 || !utf8_enable.integer)
490 // for a little speedup
499 to[1] = 0x80 | (w & 0x3F); w >>= 6;
511 to[2] = 0x80 | (w & 0x3F); w >>= 6;
512 to[1] = 0x80 | (w & 0x3F); w >>= 6;
526 to[3] = 0x80 | (w & 0x3F); w >>= 6;
527 to[2] = 0x80 | (w & 0x3F); w >>= 6;
528 to[1] = 0x80 | (w & 0x3F); w >>= 6;
535 /** uses u8_fromchar on a static buffer
536 * @param ch The unicode character to encode
537 * @param l Filled with the number of bytes without the terminating null.
538 * @return A statically allocated buffer containing the character's utf8 representation, or NULL if it fails.
540 char *u8_encodech(Uchar ch, size_t *l)
544 len = u8_fromchar(ch, buf, sizeof(buf));
553 /** Convert a utf-8 multibyte string to a wide character string.
554 * @param wcs The target wide-character buffer.
555 * @param mb The utf-8 encoded multibyte string to convert.
556 * @param maxlen The maximum number of wide-characters that fit into the target buffer.
557 * @return The number of characters written to the target buffer.
559 size_t u8_mbstowcs(Uchar *wcs, const char *mb, size_t maxlen)
565 for (i = 0; *mb && i < maxlen-1; ++i)
567 ch = u8_getchar(mb, &mb);
576 /** Convert a wide-character string to a utf-8 multibyte string.
577 * @param mb The target buffer the utf-8 string is written to.
578 * @param wcs The wide-character string to convert.
579 * @param maxlen The number of bytes that fit into the multibyte target buffer.
580 * @return The number of bytes written, not including the terminating \0
582 size_t u8_wcstombs(char *mb, const Uchar *wcs, size_t maxlen)
585 const char *start = mb;
588 for (i = 0; wcs[i] && i < maxlen-1; ++i)
592 if ( (len = u8_fromchar(wcs[i], mb, maxlen - i)) < 0)
596 mb += u8_fromchar(wcs[i], mb, maxlen - i);
604 UTF-8 aware COM_StringLengthNoColors
606 calculates the visible width of a color coded string.
608 *valid is filled with TRUE if the string is a valid colored string (that is, if
609 it does not end with an unfinished color code). If it gets filled with FALSE, a
610 fix would be adding a STRING_COLOR_TAG at the end of the string.
612 valid can be set to NULL if the caller doesn't care.
614 For size_s, specify the maximum number of bytes from s to use, or 0 to use
615 the whole string up to the zero terminator.
619 COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid);
621 u8_COM_StringLengthNoColors(const char *_s, size_t size_s, qboolean *valid)
623 const unsigned char *s = (const unsigned char*)_s;
624 const unsigned char *end;
627 if (!utf8_enable.integer)
628 return COM_StringLengthNoColors(_s, size_s, valid);
630 end = size_s ? (s + size_s) : NULL;
634 switch((s == end) ? 0 : *s)
640 case STRING_COLOR_TAG:
642 switch((s == end) ? 0 : *s)
644 case STRING_COLOR_RGB_TAG_CHAR:
645 if (s+1 != end && isxdigit(s[1]) &&
646 s+2 != end && isxdigit(s[2]) &&
647 s+3 != end && isxdigit(s[3]) )
652 ++len; // STRING_COLOR_TAG
653 ++len; // STRING_COLOR_RGB_TAG_CHAR
655 case 0: // ends with unfinished color code!
660 case STRING_COLOR_TAG: // escaped ^
663 case '0': case '1': case '2': case '3': case '4':
664 case '5': case '6': case '7': case '8': case '9': // color code
666 default: // not a color code
667 ++len; // STRING_COLOR_TAG
668 ++len; // the character
677 // start of a wide character
680 for (++s; *s >= 0x80 && *s <= 0xC0; ++s);
683 // part of a wide character, we ignore that one
691 /** Pads a utf-8 string
692 * @param out The target buffer the utf-8 string is written to.
693 * @param outsize The size of the target buffer, including the final NUL
694 * @param in The input utf-8 buffer
695 * @param leftalign Left align the output string (by default right alignment is done)
696 * @param minwidth The minimum output width
697 * @param maxwidth The maximum output width
698 * @return The number of bytes written, not including the terminating \0
700 size_t u8_strpad(char *out, size_t outsize, const char *in, qboolean leftalign, size_t minwidth, size_t maxwidth)
702 if(!utf8_enable.integer)
704 return dpsnprintf(out, outsize, "%*.*s", leftalign ? -(int) minwidth : (int) minwidth, (int) maxwidth, in);
708 size_t l = u8_bytelen(in, maxwidth);
709 size_t actual_width = u8_strnlen(in, l);
710 int pad = (actual_width >= minwidth) ? 0 : (minwidth - actual_width);
712 int lpad = leftalign ? 0 : pad;
713 int rpad = leftalign ? pad : 0;
714 return dpsnprintf(out, outsize, "%*s%.*s%*s", lpad, "", prec, in, rpad, "");