5 ================================================================================
6 Initialization of UTF-8 support and new cvars.
7 ================================================================================
9 // for compatibility this defaults to 0
10 cvar_t utf8_enable = {CVAR_SAVE, "utf8_enable", "0", "Enable UTF-8 support. For compatibility, this is disabled by default in most games."};
14 Cvar_RegisterVariable(&utf8_enable);
18 ================================================================================
19 UTF-8 encoding and decoding functions follow.
20 ================================================================================
23 /** Analyze the next character and return various information if requested.
24 * @param _s An utf-8 string.
25 * @param _start Filled with the start byte-offset of the next valid character
26 * @param _len Filled with the length of the next valid character
27 * @param _ch Filled with the unicode value of the next character
28 * @return Whether or not another valid character is in the string
30 static qboolean u8_analyze(const char *_s, size_t *_start, size_t *_len, Uchar *_ch)
32 const unsigned char *s = (const unsigned char*)_s;
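41 // Lead bytes below 0xC2 are either stray continuation bytes or always-overlong encodings; they're invalid, thus skipped. NOTE(review): the loop below tests `<= 0xC2`, which also skips the valid lead byte 0xC2 (U+0080..U+00BF) -- confirm whether `< 0xC2` was intended.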
42 while (s[i] && s[i] >= 0x80 && s[i] <= 0xC2) {
43 //fprintf(stderr, "skipping\n");
46 //fprintf(stderr, "checking\n");
48 // If we hit the end, well, we're out and invalid
51 //fprintf(stderr, "checking ascii\n");
56 if (_start) *_start = i;
58 if (_ch) *_ch = (Uchar)s[i];
59 //fprintf(stderr, "valid ascii\n");
62 //fprintf(stderr, "checking length\n");
64 // Figure out the next char's length
67 // count the 1 bits, they're the # of bytes
68 for (bt = 0x40; bt && (bc & bt); bt >>= 1, ++bits);
71 //fprintf(stderr, "superlong\n");
75 // turn bt into a mask and give ch a starting value
78 // check the byte sequence for invalid bytes
79 for (j = 1; j < bits; ++j)
81 // valid bit value: 10xx xxxx
82 //if (s[i+j] < 0x80 || s[i+j] >= 0xC0)
83 if ( (s[i+j] & 0xC0) != 0x80 )
85 //fprintf(stderr, "sequence of %i f'd at %i by %x\n", bits, j, (unsigned int)s[i+j]);
86 // this byte sequence is invalid, skip it
88 // find a character after it
91 // at the same time, decode the character
92 ch = (ch << 6) | (s[i+j] & 0x3F);
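95 // Now check the decoded character for an overlong encoding or a value outside the Unicode range. NOTE(review): the `>= 0x10FFFF` test below also rejects U+10FFFF, which RFC 3629 allows -- confirm whether `>` was intended.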
96 if ( (bits >= 2 && ch < 0x80) ||
97 (bits >= 3 && ch < 0x800) ||
98 (bits >= 4 && ch < 0x10000) ||
99 ch >= 0x10FFFF // RFC 3629
103 //fprintf(stderr, "overlong: %i bytes for %x\n", bits, ch);
113 //fprintf(stderr, "valid utf8\n");
117 /** Get the number of characters in an UTF-8 string.
118 * @param _s An utf-8 encoded null-terminated string.
119 * @return The number of unicode characters in the string.
121 size_t u8_strlen(const char *_s)
125 const unsigned char *s = (const unsigned char*)_s;
127 if (!utf8_enable.integer)
132 // ascii char, skip u8_analyze
140 // invalid, skip u8_analyze
147 if (!u8_analyze((const char*)s, &st, &ln, NULL))
149 // valid character, skip after it
156 /** Get the number of characters in a part of an UTF-8 string.
157 * @param _s An utf-8 encoded null-terminated string.
158 * @param n The maximum number of bytes.
159 * @return The number of unicode characters in the string.
161 size_t u8_strnlen(const char *_s, size_t n)
165 const unsigned char *s = (const unsigned char*)_s;
167 if (!utf8_enable.integer)
170 return (len < n) ? len : n;
175 // ascii char, skip u8_analyze
184 // invalid, skip u8_analyze
192 if (!u8_analyze((const char*)s, &st, &ln, NULL))
194 // valid character, see if it's still inside the range specified by n:
204 /** Get the number of bytes used in a string to represent an amount of characters.
205 * @param _s An utf-8 encoded null-terminated string.
206 * @param n The number of characters we want to know the byte-size for.
207 * @return The number of bytes used to represent n characters.
209 size_t u8_bytelen(const char *_s, size_t n)
213 const unsigned char *s = (const unsigned char*)_s;
215 if (!utf8_enable.integer)
220 // ascii char, skip u8_analyze
229 // invalid, skip u8_analyze
237 if (!u8_analyze((const char*)s, &st, &ln, NULL))
246 /** Get the byte-index for a character-index.
247 * @param _s An utf-8 encoded string.
248 * @param i The character-index for which you want the byte offset.
249 * @param len If not null, character's length will be stored in there.
250 * @return The byte-index at which the character begins, or -1 if the string is too short.
252 int u8_byteofs(const char *_s, size_t i, size_t *len)
256 const unsigned char *s = (const unsigned char*)_s;
258 if (!utf8_enable.integer)
268 if (!u8_analyze((const char*)s + ofs, &st, &ln, NULL))
277 /** Get the char-index for a byte-index.
278 * @param _s An utf-8 encoded string.
279 * @param i The byte offset for which you want the character index.
280 * @param len If not null, the offset within the character is stored here.
281 * @return The character-index, or -1 if the string is too short.
283 int u8_charidx(const char *_s, size_t i, size_t *len)
289 const unsigned char *s = (const unsigned char*)_s;
291 if (!utf8_enable.integer)
297 while (ofs < i && s[ofs])
299 // ascii character, skip u8_analyze
308 // invalid, skip u8_analyze
315 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL))
317 // see if next char is after the bytemark
327 // see if bytemark is within the char
339 /** Get the byte offset of the previous character.
341 * prevchar_pos = u8_byteofs(text, u8_charidx(text, thischar_pos, NULL) - 1, NULL)
342 * @param _s An utf-8 encoded string.
343 * @param i The current byte offset.
344 * @return The byte offset of the previous character
346 size_t u8_prevbyte(const char *_s, size_t i)
349 const unsigned char *s = (const unsigned char*)_s;
353 if (!utf8_enable.integer)
360 while (ofs < i && s[ofs])
362 // ascii character, skip u8_analyze
369 // invalid, skip u8_analyze
376 if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL))
380 if (ofs + st + ln >= i)
389 static int char_usefont[256] = {
390 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // specials
391 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // specials
392 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // shift+digit line
393 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // digits
394 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // caps
395 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // caps
396 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // small
397 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // small
398 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // specials
399 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // faces
400 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
401 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
402 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
403 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
404 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
405 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
409 /** Fetch a character from an utf-8 encoded string.
410 * @param _s The start of an utf-8 encoded multi-byte character.
411 * @param _end Will point to after the first multi-byte character.
412 * @return The 32-bit integer representation of the first multi-byte character or 0 for invalid characters.
414 Uchar u8_getchar(const char *_s, const char **_end)
419 if (!utf8_enable.integer)
423 /* Careful: if we disable utf8 but not freetype, we wish to see freetype chars
424 * for normal letters. So use E000+x for special chars, but leave the freetype stuff for the
427 if (!char_usefont[(unsigned int)*(const unsigned char*)_s])
428 return 0xE000 + (Uchar)*(const unsigned char*)_s;
429 return (Uchar)*(const unsigned char*)_s;
432 if (!u8_analyze(_s, &st, &ln, &ch))
435 *_end = _s + st + ln;
439 /** Encode a wide-character into utf-8.
440 * @param w The wide character to encode.
441 * @param to The target buffer the utf-8 encoded string is stored to.
442 * @param maxlen The maximum number of bytes that fit into the target buffer.
443 * @return Number of bytes written to the buffer not including the terminating null.
444 * Less than or equal to 0 if the buffer is too small.
446 int u8_fromchar(Uchar w, char *to, size_t maxlen)
454 if (w >= 0xE000 && !utf8_enable.integer)
457 if (w < 0x80 || !utf8_enable.integer)
465 // for a little speedup
474 to[1] = 0x80 | (w & 0x3F); w >>= 6;
486 to[2] = 0x80 | (w & 0x3F); w >>= 6;
487 to[1] = 0x80 | (w & 0x3F); w >>= 6;
501 to[3] = 0x80 | (w & 0x3F); w >>= 6;
502 to[2] = 0x80 | (w & 0x3F); w >>= 6;
503 to[1] = 0x80 | (w & 0x3F); w >>= 6;
510 /** uses u8_fromchar on a static buffer
511 * @param ch The unicode character to encode
512 * @param l The number of bytes without the terminating null.
513 * @return A statically allocated buffer containing the character's utf8 representation, or NULL if it fails.
515 char *u8_encodech(Uchar ch, size_t *l)
519 len = u8_fromchar(ch, buf, sizeof(buf));
528 /** Convert a utf-8 multibyte string to a wide character string.
529 * @param wcs The target wide-character buffer.
530 * @param mb The utf-8 encoded multibyte string to convert.
531 * @param maxlen The maximum number of wide-characters that fit into the target buffer.
532 * @return The number of characters written to the target buffer.
534 size_t u8_mbstowcs(Uchar *wcs, const char *mb, size_t maxlen)
540 for (i = 0; *mb && i < maxlen-1; ++i)
542 ch = u8_getchar(mb, &mb);
551 /** Convert a wide-character string to a utf-8 multibyte string.
552 * @param mb The target buffer the utf-8 string is written to.
553 * @param wcs The wide-character string to convert.
554 * @param maxlen The number of bytes that fit into the multibyte target buffer.
555 * @return The number of bytes written, not including the terminating \0
557 size_t u8_wcstombs(char *mb, const Uchar *wcs, size_t maxlen)
560 const char *start = mb;
563 for (i = 0; wcs[i] && i < maxlen-1; ++i)
566 if ( (len = u8_fromchar(wcs[i], mb, maxlen - i)) < 0)
576 UTF-8 aware COM_StringLengthNoColors
578 calculates the visible width of a color coded string.
580 *valid is filled with TRUE if the string is a valid colored string (that is, if
581 it does not end with an unfinished color code). If it gets filled with FALSE, a
582 fix would be adding a STRING_COLOR_TAG at the end of the string.
584 valid can be set to NULL if the caller doesn't care.
586 For size_s, specify the maximum number of characters from s to use, or 0 to use
587 all characters until the zero terminator.
591 COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid);
593 u8_COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid)
598 if (!utf8_enable.integer)
599 return COM_StringLengthNoColors(s, size_s, valid);
601 end = size_s ? (s + size_s) : NULL;
605 switch((s == end) ? 0 : *s)
611 case STRING_COLOR_TAG:
613 switch((s == end) ? 0 : *s)
615 case STRING_COLOR_RGB_TAG_CHAR:
616 if (s+1 != end && isxdigit(s[1]) &&
617 s+2 != end && isxdigit(s[2]) &&
618 s+3 != end && isxdigit(s[3]) )
623 ++len; // STRING_COLOR_TAG
624 ++len; // STRING_COLOR_RGB_TAG_CHAR
626 case 0: // ends with unfinished color code!
631 case STRING_COLOR_TAG: // escaped ^
634 case '0': case '1': case '2': case '3': case '4':
635 case '5': case '6': case '7': case '8': case '9': // color code
637 default: // not a color code
638 ++len; // STRING_COLOR_TAG
639 ++len; // the character
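648 // start of a wide character; the loop below skips its continuation bytes. NOTE(review): s is a `const char *`, so where plain char is signed `*s >= 0x80` is never true, and continuation bytes end at 0xBF rather than 0xC0 -- confirm.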
651 for (++s; *s >= 0x80 && *s <= 0xC0; ++s);
654 // part of a wide character, we ignore that one