X-Git-Url: https://git.xonotic.org/?a=blobdiff_plain;f=utf8lib.c;h=bb775b7fd91b749a4593382f52691135c58f4250;hb=bd172244c077be2f4f13caf8cabd022fcad4bd33;hp=8b50e753d4cf3a57e873416079b63f4866caf0b8;hpb=34ac3e1a25d3b1541a7b32e89c63812b52c3edac;p=xonotic%2Fdarkplaces.git diff --git a/utf8lib.c b/utf8lib.c index 8b50e753..bb775b7f 100644 --- a/utf8lib.c +++ b/utf8lib.c @@ -20,37 +20,101 @@ UTF-8 encoding and decoding functions follow. ================================================================================ */ +unsigned char utf8_lengths[256] = { // 0 = invalid + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // ascii characters + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0xBF are within multibyte sequences + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // they could be interpreted as 2-byte starts but + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // the codepoint would be < 127 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 and C1 would also result in overlong encodings + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + // with F5 the codepoint is above 0x10FFFF, + // F8-FB would start 5-byte sequences + // FC-FD would start 6-byte sequences + // ... +}; +Uchar utf8_range[5] = { + 1, // invalid - let's not allow the creation of 0-bytes :P + 1, // ascii minimum + 0x80, // 2-byte minimum + 0x800, // 3-byte minimum + 0x10000, // 4-byte minimum +}; + /** Analyze the next character and return various information if requested. 
* @param _s An utf-8 string. * @param _start Filled with the start byte-offset of the next valid character * @param _len Fileed with the length of the next valid character * @param _ch Filled with the unicode value of the next character + * @param _maxlen Maximum number of bytes to read from _s * @return Whether or not another valid character is in the string */ -static qboolean u8_analyze(const char *_s, size_t *_start, size_t *_len, Uchar *_ch) +#define U8_ANALYZE_INFINITY 7 +static qboolean u8_analyze(const char *_s, size_t *_start, size_t *_len, Uchar *_ch, size_t _maxlen) { const unsigned char *s = (const unsigned char*)_s; - unsigned char bt, bc; - size_t i; - size_t bits, j; + size_t i, j; + size_t bits = 0; Uchar ch; i = 0; findchar: + while (i < _maxlen && s[i] && (bits = utf8_lengths[s[i]]) == 0) + ++i; + if (i >= _maxlen || !s[i]) { + if (_start) *_start = i; + if (_len) *_len = 0; + return false; + } + + if (bits == 1) { // ascii + if (_start) *_start = i; + if (_len) *_len = 1; + if (_ch) *_ch = (Uchar)s[i]; + return true; + } + + ch = (s[i] & (0xFF >> bits)); + for (j = 1; j < bits; ++j) + { + if ( i + j >= _maxlen || (s[i+j] & 0xC0) != 0x80 ) // sequence cut off by _maxlen, or not a continuation byte + { + i += j; + goto findchar; + } + ch = (ch << 6) | (s[i+j] & 0x3F); + } + if (ch < utf8_range[bits] || ch > 0x10FFFF) // overlong encoding, or beyond the last code point U+10FFFF + { + i += bits; + goto findchar; + } +#if 0 // <0xC2 is always an overlong encoding, they're invalid, thus skipped - while (s[i] && s[i] >= 0x80 && s[i] <= 0xC2) { + while (i < _maxlen && s[i] && s[i] >= 0x80 && s[i] < 0xC2) { //fprintf(stderr, "skipping\n"); ++i; } - //fprintf(stderr, "checking\n"); // If we hit the end, well, we're out and invalid - if (!s[i]) + if(i >= _maxlen || !s[i]) { + if (_start) *_start = i; + if (_len) *_len = 0; return false; - //fprintf(stderr, "checking ascii\n"); + } - // ascii characters + // I'll leave that in - if you remove it, also change the part below + // to support 1-byte chars correctly if (s[i] < 0x80) { if (_start) *_start = i; @@ -59,7 +123,6 @@ findchar: 
//fprintf(stderr, "valid ascii\n"); return true; } - //fprintf(stderr, "checking length\n"); // Figure out the next char's length bc = s[i]; @@ -72,6 +135,15 @@ findchar: ++i; goto findchar; } + if(i + bits > _maxlen) { + /* + if (_start) *_start = i; + if (_len) *_len = 0; + return false; + */ + ++i; + goto findchar; + } // turn bt into a mask and give ch a starting value --bt; ch = (s[i] & bt); @@ -103,6 +175,7 @@ findchar: //fprintf(stderr, "overlong: %i bytes for %x\n", bits, ch); goto findchar; } +#endif if (_start) *_start = i; @@ -138,13 +211,13 @@ size_t u8_strlen(const char *_s) } // invalid, skip u8_analyze - if (*s <= 0xC2) + if (*s < 0xC2) { ++s; continue; } - if (!u8_analyze((const char*)s, &st, &ln, NULL)) + if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY)) break; // valid character, skip after it s += st + ln; @@ -182,14 +255,14 @@ size_t u8_strnlen(const char *_s, size_t n) } // invalid, skip u8_analyze - if (*s <= 0xC2) + if (*s < 0xC2) { ++s; --n; continue; } - if (!u8_analyze((const char*)s, &st, &ln, NULL)) + if (!u8_analyze((const char*)s, &st, &ln, NULL, n)) break; // valid character, see if it's still inside the range specified by n: if (n < st + ln) @@ -212,8 +285,10 @@ size_t u8_bytelen(const char *_s, size_t n) size_t len = 0; const unsigned char *s = (const unsigned char*)_s; - if (!utf8_enable.integer) - return n; + if (!utf8_enable.integer) { + len = strlen(_s); + return (len < n) ? 
len : n; + } while (*s && n) { @@ -227,14 +302,14 @@ size_t u8_bytelen(const char *_s, size_t n) } // invalid, skip u8_analyze - if (*s <= 0xC2) + if (*s < 0xC2) { ++s; ++len; continue; } - if (!u8_analyze((const char*)s, &st, &ln, NULL)) + if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY)) break; --n; s += st + ln; @@ -257,6 +332,12 @@ int u8_byteofs(const char *_s, size_t i, size_t *len) if (!utf8_enable.integer) { + if (strlen(_s) < i) + { + if (len) *len = 0; + return -1; + } + if (len) *len = 1; return i; } @@ -265,7 +346,7 @@ int u8_byteofs(const char *_s, size_t i, size_t *len) do { ofs += ln; - if (!u8_analyze((const char*)s + ofs, &st, &ln, NULL)) + if (!u8_analyze((const char*)s + ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY)) return -1; ofs += st; } while(i-- > 0); @@ -306,13 +387,13 @@ int u8_charidx(const char *_s, size_t i, size_t *len) } // invalid, skip u8_analyze - if (s[ofs] <= 0xC2) + if (s[ofs] < 0xC2) { ++ofs; continue; } - if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL)) + if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY)) return -1; // see if next char is after the bytemark if (ofs + st > i) @@ -367,13 +448,13 @@ size_t u8_prevbyte(const char *_s, size_t i) } // invalid, skip u8_analyze - if (s[ofs] <= 0xC2) + if (s[ofs] < 0xC2) { ++ofs; continue; } - if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL)) + if (!u8_analyze((const char*)s+ofs, &st, &ln, NULL, U8_ANALYZE_INFINITY)) return lastofs; if (ofs + st > i) return lastofs; @@ -386,51 +467,54 @@ size_t u8_prevbyte(const char *_s, size_t i) return lastofs; } -static int char_usefont[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // specials - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // specials - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // shift+digit line - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // digits - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // caps - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 
caps - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // small - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // small - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // specials - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // faces - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +Uchar u8_quake2utf8map[256] = { + 0xE000, 0xE001, 0xE002, 0xE003, 0xE004, 0xE005, 0xE006, 0xE007, 0xE008, 0xE009, 0xE00A, 0xE00B, 0xE00C, 0xE00D, 0xE00E, 0xE00F, // specials + 0xE010, 0xE011, 0xE012, 0xE013, 0xE014, 0xE015, 0xE016, 0xE017, 0xE018, 0xE019, 0xE01A, 0xE01B, 0xE01C, 0xE01D, 0xE01E, 0xE01F, // specials + 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, // shift+digit line + 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // digits + 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // caps + 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, // caps + 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, // small + 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, // small + 0xE080, 0xE081, 0xE082, 0xE083, 0xE084, 0xE085, 0xE086, 0xE087, 0xE088, 0xE089, 0xE08A, 0xE08B, 0xE08C, 0xE08D, 0xE08E, 0xE08F, // specials + 0xE090, 0xE091, 0xE092, 0xE093, 0xE094, 0xE095, 0xE096, 0xE097, 0xE098, 0xE099, 0xE09A, 0xE09B, 0xE09C, 0xE09D, 0xE09E, 0xE09F, // faces + 0xE0A0, 
0xE0A1, 0xE0A2, 0xE0A3, 0xE0A4, 0xE0A5, 0xE0A6, 0xE0A7, 0xE0A8, 0xE0A9, 0xE0AA, 0xE0AB, 0xE0AC, 0xE0AD, 0xE0AE, 0xE0AF, + 0xE0B0, 0xE0B1, 0xE0B2, 0xE0B3, 0xE0B4, 0xE0B5, 0xE0B6, 0xE0B7, 0xE0B8, 0xE0B9, 0xE0BA, 0xE0BB, 0xE0BC, 0xE0BD, 0xE0BE, 0xE0BF, + 0xE0C0, 0xE0C1, 0xE0C2, 0xE0C3, 0xE0C4, 0xE0C5, 0xE0C6, 0xE0C7, 0xE0C8, 0xE0C9, 0xE0CA, 0xE0CB, 0xE0CC, 0xE0CD, 0xE0CE, 0xE0CF, + 0xE0D0, 0xE0D1, 0xE0D2, 0xE0D3, 0xE0D4, 0xE0D5, 0xE0D6, 0xE0D7, 0xE0D8, 0xE0D9, 0xE0DA, 0xE0DB, 0xE0DC, 0xE0DD, 0xE0DE, 0xE0DF, + 0xE0E0, 0xE0E1, 0xE0E2, 0xE0E3, 0xE0E4, 0xE0E5, 0xE0E6, 0xE0E7, 0xE0E8, 0xE0E9, 0xE0EA, 0xE0EB, 0xE0EC, 0xE0ED, 0xE0EE, 0xE0EF, + 0xE0F0, 0xE0F1, 0xE0F2, 0xE0F3, 0xE0F4, 0xE0F5, 0xE0F6, 0xE0F7, 0xE0F8, 0xE0F9, 0xE0FA, 0xE0FB, 0xE0FC, 0xE0FD, 0xE0FE, 0xE0FF, }; +/** Fetch a character from an utf-8 encoded string. + * @param _s The start of an utf-8 encoded multi-byte character. + * @param _end Will point to after the first multi-byte character. + * @return The 32-bit integer representation of the first multi-byte character or 0 for invalid characters. + */ +Uchar u8_getchar_utf8_enabled(const char *_s, const char **_end) +{ + size_t st, ln; + Uchar ch; + + if (!u8_analyze(_s, &st, &ln, &ch, U8_ANALYZE_INFINITY)) + ch = 0; + if (_end) + *_end = _s + st + ln; + return ch; +} /** Fetch a character from an utf-8 encoded string. * @param _s The start of an utf-8 encoded multi-byte character. * @param _end Will point to after the first multi-byte character. * @return The 32-bit integer representation of the first multi-byte character or 0 for invalid characters. */ -Uchar u8_getchar(const char *_s, const char **_end) +Uchar u8_getnchar_utf8_enabled(const char *_s, const char **_end, size_t _maxlen) { size_t st, ln; Uchar ch; - if (!utf8_enable.integer) - { - if (_end) - *_end = _s + 1; - /* Careful: if we disable utf8 but not freetype, we wish to see freetype chars - * for normal letters. 
So use E000+x for special chars, but leave the freetype stuff for the - * rest: - */ - if (!char_usefont[(unsigned int)*(const unsigned char*)_s]) - return 0xE000 + (Uchar)*(const unsigned char*)_s; - return (Uchar)*(const unsigned char*)_s; - } - - if (!u8_analyze(_s, &st, &ln, &ch)) - return 0; + if (!u8_analyze(_s, &st, &ln, &ch, _maxlen)) + ch = 0; if (_end) *_end = _s + st + ln; return ch; @@ -446,10 +530,10 @@ Uchar u8_getchar(const char *_s, const char **_end) int u8_fromchar(Uchar w, char *to, size_t maxlen) { if (maxlen < 1) - return -2; + return 0; if (!w) - return -5; + return 0; if (w >= 0xE000 && !utf8_enable.integer) w -= 0xE000; @@ -501,10 +585,10 @@ int u8_fromchar(Uchar w, char *to, size_t maxlen) to[3] = 0x80 | (w & 0x3F); w >>= 6; to[2] = 0x80 | (w & 0x3F); w >>= 6; to[1] = 0x80 | (w & 0x3F); w >>= 6; - to[0] = 0xE0 | w; + to[0] = 0xF0 | w; return 4; } - return -1; + return 0; } /** uses u8_fromchar on a static buffer @@ -562,10 +646,13 @@ size_t u8_wcstombs(char *mb, const Uchar *wcs, size_t maxlen) return 0; for (i = 0; wcs[i] && i < maxlen-1; ++i) { + /* int len; if ( (len = u8_fromchar(wcs[i], mb, maxlen - i)) < 0) return (mb - start); mb += len; + */ + mb += u8_fromchar(wcs[i], mb, maxlen - i); } *mb = 0; return (mb - start); @@ -590,13 +677,15 @@ all characters until the zero terminator. size_t COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid); size_t -u8_COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid) +u8_COM_StringLengthNoColors(const char *_s, size_t size_s, qboolean *valid) { - const char *end; + const unsigned char *s = (const unsigned char*)_s; + const unsigned char *end; size_t len = 0; + size_t st, ln; if (!utf8_enable.integer) - return COM_StringLengthNoColors(s, size_s, valid); + return COM_StringLengthNoColors(_s, size_s, valid); end = size_s ? 
(s + size_s) : NULL; @@ -639,22 +728,73 @@ u8_COM_StringLengthNoColors(const char *s, size_t size_s, qboolean *valid) ++len; // the character break; } - break; + ++s; + continue; default: - ++len; break; } - // start of a wide character - if (*s & 0xC0) + // ascii char, skip u8_analyze + if (*s < 0x80) + { + ++len; + ++s; + continue; + } + + // invalid, skip u8_analyze + if (*s < 0xC2) { - for (++s; *s >= 0x80 && *s <= 0xC0; ++s); + ++s; continue; } - // part of a wide character, we ignore that one - if (*s <= 0xBF) - --len; - ++s; + + if (!u8_analyze((const char*)s, &st, &ln, NULL, U8_ANALYZE_INFINITY)) + { + // we CAN end up here, if an invalid char is between this one and the end of the string + if(valid) + *valid = TRUE; + return len; + } + + if(end && s + st + ln > end) + { + // string length exceeded by new character + if(valid) + *valid = TRUE; + return len; + } + + // valid character, skip after it + s += st + ln; + ++len; } // never get here } + +/** Pads a utf-8 string + * @param out The target buffer the utf-8 string is written to. + * @param outsize The size of the target buffer, including the final NUL + * @param in The input utf-8 buffer + * @param leftalign Left align the output string (by default right alignment is done) + * @param minwidth The minimum output width + * @param maxwidth The maximum output width + * @return The number of bytes written, not including the terminating \0 + */ +size_t u8_strpad(char *out, size_t outsize, const char *in, qboolean leftalign, size_t minwidth, size_t maxwidth) +{ + if(!utf8_enable.integer) + { + return dpsnprintf(out, outsize, "%*.*s", leftalign ? -(int) minwidth : (int) minwidth, (int) maxwidth, in); + } + else + { + size_t l = u8_bytelen(in, maxwidth); + size_t actual_width = u8_strnlen(in, l); + int pad = (actual_width >= minwidth) ? 0 : (minwidth - actual_width); + int prec = l; + int lpad = leftalign ? 0 : pad; + int rpad = leftalign ? 
pad : 0; + return dpsnprintf(out, outsize, "%*s%.*s%*s", lpad, "", prec, in, rpad, ""); + } +}