utf8lib.c

   1 #include "gmqcc.h"
   2
   3 static unsigned char utf8_lengths[256] = {
   4         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii characters */
   5         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   6         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   7         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   8         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   9         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  10         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  11         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  12         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0xBF are within multibyte sequences
  13         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  * they could be interpreted as 2-byte starts but
  14         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  * the codepoint would be < 127
  15         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  *
  16         0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  * C0 and C1 would also result in overlong encodings
  17         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  */
  18         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  19         4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  20         /* with F5 the codepoint is above 0x10FFFF,
  21          * F8-FB would start 5-byte sequences
  22          * FC-FD would start 6-byte sequences
  23          * ...
  24          */
  25 };
  26
  27 static Uchar utf8_range[5] = {
  28         1,       /* invalid - let's not allow the creation of 0-bytes :P
  29         1,        * ascii minimum
  30         0x80,     * 2-byte minimum
  31         0x800,    * 3-byte minimum
  32         0x10000,  * 4-byte minimum */
  33 };
  34
  35 /** Analyze the next character and return various information if requested.
  36  * @param _s      An utf-8 string.
  37  * @param _start  Filled with the start byte-offset of the next valid character
  38  * @param _len    Fileed with the length of the next valid character
  39  * @param _ch     Filled with the unicode value of the next character
  40  * @param _maxlen Maximum number of bytes to read from _s
  41  * @return        Whether or not another valid character is in the string
  42  */
  43 static bool u8_analyze(const char *_s, size_t *_start, size_t *_len, Uchar *_ch, size_t _maxlen)
  44 {
  45         const unsigned char *s = (const unsigned char*)_s;
  46         size_t i, j;
  47         size_t bits = 0;
  48         Uchar ch;
  49
  50         i = 0;
  51 /* findchar: */
  52         while (i < _maxlen && s[i] && (bits = utf8_lengths[s[i]]) == 0)
  53                 ++i;
  54
  55         if (i >= _maxlen || !s[i]) {
  56                 if (_start) *_start = i;
  57                 if (_len) *_len = 0;
  58                 return false;
  59         }
  60
  61         if (bits == 1) { /* ascii */
  62                 if (_start) *_start = i;
  63                 if (_len) *_len = 1;
  64                 if (_ch) *_ch = (Uchar)s[i];
  65                 return true;
  66         }
  67
  68         ch = (s[i] & (0xFF >> bits));
  69         for (j = 1; j < bits; ++j)
  70         {
  71                 if ( (s[i+j] & 0xC0) != 0x80 )
  72                 {
  73                         i += j;
  74                         /* in gmqcc, invalid / overlong encodings are considered an error
  75                          * goto findchar;
  76                          */
  77                         return false;
  78                 }
  79                 ch = (ch << 6) | (s[i+j] & 0x3F);
  80         }
  81         if (ch < utf8_range[bits] || ch >= 0x10FFFF)
  82         {
  83                 /* same: error
  84                  * i += bits;
  85                  * goto findchar;
  86                  */
  87                 return false;
  88         }
  89
  90         if (_start)
  91                 *_start = i;
  92         if (_len)
  93                 *_len = bits;
  94         if (_ch)
  95                 *_ch = ch;
  96         return true;
  97 }
  98
  99 /* might come in handy */
 100 size_t u8_strlen(const char *_s)
 101 {
 102         size_t st, ln;
 103         size_t len = 0;
 104         const unsigned char *s = (const unsigned char*)_s;
 105
 106         while (*s)
 107         {
 108                 /* ascii char, skip u8_analyze */
 109                 if (*s < 0x80)
 110                 {
 111                         ++len;
 112                         ++s;
 113                         continue;
 114                 }
 115
 116                 /* invalid, skip u8_analyze */
 117                 if (*s < 0xC2)
 118                 {
 119                         ++s;
 120                         continue;
 121                 }
 122
 123                 if (!u8_analyze((const char*)s, &st, &ln, NULL, 0x10))
 124                         break;
 125                 /* valid character, skip after it */
 126                 s += st + ln;
 127                 ++len;
 128         }
 129         return len;
 130 }
 131
 132 size_t u8_strnlen(const char *_s, size_t n)
 133 {
 134         size_t st, ln;
 135         size_t len = 0;
 136         const unsigned char *s = (const unsigned char*)_s;
 137
 138         while (*s && n)
 139         {
 140                 /* ascii char, skip u8_analyze */
 141                 if (*s < 0x80)
 142                 {
 143                         ++len;
 144                         ++s;
 145                         --n;
 146                         continue;
 147                 }
 148
 149                 /* invalid, skip u8_analyze */
 150                 if (*s < 0xC2)
 151                 {
 152                         ++s;
 153                         --n;
 154                         continue;
 155                 }
 156
 157                 if (!u8_analyze((const char*)s, &st, &ln, NULL, n))
 158                         break;
 159                 /* valid character, see if it's still inside the range specified by n: */
 160                 if (n < st + ln)
 161                         return len;
 162                 ++len;
 163                 n -= st + ln;
 164                 s += st + ln;
 165         }
 166         return len;
 167 }
 168
 169 /* Required for character constants */
 170 Uchar u8_getchar(const char *_s, const char **_end)
 171 {
 172         size_t st, ln;
 173         Uchar ch;
 174
 175         if (!u8_analyze(_s, &st, &ln, &ch, 0x10))
 176                 ch = 0;
 177         if (_end)
 178                 *_end = _s + st + ln;
 179         return ch;
 180 }
 181
 182 Uchar u8_getnchar(const char *_s, const char **_end, size_t _maxlen)
 183 {
 184         size_t st, ln;
 185         Uchar ch;
 186
 187         if (!u8_analyze(_s, &st, &ln, &ch, _maxlen))
 188                 ch = 0;
 189         if (_end)
 190                 *_end = _s + st + ln;
 191         return ch;
 192 }
 193
 194 /* required for \x{asdf}-like string escape sequences */
 195 int u8_fromchar(Uchar w, char *to, size_t maxlen)
 196 {
 197         if (maxlen < 1)
 198                 return 0;
 199
 200         if (!w)
 201                 return 0;
 202
 203 /* We may want an -f flag for this behaviour...
 204         if (w >= 0xE000)
 205                 w -= 0xE000;
 206 */
 207
 208         if (w < 0x80)
 209         {
 210                 to[0] = (char)w;
 211                 if (maxlen < 2)
 212                         return -1;
 213                 to[1] = 0;
 214                 return 1;
 215         }
 216         /* for a little speedup */
 217         if (w < 0x800)
 218         {
 219                 if (maxlen < 3)
 220                 {
 221                         to[0] = 0;
 222                         return -1;
 223                 }
 224                 to[2] = 0;
 225                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 226                 to[0] = 0xC0 | w;
 227                 return 2;
 228         }
 229         if (w < 0x10000)
 230         {
 231                 if (maxlen < 4)
 232                 {
 233                         to[0] = 0;
 234                         return -1;
 235                 }
 236                 to[3] = 0;
 237                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 238                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 239                 to[0] = 0xE0 | w;
 240                 return 3;
 241         }
 242
 243         /* RFC 3629 */
 244         if (w <= 0x10FFFF)
 245         {
 246                 if (maxlen < 5)
 247                 {
 248                         to[0] = 0;
 249                         return -1;
 250                 }
 251                 to[4] = 0;
 252                 to[3] = 0x80 | (w & 0x3F); w >>= 6;
 253                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 254                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 255                 to[0] = 0xF0 | w;
 256                 return 4;
 257         }
 258         return 0;
 259 }