utf8lib.c

   1 #include "gmqcc.h"
   2
   3 static unsigned char utf8_lengths[256] = {
   4         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii characters */
   5         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   6         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   7         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   8         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   9         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  10         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  11         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  12         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0xBF are within multibyte sequences
  13         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  * they could be interpreted as 2-byte starts but
  14         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  * the codepoint would be < 127
  15         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  *
  16         0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  * C0 and C1 would also result in overlong encodings
  17         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  */
  18         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  19         4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  20         /* with F5 the codepoint is above 0x10FFFF,
  21          * F8-FB would start 5-byte sequences
  22          * FC-FD would start 6-byte sequences
  23          * ...
  24          */
  25 };
  26
  27 static uchar_t utf8_range[5] = {
  28         1,       /* invalid - let's not allow the creation of 0-bytes :P
  29         1,        * ascii minimum
  30         0x80,     * 2-byte minimum
  31         0x800,    * 3-byte minimum
  32         0x10000,  * 4-byte minimum */
  33 };
  34
  35 /** Analyze the next character and return various information if requested.
  36  * @param _s      An utf-8 string.
  37  * @param _start  Filled with the start byte-offset of the next valid character
  38  * @param _len    Filled with the length of the next valid character
  39  * @param _ch     Filled with the unicode value of the next character
  40  * @param _maxlen Maximum number of bytes to read from _s
  41  * @return        Whether or not another valid character is in the string
  42  */
  43 bool u8_analyze(const char *_s, size_t *_start, size_t *_len, uchar_t *_ch, size_t _maxlen)
  44 {
  45         const unsigned char *s = (const unsigned char*)_s;
  46         size_t i, j;
  47         size_t bits = 0;
  48         uchar_t ch;
  49
  50         i = 0;
  51 /* findchar: */
  52         while (i < _maxlen && s[i] && (bits = utf8_lengths[s[i]]) == 0)
  53                 ++i;
  54
  55         if (i >= _maxlen || !s[i]) {
  56                 if (_start) *_start = i;
  57                 if (_len) *_len = 0;
  58                 return false;
  59         }
  60
  61         if (bits == 1) { /* ascii */
  62                 if (_start) *_start = i;
  63                 if (_len) *_len = 1;
  64                 if (_ch) *_ch = (uchar_t)s[i];
  65                 return true;
  66         }
  67
  68         ch = (s[i] & (0xFF >> bits));
  69         for (j = 1; j < bits; ++j)
  70         {
  71                 if ( (s[i+j] & 0xC0) != 0x80 )
  72                 {
  73                         i += j;
  74                         /* in gmqcc, invalid / overlong encodings are considered an error
  75                          * goto findchar;
  76                          */
  77                         if (!s[i]) goto done;
  78                         return false;
  79                 }
  80                 ch = (ch << 6) | (s[i+j] & 0x3F);
  81         }
  82         if (ch < utf8_range[bits] || ch >= 0x10FFFF)
  83         {
  84                 /* same: error
  85                  * i += bits;
  86                  * goto findchar;
  87                  */
  88                 return false;
  89         }
  90
  91 done:
  92         if (_start)
  93                 *_start = i;
  94         if (_len)
  95                 *_len = bits;
  96         if (_ch)
  97                 *_ch = ch;
  98         return true;
  99 }
 100
 101 /* might come in handy */
 102 size_t u8_strlen(const char *_s)
 103 {
 104         size_t st, ln;
 105         size_t len = 0;
 106         const unsigned char *s = (const unsigned char*)_s;
 107
 108         while (*s)
 109         {
 110                 /* ascii char, skip u8_analyze */
 111                 if (*s < 0x80)
 112                 {
 113                         ++len;
 114                         ++s;
 115                         continue;
 116                 }
 117
 118                 /* invalid, skip u8_analyze */
 119                 if (*s < 0xC2)
 120                 {
 121                         ++s;
 122                         continue;
 123                 }
 124
 125                 if (!u8_analyze((const char*)s, &st, &ln, NULL, 0x10))
 126                         break;
 127                 /* valid character, skip after it */
 128                 s += st + ln;
 129                 ++len;
 130         }
 131         return len;
 132 }
 133
 134 size_t u8_strnlen(const char *_s, size_t n)
 135 {
 136         size_t st, ln;
 137         size_t len = 0;
 138         const unsigned char *s = (const unsigned char*)_s;
 139
 140         while (*s && n)
 141         {
 142                 /* ascii char, skip u8_analyze */
 143                 if (*s < 0x80)
 144                 {
 145                         ++len;
 146                         ++s;
 147                         --n;
 148                         continue;
 149                 }
 150
 151                 /* invalid, skip u8_analyze */
 152                 if (*s < 0xC2)
 153                 {
 154                         ++s;
 155                         --n;
 156                         continue;
 157                 }
 158
 159                 if (!u8_analyze((const char*)s, &st, &ln, NULL, n))
 160                         break;
 161                 /* valid character, see if it's still inside the range specified by n: */
 162                 if (n < st + ln)
 163                         return len;
 164                 ++len;
 165                 n -= st + ln;
 166                 s += st + ln;
 167         }
 168         return len;
 169 }
 170
 171 /* Required for character constants */
 172 uchar_t u8_getchar(const char *_s, const char **_end)
 173 {
 174         size_t st, ln;
 175         uchar_t ch;
 176
 177         if (!u8_analyze(_s, &st, &ln, &ch, 0x10))
 178                 ch = 0;
 179         if (_end)
 180                 *_end = _s + st + ln;
 181         return ch;
 182 }
 183
 184 uchar_t u8_getnchar(const char *_s, const char **_end, size_t _maxlen)
 185 {
 186         size_t st, ln;
 187         uchar_t ch;
 188
 189         if (!u8_analyze(_s, &st, &ln, &ch, _maxlen))
 190                 ch = 0;
 191         if (_end)
 192                 *_end = _s + st + ln;
 193         return ch;
 194 }
 195
 196 /* required for \x{asdf}-like string escape sequences */
 197 int u8_fromchar(uchar_t w, char *to, size_t maxlen)
 198 {
 199         if (maxlen < 1)
 200                 return 0;
 201
 202         if (!w)
 203                 return 0;
 204
 205 /* We may want an -f flag for this behaviour...
 206         if (w >= 0xE000)
 207                 w -= 0xE000;
 208 */
 209
 210         if (w < 0x80)
 211         {
 212                 to[0] = (char)w;
 213                 if (maxlen < 2)
 214                         return -1;
 215                 to[1] = 0;
 216                 return 1;
 217         }
 218         /* for a little speedup */
 219         if (w < 0x800)
 220         {
 221                 if (maxlen < 3)
 222                 {
 223                         to[0] = 0;
 224                         return -1;
 225                 }
 226                 to[2] = 0;
 227                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 228                 to[0] = 0xC0 | w;
 229                 return 2;
 230         }
 231         if (w < 0x10000)
 232         {
 233                 if (maxlen < 4)
 234                 {
 235                         to[0] = 0;
 236                         return -1;
 237                 }
 238                 to[3] = 0;
 239                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 240                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 241                 to[0] = 0xE0 | w;
 242                 return 3;
 243         }
 244
 245         /* RFC 3629 */
 246         if (w <= 0x10FFFF)
 247         {
 248                 if (maxlen < 5)
 249                 {
 250                         to[0] = 0;
 251                         return -1;
 252                 }
 253                 to[4] = 0;
 254                 to[3] = 0x80 | (w & 0x3F); w >>= 6;
 255                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 256                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 257                 to[0] = 0xF0 | w;
 258                 return 4;
 259         }
 260         return 0;
 261 }