utf8lib.c

   1 #include "gmqcc.h"
   2
   /* Sequence length implied by each possible first byte of a UTF-8 character.
    * An entry of 0 marks a byte that can never start a valid character.
    *
    * Every comment below is closed on its own line on purpose: a comment
    * left open across rows silently removes those rows from the initializer.
    */
   static const unsigned char utf8_lengths[256] = {
           /* 0x00 - 0x7F: ascii characters */
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           /* 0x80 - 0xBF: bytes that only occur inside multibyte sequences,
            * so they never start a character */
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           /* 0xC0 - 0xDF: 2-byte starts; C0 and C1 are excluded because they
            * could only produce overlong encodings of ascii codepoints */
           0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
           /* 0xE0 - 0xEF: 3-byte starts */
           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
           /* 0xF0 - 0xF4: 4-byte starts.
            * With F5 and above the codepoint is above 0x10FFFF,
            * F8-FB would start 5-byte sequences,
            * FC-FD would start 6-byte sequences. */
           4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
   };
  26
  27 static Uchar utf8_range[5] = {
  28         1,       /* invalid - let's not allow the creation of 0-bytes :P
  29         1,        * ascii minimum
  30         0x80,     * 2-byte minimum
  31         0x800,    * 3-byte minimum
  32         0x10000,  * 4-byte minimum */
  33 };
  34
  35 /** Analyze the next character and return various information if requested.
  36  * @param _s      An utf-8 string.
  37  * @param _start  Filled with the start byte-offset of the next valid character
  38  * @param _len    Fileed with the length of the next valid character
  39  * @param _ch     Filled with the unicode value of the next character
  40  * @param _maxlen Maximum number of bytes to read from _s
  41  * @return        Whether or not another valid character is in the string
  42  */
  43 static bool u8_analyze(const char *_s, size_t *_start, size_t *_len, Uchar *_ch, size_t _maxlen)
  44 {
  45         const unsigned char *s = (const unsigned char*)_s;
  46         size_t i, j;
  47         size_t bits = 0;
  48         Uchar ch;
  49
  50         i = 0;
  51 findchar:
  52         while (i < _maxlen && s[i] && (bits = utf8_lengths[s[i]]) == 0)
  53                 ++i;
  54
  55         if (i >= _maxlen || !s[i]) {
  56                 if (_start) *_start = i;
  57                 if (_len) *_len = 0;
  58                 return false;
  59         }
  60
  61         if (bits == 1) { /* ascii */
  62                 if (_start) *_start = i;
  63                 if (_len) *_len = 1;
  64                 if (_ch) *_ch = (Uchar)s[i];
  65                 return true;
  66         }
  67
  68         ch = (s[i] & (0xFF >> bits));
  69         for (j = 1; j < bits; ++j)
  70         {
  71                 if ( (s[i+j] & 0xC0) != 0x80 )
  72                 {
  73                         i += j;
  74                         goto findchar;
  75                 }
  76                 ch = (ch << 6) | (s[i+j] & 0x3F);
  77         }
  78         if (ch < utf8_range[bits] || ch >= 0x10FFFF)
  79         {
  80                 i += bits;
  81                 goto findchar;
  82         }
  83
  84         if (_start)
  85                 *_start = i;
  86         if (_len)
  87                 *_len = bits;
  88         if (_ch)
  89                 *_ch = ch;
  90         return true;
  91 }
  92
  /* might come in handy */
  /** Count the valid utf-8 characters in a NUL-terminated string.
   * Bytes that do not belong to a valid character are skipped and not counted.
   * @param _s  A NUL-terminated utf-8 string.
   * @return    The number of valid characters before the terminator.
   */
  size_t u8_strlen(const char *_s)
  {
          size_t st, ln;
          size_t len = 0;
          const unsigned char *s = (const unsigned char*)_s;

          while (*s)
          {
                  /* ascii char, skip u8_analyze */
                  if (*s < 0x80)
                  {
                          ++len;
                          ++s;
                          continue;
                  }

                  /* invalid, skip u8_analyze */
                  if (*s < 0xC2)
                  {
                          ++s;
                          continue;
                  }

                  /* The string is NUL-terminated, so let the scan run up to
                   * the terminator. With a small fixed window a run of
                   * invalid bytes longer than the window would make the
                   * analysis fail and end the count although more
                   * characters follow. */
                  if (!u8_analyze((const char*)s, &st, &ln, NULL, (size_t)-1))
                          break;
                  /* valid character, skip after it */
                  s += st + ln;
                  ++len;
          }
          return len;
  }
 125
 /** Count the valid utf-8 characters within the first n bytes of a string.
  * Counting stops at a NUL byte or after n bytes, whichever comes first.
  * A character that does not fit completely into the n bytes is not counted.
  * @param _s  An utf-8 string.
  * @param n   Maximum number of bytes to examine.
  * @return    The number of valid characters found.
  */
 size_t u8_strnlen(const char *_s, size_t n)
 {
         size_t st, ln;
         size_t len = 0;
         const unsigned char *s = (const unsigned char*)_s;

         /* n is tested first so that the byte just past the limit is never read */
         while (n && *s)
         {
                 /* ascii char, skip u8_analyze */
                 if (*s < 0x80)
                 {
                         ++len;
                         ++s;
                         --n;
                         continue;
                 }

                 /* invalid, skip u8_analyze */
                 if (*s < 0xC2)
                 {
                         ++s;
                         --n;
                         continue;
                 }

                 if (!u8_analyze((const char*)s, &st, &ln, NULL, n))
                         break;
                 /* valid character, see if it's still inside the range specified by n: */
                 if (n < st + ln)
                         return len;
                 ++len;
                 n -= st + ln;
                 s += st + ln;
         }
         return len;
 }
 162
 163 /* Required for character constants */
 164 Uchar u8_getchar(const char *_s, const char **_end)
 165 {
 166         size_t st, ln;
 167         Uchar ch;
 168
 169         if (!u8_analyze(_s, &st, &ln, &ch, 0x10))
 170                 ch = 0;
 171         if (_end)
 172                 *_end = _s + st + ln;
 173         return ch;
 174 }
 175
 176 Uchar u8_getnchar(const char *_s, const char **_end, size_t _maxlen)
 177 {
 178         size_t st, ln;
 179         Uchar ch;
 180
 181         if (!u8_analyze(_s, &st, &ln, &ch, _maxlen))
 182                 ch = 0;
 183         if (_end)
 184                 *_end = _s + st + ln;
 185         return ch;
 186 }
 187
 188 /* required for \x{asdf}-like string escape sequences */
 189 int u8_fromchar(Uchar w, char *to, size_t maxlen)
 190 {
 191         if (maxlen < 1)
 192                 return 0;
 193
 194         if (!w)
 195                 return 0;
 196
 197 /* We may want an -f flag for this behaviour...
 198         if (w >= 0xE000)
 199                 w -= 0xE000;
 200 */
 201
 202         if (w < 0x80)
 203         {
 204                 to[0] = (char)w;
 205                 if (maxlen < 2)
 206                         return -1;
 207                 to[1] = 0;
 208                 return 1;
 209         }
 210         /* for a little speedup */
 211         if (w < 0x800)
 212         {
 213                 if (maxlen < 3)
 214                 {
 215                         to[0] = 0;
 216                         return -1;
 217                 }
 218                 to[2] = 0;
 219                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 220                 to[0] = 0xC0 | w;
 221                 return 2;
 222         }
 223         if (w < 0x10000)
 224         {
 225                 if (maxlen < 4)
 226                 {
 227                         to[0] = 0;
 228                         return -1;
 229                 }
 230                 to[3] = 0;
 231                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 232                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 233                 to[0] = 0xE0 | w;
 234                 return 3;
 235         }
 236
 237         /* RFC 3629 */
 238         if (w <= 0x10FFFF)
 239         {
 240                 if (maxlen < 5)
 241                 {
 242                         to[0] = 0;
 243                         return -1;
 244                 }
 245                 to[4] = 0;
 246                 to[3] = 0x80 | (w & 0x3F); w >>= 6;
 247                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 248                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 249                 to[0] = 0xF0 | w;
 250                 return 4;
 251         }
 252         return 0;
 253 }