utf8.c

   1 /*
   2  * Copyright (C) 2012
   3  *     Wolfgang Bumiller
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a copy of
   6  * this software and associated documentation files (the "Software"), to deal in
   7  * the Software without restriction, including without limitation the rights to
   8  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
   9  * of the Software, and to permit persons to whom the Software is furnished to do
  10  * so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be included in all
  13  * copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23 #include "gmqcc.h"
  24
  25 static unsigned char utf8_lengths[256] = {
  26         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii characters */
  27         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  28         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  29         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  30         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  31         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  32         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  33         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  34         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0xBF are within multibyte sequences
  35         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  * they could be interpreted as 2-byte starts but
  36         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  * the codepoint would be < 127
  37         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  *
  38         0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  * C0 and C1 would also result in overlong encodings
  39         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  */
  40         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  41         4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  42         /* with F5 the codepoint is above 0x10FFFF,
  43          * F8-FB would start 5-byte sequences
  44          * FC-FD would start 6-byte sequences
  45          * ...
  46          */
  47 };
  48
  49 static uchar_t utf8_range[5] = {
  50         1,       /* invalid - let's not allow the creation of 0-bytes :P
  51         1,        * ascii minimum
  52         0x80,     * 2-byte minimum
  53         0x800,    * 3-byte minimum
  54         0x10000,  * 4-byte minimum */
  55 };
  56
  57 /** Analyze the next character and return various information if requested.
  58  * @param _s      An utf-8 string.
  59  * @param _start  Filled with the start byte-offset of the next valid character
  60  * @param _len    Filled with the length of the next valid character
  61  * @param _ch     Filled with the unicode value of the next character
  62  * @param _maxlen Maximum number of bytes to read from _s
  63  * @return        Whether or not another valid character is in the string
  64  */
  65 bool u8_analyze(const char *_s, size_t *_start, size_t *_len, uchar_t *_ch, size_t _maxlen)
  66 {
  67         const unsigned char *s = (const unsigned char*)_s;
  68         size_t i, j;
  69         size_t bits = 0;
  70         uchar_t ch;
  71
  72         i = 0;
  73 /* findchar: */
  74         while (i < _maxlen && s[i] && (bits = utf8_lengths[s[i]]) == 0)
  75                 ++i;
  76
  77         if (i >= _maxlen || !s[i]) {
  78                 if (_start) *_start = i;
  79                 if (_len) *_len = 0;
  80                 return false;
  81         }
  82
  83         if (bits == 1) { /* ascii */
  84                 if (_start) *_start = i;
  85                 if (_len) *_len = 1;
  86                 if (_ch) *_ch = (uchar_t)s[i];
  87                 return true;
  88         }
  89
  90         ch = (s[i] & (0xFF >> bits));
  91         for (j = 1; j < bits; ++j)
  92         {
  93                 if ( (s[i+j] & 0xC0) != 0x80 )
  94                 {
  95                         i += j;
  96                         /* in gmqcc, invalid / overlong encodings are considered an error
  97                          * goto findchar;
  98                          */
  99                         if (!s[i]) goto done;
 100                         return false;
 101                 }
 102                 ch = (ch << 6) | (s[i+j] & 0x3F);
 103         }
 104         if (ch < utf8_range[bits] || ch >= 0x10FFFF)
 105         {
 106                 /* same: error
 107                  * i += bits;
 108                  * goto findchar;
 109                  */
 110                 return false;
 111         }
 112
 113 done:
 114         if (_start)
 115                 *_start = i;
 116         if (_len)
 117                 *_len = bits;
 118         if (_ch)
 119                 *_ch = ch;
 120         return true;
 121 }
 122
 123 /* might come in handy */
 124 size_t u8_strlen(const char *_s)
 125 {
 126         size_t st, ln;
 127         size_t len = 0;
 128         const unsigned char *s = (const unsigned char*)_s;
 129
 130         while (*s)
 131         {
 132                 /* ascii char, skip u8_analyze */
 133                 if (*s < 0x80)
 134                 {
 135                         ++len;
 136                         ++s;
 137                         continue;
 138                 }
 139
 140                 /* invalid, skip u8_analyze */
 141                 if (*s < 0xC2)
 142                 {
 143                         ++s;
 144                         continue;
 145                 }
 146
 147                 if (!u8_analyze((const char*)s, &st, &ln, NULL, 0x10))
 148                         break;
 149                 /* valid character, skip after it */
 150                 s += st + ln;
 151                 ++len;
 152         }
 153         return len;
 154 }
 155
 156 size_t u8_strnlen(const char *_s, size_t n)
 157 {
 158         size_t st, ln;
 159         size_t len = 0;
 160         const unsigned char *s = (const unsigned char*)_s;
 161
 162         while (*s && n)
 163         {
 164                 /* ascii char, skip u8_analyze */
 165                 if (*s < 0x80)
 166                 {
 167                         ++len;
 168                         ++s;
 169                         --n;
 170                         continue;
 171                 }
 172
 173                 /* invalid, skip u8_analyze */
 174                 if (*s < 0xC2)
 175                 {
 176                         ++s;
 177                         --n;
 178                         continue;
 179                 }
 180
 181                 if (!u8_analyze((const char*)s, &st, &ln, NULL, n))
 182                         break;
 183                 /* valid character, see if it's still inside the range specified by n: */
 184                 if (n < st + ln)
 185                         return len;
 186                 ++len;
 187                 n -= st + ln;
 188                 s += st + ln;
 189         }
 190         return len;
 191 }
 192
 193 /* Required for character constants */
 194 uchar_t u8_getchar(const char *_s, const char **_end)
 195 {
 196         size_t st, ln;
 197         uchar_t ch;
 198
 199         if (!u8_analyze(_s, &st, &ln, &ch, 0x10))
 200                 ch = 0;
 201         if (_end)
 202                 *_end = _s + st + ln;
 203         return ch;
 204 }
 205
 206 uchar_t u8_getnchar(const char *_s, const char **_end, size_t _maxlen)
 207 {
 208         size_t st, ln;
 209         uchar_t ch;
 210
 211         if (!u8_analyze(_s, &st, &ln, &ch, _maxlen))
 212                 ch = 0;
 213         if (_end)
 214                 *_end = _s + st + ln;
 215         return ch;
 216 }
 217
 218 /* required for \x{asdf}-like string escape sequences */
 219 int u8_fromchar(uchar_t w, char *to, size_t maxlen)
 220 {
 221         if (maxlen < 1)
 222                 return 0;
 223
 224         if (!w)
 225                 return 0;
 226
 227 /* We may want an -f flag for this behaviour...
 228         if (w >= 0xE000)
 229                 w -= 0xE000;
 230 */
 231
 232         if (w < 0x80)
 233         {
 234                 to[0] = (char)w;
 235                 if (maxlen < 2)
 236                         return -1;
 237                 to[1] = 0;
 238                 return 1;
 239         }
 240         /* for a little speedup */
 241         if (w < 0x800)
 242         {
 243                 if (maxlen < 3)
 244                 {
 245                         to[0] = 0;
 246                         return -1;
 247                 }
 248                 to[2] = 0;
 249                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 250                 to[0] = 0xC0 | w;
 251                 return 2;
 252         }
 253         if (w < 0x10000)
 254         {
 255                 if (maxlen < 4)
 256                 {
 257                         to[0] = 0;
 258                         return -1;
 259                 }
 260                 to[3] = 0;
 261                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 262                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 263                 to[0] = 0xE0 | w;
 264                 return 3;
 265         }
 266
 267         /* RFC 3629 */
 268         if (w <= 0x10FFFF)
 269         {
 270                 if (maxlen < 5)
 271                 {
 272                         to[0] = 0;
 273                         return -1;
 274                 }
 275                 to[4] = 0;
 276                 to[3] = 0x80 | (w & 0x3F); w >>= 6;
 277                 to[2] = 0x80 | (w & 0x3F); w >>= 6;
 278                 to[1] = 0x80 | (w & 0x3F); w >>= 6;
 279                 to[0] = 0xF0 | w;
 280                 return 4;
 281         }
 282         return 0;
 283 }