-static unsigned char utf8_lengths[256] = {
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii characters */
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0xBF are within multibyte sequences */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* they could be interpreted as 2-byte starts but */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* the codepoint would be < 127 */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* */
- 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0 and C1 would also result in overlong encodings */
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* */
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- /* with F5 the codepoint is above 0x10FFFF,
- * F8-FB would start 5-byte sequences
- * FC-FD would start 6-byte sequences
- * ...
- */
-};
-
-static uchar_t utf8_range[5] = {
- 1, /* invalid - let's not allow the creation of 0-bytes :P */
- 1, /* ascii minimum */
- 0x80, /* 2-byte minimum */
- 0x800, /* 3-byte minimum */
- 0x10000, /* 4-byte minimum */
-};
-
-/** Analyze the next character and return various information if requested.
- * @param _s An utf-8 string.
- * @param _start Filled with the start byte-offset of the next valid character
- * @param _len Filled with the length of the next valid character
- * @param _ch Filled with the unicode value of the next character
- * @param _maxlen Maximum number of bytes to read from _s
- * @return Whether or not another valid character is in the string
+/*
+ * Based on the flexible and economical utf8 decoder:
+ * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+ *
+ * This is slightly more economical, the fastest way to decode utf8 is
+ * with a lookup table as in:
+ *
+ * first 1-byte lookup
+ * if that fails, 2-byte lookup
+ * if that fails, 3-byte lookup
+ * if that fails, 4-byte lookup
+ *
+ * The following table can be generated with some interval trickery.
+ * consider an interval [a, b):
+ *
+ * a must be 0x80 or b must be 0xc0, lower 3 bits
+ * are clear, thus:
+ * interval(a,b) = ((uint32_t)((a==0x80?0x40-b:-a)<<23))
+ *
+ * The failstate can be represented as interval(0x80,0x80), it's
+ * odd to see but this is a full state machine.
+ *
+ * The table than maps the corresponding sections as a serise of
+ * intervals.
+ *
+ * In this table the transition values are pre-multiplied with 16 to
+ * save a shift instruction for every byte, we throw away fillers
+ * which makes the table smaller.
+ *
+ * The first section of the table handles bytes with leading C
+ * The second section of the table handles bytes with leading D
+ * The third section of the table handles bytes with leading E
+ * The last section of the table handles bytes with leading F
+ *
+ * The values themselfs in the table are arranged so that when you
+ * left shift them by 6 to shift continuation characters into place, the
+ * new top bits tell you:
+ *
+ * 1 - if you keep going
+ * 2 - the range of valid values for the next byte