lexer.c

   1 /*
   2  * Copyright (C) 2012, 2013
   3  *     Wolfgang Bumiller
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a copy of
   6  * this software and associated documentation files (the "Software"), to deal in
   7  * the Software without restriction, including without limitation the rights to
   8  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
   9  * of the Software, and to permit persons to whom the Software is furnished to do
  10  * so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be included in all
  13  * copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23 #include <string.h>
  24 #include <stdlib.h>
  25
  26 #include "gmqcc.h"
  27 #include "lexer.h"
  28
  29 /*
  30  * List of Keywords
  31  */
  32
  33 /* original */
  34 static const char *keywords_qc[] = {
  35     "for", "do", "while",
  36     "if", "else",
  37     "local",
  38     "return",
  39     "const"
  40 };
  41 /* For fte/gmgqcc */
  42 static const char *keywords_fg[] = {
  43     "switch", "case", "default",
  44     "struct", "union",
  45     "break", "continue",
  46     "typedef",
  47     "goto",
  48
  49     "__builtin_debug_printtype"
  50 };
  51
  52 /*
  53  * Lexer code
  54  */
  55 static char* *lex_filenames;
  56
  57 static void lexerror(lex_file *lex, const char *fmt, ...)
  58 {
  59     va_list ap;
  60
  61     va_start(ap, fmt);
  62     if (lex)
  63         con_vprintmsg(LVL_ERROR, lex->name, lex->sline, lex->column, "parse error", fmt, ap);
  64     else
  65         con_vprintmsg(LVL_ERROR, "", 0, 0, "parse error", fmt, ap);
  66     va_end(ap);
  67 }
  68
  69 static bool lexwarn(lex_file *lex, int warntype, const char *fmt, ...)
  70 {
  71     bool      r;
  72     lex_ctx_t ctx;
  73     va_list   ap;
  74
  75     ctx.file   = lex->name;
  76     ctx.line   = lex->sline;
  77     ctx.column = lex->column;
  78
  79     va_start(ap, fmt);
  80     r = vcompile_warning(ctx, warntype, fmt, ap);
  81     va_end(ap);
  82     return r;
  83 }
  84
  85
  86 #if 0
  87 token* token_new()
  88 {
  89     token *tok = (token*)mem_a(sizeof(token));
  90     if (!tok)
  91         return NULL;
  92     memset(tok, 0, sizeof(*tok));
  93     return tok;
  94 }
  95
  96 void token_delete(token *self)
  97 {
  98     if (self->next && self->next->prev == self)
  99         self->next->prev = self->prev;
 100     if (self->prev && self->prev->next == self)
 101         self->prev->next = self->next;
 102     MEM_VECTOR_CLEAR(self, value);
 103     mem_d(self);
 104 }
 105
 106 token* token_copy(const token *cp)
 107 {
 108     token* self = token_new();
 109     if (!self)
 110         return NULL;
 111     /* copy the value */
 112     self->value_alloc = cp->value_count + 1;
 113     self->value_count = cp->value_count;
 114     self->value = (char*)mem_a(self->value_alloc);
 115     if (!self->value) {
 116         mem_d(self);
 117         return NULL;
 118     }
 119     memcpy(self->value, cp->value, cp->value_count);
 120     self->value[self->value_alloc-1] = 0;
 121
 122     /* rest */
 123     self->ctx = cp->ctx;
 124     self->ttype = cp->ttype;
 125     memcpy(&self->constval, &cp->constval, sizeof(self->constval));
 126     return self;
 127 }
 128
 129 void token_delete_all(token *t)
 130 {
 131     token *n;
 132
 133     do {
 134         n = t->next;
 135         token_delete(t);
 136         t = n;
 137     } while(t);
 138 }
 139
 140 token* token_copy_all(const token *cp)
 141 {
 142     token *cur;
 143     token *out;
 144
 145     out = cur = token_copy(cp);
 146     if (!out)
 147         return NULL;
 148
 149     while (cp->next) {
 150         cp = cp->next;
 151         cur->next = token_copy(cp);
 152         if (!cur->next) {
 153             token_delete_all(out);
 154             return NULL;
 155         }
 156         cur->next->prev = cur;
 157         cur = cur->next;
 158     }
 159
 160     return out;
 161 }
 162 #else
 163 static void lex_token_new(lex_file *lex)
 164 {
 165 #if 0
 166     if (lex->tok)
 167         token_delete(lex->tok);
 168     lex->tok = token_new();
 169 #else
 170     if (lex->tok.value)
 171         vec_shrinkto(lex->tok.value, 0);
 172
 173     lex->tok.constval.t  = 0;
 174     lex->tok.ctx.line    = lex->sline;
 175     lex->tok.ctx.file    = lex->name;
 176     lex->tok.ctx.column  = lex->column;
 177 #endif
 178 }
 179 #endif
 180
 181 lex_file* lex_open(const char *file)
 182 {
 183     lex_file  *lex;
 184     fs_file_t *in = fs_file_open(file, "rb");
 185
 186     if (!in) {
 187         lexerror(NULL, "open failed: '%s'\n", file);
 188         return NULL;
 189     }
 190
 191     lex = (lex_file*)mem_a(sizeof(*lex));
 192     if (!lex) {
 193         fs_file_close(in);
 194         lexerror(NULL, "out of memory\n");
 195         return NULL;
 196     }
 197
 198     memset(lex, 0, sizeof(*lex));
 199
 200     lex->file    = in;
 201     lex->name    = util_strdup(file);
 202     lex->line    = 1; /* we start counting at 1 */
 203     lex->column  = 0;
 204     lex->peekpos = 0;
 205     lex->eof     = false;
 206
 207     vec_push(lex_filenames, lex->name);
 208     return lex;
 209 }
 210
 211 lex_file* lex_open_string(const char *str, size_t len, const char *name)
 212 {
 213     lex_file *lex;
 214
 215     lex = (lex_file*)mem_a(sizeof(*lex));
 216     if (!lex) {
 217         lexerror(NULL, "out of memory\n");
 218         return NULL;
 219     }
 220
 221     memset(lex, 0, sizeof(*lex));
 222
 223     lex->file = NULL;
 224     lex->open_string        = str;
 225     lex->open_string_length = len;
 226     lex->open_string_pos    = 0;
 227
 228     lex->name    = util_strdup(name ? name : "<string-source>");
 229     lex->line    = 1; /* we start counting at 1 */
 230     lex->peekpos = 0;
 231     lex->eof     = false;
 232     lex->column  = 0;
 233
 234     vec_push(lex_filenames, lex->name);
 235
 236     return lex;
 237 }
 238
 239 void lex_cleanup(void)
 240 {
 241     size_t i;
 242     for (i = 0; i < vec_size(lex_filenames); ++i)
 243         mem_d(lex_filenames[i]);
 244     vec_free(lex_filenames);
 245 }
 246
 247 void lex_close(lex_file *lex)
 248 {
 249     size_t i;
 250     for (i = 0; i < vec_size(lex->frames); ++i)
 251         mem_d(lex->frames[i].name);
 252     vec_free(lex->frames);
 253
 254     if (lex->modelname)
 255         vec_free(lex->modelname);
 256
 257     if (lex->file)
 258         fs_file_close(lex->file);
 259 #if 0
 260     if (lex->tok)
 261         token_delete(lex->tok);
 262 #else
 263     vec_free(lex->tok.value);
 264 #endif
 265     /* mem_d(lex->name); collected in lex_filenames */
 266     mem_d(lex);
 267 }
 268
 269 static int lex_fgetc(lex_file *lex)
 270 {
 271     if (lex->file) {
 272         lex->column++;
 273         return fs_file_getc(lex->file);
 274     }
 275     if (lex->open_string) {
 276         if (lex->open_string_pos >= lex->open_string_length)
 277             return FS_FILE_EOF;
 278         lex->column++;
 279         return lex->open_string[lex->open_string_pos++];
 280     }
 281     return FS_FILE_EOF;
 282 }
 283
/* Get or put-back data
 * The following two functions do NOT understand what kind of data they
 * are working on.
 * They are merely wrapping get/put in order to count line numbers.
 */
 289 static void lex_ungetch(lex_file *lex, int ch);
 290 static int lex_try_trigraph(lex_file *lex, int old)
 291 {
 292     int c2, c3;
 293     c2 = lex_fgetc(lex);
 294     if (!lex->push_line && c2 == '\n') {
 295         lex->line++;
 296         lex->column = 0;
 297     }
 298
 299     if (c2 != '?') {
 300         lex_ungetch(lex, c2);
 301         return old;
 302     }
 303
 304     c3 = lex_fgetc(lex);
 305     if (!lex->push_line && c3 == '\n') {
 306         lex->line++;
 307         lex->column = 0;
 308     }
 309
 310     switch (c3) {
 311         case '=': return '#';
 312         case '/': return '\\';
 313         case '\'': return '^';
 314         case '(': return '[';
 315         case ')': return ']';
 316         case '!': return '|';
 317         case '<': return '{';
 318         case '>': return '}';
 319         case '-': return '~';
 320         default:
 321             lex_ungetch(lex, c3);
 322             lex_ungetch(lex, c2);
 323             return old;
 324     }
 325 }
 326
 327 static int lex_try_digraph(lex_file *lex, int ch)
 328 {
 329     int c2;
 330     c2 = lex_fgetc(lex);
 331     /* we just used fgetc() so count lines
 332      * need to offset a \n the ungetch would recognize
 333      */
 334     if (!lex->push_line && c2 == '\n')
 335         lex->line++;
 336     if      (ch == '<' && c2 == ':')
 337         return '[';
 338     else if (ch == ':' && c2 == '>')
 339         return ']';
 340     else if (ch == '<' && c2 == '%')
 341         return '{';
 342     else if (ch == '%' && c2 == '>')
 343         return '}';
 344     else if (ch == '%' && c2 == ':')
 345         return '#';
 346     lex_ungetch(lex, c2);
 347     return ch;
 348 }
 349
 350 static int lex_getch(lex_file *lex)
 351 {
 352     int ch;
 353
 354     if (lex->peekpos) {
 355         lex->peekpos--;
 356         if (!lex->push_line && lex->peek[lex->peekpos] == '\n') {
 357             lex->line++;
 358             lex->column = 0;
 359         }
 360         return lex->peek[lex->peekpos];
 361     }
 362
 363     ch = lex_fgetc(lex);
 364     if (!lex->push_line && ch == '\n') {
 365         lex->line++;
 366         lex->column = 0;
 367     }
 368     else if (ch == '?')
 369         return lex_try_trigraph(lex, ch);
 370     else if (!lex->flags.nodigraphs && (ch == '<' || ch == ':' || ch == '%'))
 371         return lex_try_digraph(lex, ch);
 372     return ch;
 373 }
 374
 375 static void lex_ungetch(lex_file *lex, int ch)
 376 {
 377     lex->peek[lex->peekpos++] = ch;
 378     lex->column--;
 379     if (!lex->push_line && ch == '\n') {
 380         lex->line--;
 381         lex->column = 0;
 382     }
 383 }
 384
 385 /* classify characters
 386  * some additions to the is*() functions of ctype.h
 387  */
 388
 389 /* Idents are alphanumberic, but they start with alpha or _ */
 390 static bool isident_start(int ch)
 391 {
 392     return util_isalpha(ch) || ch == '_';
 393 }
 394
 395 static bool isident(int ch)
 396 {
 397     return isident_start(ch) || util_isdigit(ch);
 398 }
 399
 400 /* isxdigit_only is used when we already know it's not a digit
 401  * and want to see if it's a hex digit anyway.
 402  */
 403 static bool isxdigit_only(int ch)
 404 {
 405     return (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
 406 }
 407
 408 /* Append a character to the token buffer */
 409 static void lex_tokench(lex_file *lex, int ch)
 410 {
 411     vec_push(lex->tok.value, ch);
 412 }
 413
 414 /* Append a trailing null-byte */
 415 static void lex_endtoken(lex_file *lex)
 416 {
 417     vec_push(lex->tok.value, 0);
 418     vec_shrinkby(lex->tok.value, 1);
 419 }
 420
 421 static bool lex_try_pragma(lex_file *lex)
 422 {
 423     int ch;
 424     char *pragma  = NULL;
 425     char *command = NULL;
 426     char *param   = NULL;
 427     size_t line;
 428
 429     if (lex->flags.preprocessing)
 430         return false;
 431
 432     line = lex->line;
 433
 434     ch = lex_getch(lex);
 435     if (ch != '#') {
 436         lex_ungetch(lex, ch);
 437         return false;
 438     }
 439
 440     for (ch = lex_getch(lex); vec_size(pragma) < 8 && ch >= 'a' && ch <= 'z'; ch = lex_getch(lex))
 441         vec_push(pragma, ch);
 442     vec_push(pragma, 0);
 443
 444     if (ch != ' ' || strcmp(pragma, "pragma")) {
 445         lex_ungetch(lex, ch);
 446         goto unroll;
 447     }
 448
 449     for (ch = lex_getch(lex); vec_size(command) < 32 && ch >= 'a' && ch <= 'z'; ch = lex_getch(lex))
 450         vec_push(command, ch);
 451     vec_push(command, 0);
 452
 453     if (ch != '(') {
 454         lex_ungetch(lex, ch);
 455         goto unroll;
 456     }
 457
 458     for (ch = lex_getch(lex); vec_size(param) < 1024 && ch != ')' && ch != '\n'; ch = lex_getch(lex))
 459         vec_push(param, ch);
 460     vec_push(param, 0);
 461
 462     if (ch != ')') {
 463         lex_ungetch(lex, ch);
 464         goto unroll;
 465     }
 466
 467     if (!strcmp(command, "push")) {
 468         if (!strcmp(param, "line")) {
 469             lex->push_line++;
 470             if (lex->push_line == 1)
 471                 --line;
 472         }
 473         else
 474             goto unroll;
 475     }
 476     else if (!strcmp(command, "pop")) {
 477         if (!strcmp(param, "line")) {
 478             if (lex->push_line)
 479                 lex->push_line--;
 480             if (lex->push_line == 0)
 481                 --line;
 482         }
 483         else
 484             goto unroll;
 485     }
 486     else if (!strcmp(command, "file")) {
 487         lex->name = util_strdup(param);
 488         vec_push(lex_filenames, lex->name);
 489     }
 490     else if (!strcmp(command, "line")) {
 491         line = strtol(param, NULL, 0)-1;
 492     }
 493     else
 494         goto unroll;
 495
 496     lex->line = line;
 497     while (ch != '\n' && ch != FS_FILE_EOF)
 498         ch = lex_getch(lex);
 499     vec_free(command);
 500     vec_free(param);
 501     vec_free(pragma);
 502     return true;
 503
 504 unroll:
 505     if (command) {
 506         vec_pop(command);
 507         while (vec_size(command)) {
 508             lex_ungetch(lex, (unsigned char)vec_last(command));
 509             vec_pop(command);
 510         }
 511         vec_free(command);
 512         lex_ungetch(lex, ' ');
 513     }
 514     if (param) {
 515         vec_pop(param);
 516         while (vec_size(param)) {
 517             lex_ungetch(lex, (unsigned char)vec_last(param));
 518             vec_pop(param);
 519         }
 520         vec_free(param);
 521         lex_ungetch(lex, ' ');
 522     }
 523     if (pragma) {
 524         vec_pop(pragma);
 525         while (vec_size(pragma)) {
 526             lex_ungetch(lex, (unsigned char)vec_last(pragma));
 527             vec_pop(pragma);
 528         }
 529         vec_free(pragma);
 530     }
 531     lex_ungetch(lex, '#');
 532
 533     lex->line = line;
 534     return false;
 535 }
 536
 537 /* Skip whitespace and comments and return the first
 538  * non-white character.
 539  * As this makes use of the above getch() ungetch() functions,
 540  * we don't need to care at all about line numbering anymore.
 541  *
 542  * In theory, this function should only be used at the beginning
 543  * of lexing, or when we *know* the next character is part of the token.
 544  * Otherwise, if the parser throws an error, the linenumber may not be
 545  * the line of the error, but the line of the next token AFTER the error.
 546  *
 547  * This is currently only problematic when using c-like string-continuation,
 548  * since comments and whitespaces are allowed between 2 such strings.
 549  * Example:
 550 printf(   "line one\n"
 551 // A comment
 552           "A continuation of the previous string"
 553 // This line is skipped
 554       , foo);
 555
 556  * In this case, if the parse decides it didn't actually want a string,
 557  * and uses lex->line to print an error, it will show the ', foo);' line's
 558  * linenumber.
 559  *
 560  * On the other hand, the parser is supposed to remember the line of the next
 561  * token's beginning. In this case we would want skipwhite() to be called
 562  * AFTER reading a token, so that the parser, before reading the NEXT token,
 * doesn't store the *comment's* linenumber, but the actual token's linenumber.
 564  *
 565  * THIS SOLUTION
 566  *    here is to store the line of the first character after skipping
 567  *    the initial whitespace in lex->sline, this happens in lex_do.
 568  */
 569 static int lex_skipwhite(lex_file *lex, bool hadwhite)
 570 {
 571     int ch = 0;
 572     bool haswhite = hadwhite;
 573
 574     do
 575     {
 576         ch = lex_getch(lex);
 577         while (ch != FS_FILE_EOF && util_isspace(ch)) {
 578             if (ch == '\n') {
 579                 if (lex_try_pragma(lex))
 580                     continue;
 581             }
 582             if (lex->flags.preprocessing) {
 583                 if (ch == '\n') {
 584                     /* end-of-line */
 585                     /* see if there was whitespace first */
 586                     if (haswhite) { /* (vec_size(lex->tok.value)) { */
 587                         lex_ungetch(lex, ch);
 588                         lex_endtoken(lex);
 589                         return TOKEN_WHITE;
 590                     }
 591                     /* otherwise return EOL */
 592                     return TOKEN_EOL;
 593                 }
 594                 haswhite = true;
 595                 lex_tokench(lex, ch);
 596             }
 597             ch = lex_getch(lex);
 598         }
 599
 600         if (ch == '/') {
 601             ch = lex_getch(lex);
 602             if (ch == '/')
 603             {
 604                 /* one line comment */
 605                 ch = lex_getch(lex);
 606
 607                 if (lex->flags.preprocessing) {
 608                     haswhite = true;
 609                     /*
 610                     lex_tokench(lex, '/');
 611                     lex_tokench(lex, '/');
 612                     */
 613                     lex_tokench(lex, ' ');
 614                     lex_tokench(lex, ' ');
 615                 }
 616
 617                 while (ch != FS_FILE_EOF && ch != '\n') {
 618                     if (lex->flags.preprocessing)
 619                         lex_tokench(lex, ' '); /* ch); */
 620                     ch = lex_getch(lex);
 621                 }
 622                 if (lex->flags.preprocessing) {
 623                     lex_ungetch(lex, '\n');
 624                     lex_endtoken(lex);
 625                     return TOKEN_WHITE;
 626                 }
 627                 continue;
 628             }
 629             if (ch == '*')
 630             {
 631                 /* multiline comment */
 632                 if (lex->flags.preprocessing) {
 633                     haswhite = true;
 634                     /*
 635                     lex_tokench(lex, '/');
 636                     lex_tokench(lex, '*');
 637                     */
 638                     lex_tokench(lex, ' ');
 639                     lex_tokench(lex, ' ');
 640                 }
 641
 642                 while (ch != FS_FILE_EOF)
 643                 {
 644                     ch = lex_getch(lex);
 645                     if (ch == '*') {
 646                         ch = lex_getch(lex);
 647                         if (ch == '/') {
 648                             if (lex->flags.preprocessing) {
 649                                 /*
 650                                 lex_tokench(lex, '*');
 651                                 lex_tokench(lex, '/');
 652                                 */
 653                                 lex_tokench(lex, ' ');
 654                                 lex_tokench(lex, ' ');
 655                             }
 656                             break;
 657                         }
 658                         lex_ungetch(lex, ch);
 659                     }
 660                     if (lex->flags.preprocessing) {
 661                         if (ch == '\n')
 662                             lex_tokench(lex, '\n');
 663                         else
 664                             lex_tokench(lex, ' '); /* ch); */
 665                     }
 666                 }
 667                 ch = ' '; /* cause TRUE in the isspace check */
 668                 continue;
 669             }
 670             /* Otherwise roll back to the slash and break out of the loop */
 671             lex_ungetch(lex, ch);
 672             ch = '/';
 673             break;
 674         }
 675     } while (ch != FS_FILE_EOF && util_isspace(ch));
 676
 677     if (haswhite) {
 678         lex_endtoken(lex);
 679         lex_ungetch(lex, ch);
 680         return TOKEN_WHITE;
 681     }
 682     return ch;
 683 }
 684
 685 /* Get a token */
 686 static bool GMQCC_WARN lex_finish_ident(lex_file *lex)
 687 {
 688     int ch;
 689
 690     ch = lex_getch(lex);
 691     while (ch != FS_FILE_EOF && isident(ch))
 692     {
 693         lex_tokench(lex, ch);
 694         ch = lex_getch(lex);
 695     }
 696
 697     /* last ch was not an ident ch: */
 698     lex_ungetch(lex, ch);
 699
 700     return true;
 701 }
 702
 703 /* read one ident for the frame list */
 704 static int lex_parse_frame(lex_file *lex)
 705 {
 706     int ch;
 707
 708     lex_token_new(lex);
 709
 710     ch = lex_getch(lex);
 711     while (ch != FS_FILE_EOF && ch != '\n' && util_isspace(ch))
 712         ch = lex_getch(lex);
 713
 714     if (ch == '\n')
 715         return 1;
 716
 717     if (!isident_start(ch)) {
 718         lexerror(lex, "invalid framename, must start with one of a-z or _, got %c", ch);
 719         return -1;
 720     }
 721
 722     lex_tokench(lex, ch);
 723     if (!lex_finish_ident(lex))
 724         return -1;
 725     lex_endtoken(lex);
 726     return 0;
 727 }
 728
 729 /* read a list of $frames */
 730 static bool lex_finish_frames(lex_file *lex)
 731 {
 732     do {
 733         size_t i;
 734         int    rc;
 735         frame_macro m;
 736
 737         rc = lex_parse_frame(lex);
 738         if (rc > 0) /* end of line */
 739             return true;
 740         if (rc < 0) /* error */
 741             return false;
 742
 743         for (i = 0; i < vec_size(lex->frames); ++i) {
 744             if (!strcmp(lex->tok.value, lex->frames[i].name)) {
 745                 lex->frames[i].value = lex->framevalue++;
 746                 if (lexwarn(lex, WARN_FRAME_MACROS, "duplicate frame macro defined: `%s`", lex->tok.value))
 747                     return false;
 748                 break;
 749             }
 750         }
 751         if (i < vec_size(lex->frames))
 752             continue;
 753
 754         m.value = lex->framevalue++;
 755         m.name = util_strdup(lex->tok.value);
 756         vec_shrinkto(lex->tok.value, 0);
 757         vec_push(lex->frames, m);
 758     } while (true);
 759
 760     return false;
 761 }
 762
 763 static int GMQCC_WARN lex_finish_string(lex_file *lex, int quote)
 764 {
 765     utf8ch_t chr = 0;
 766     int ch = 0;
 767     int nextch;
 768     bool hex;
 769     char u8buf[8]; /* way more than enough */
 770     int  u8len, uc;
 771
 772     while (ch != FS_FILE_EOF)
 773     {
 774         ch = lex_getch(lex);
 775         if (ch == quote)
 776             return TOKEN_STRINGCONST;
 777
 778         if (lex->flags.preprocessing && ch == '\\') {
 779             lex_tokench(lex, ch);
 780             ch = lex_getch(lex);
 781             if (ch == FS_FILE_EOF) {
 782                 lexerror(lex, "unexpected end of file");
 783                 lex_ungetch(lex, FS_FILE_EOF); /* next token to be TOKEN_EOF */
 784                 return (lex->tok.ttype = TOKEN_ERROR);
 785             }
 786             lex_tokench(lex, ch);
 787         }
 788         else if (ch == '\\') {
 789             ch = lex_getch(lex);
 790             if (ch == FS_FILE_EOF) {
 791                 lexerror(lex, "unexpected end of file");
 792                 lex_ungetch(lex, FS_FILE_EOF); /* next token to be TOKEN_EOF */
 793                 return (lex->tok.ttype = TOKEN_ERROR);
 794             }
 795
 796             switch (ch) {
 797             case '\\': break;
 798             case '\'': break;
 799             case '"':  break;
 800             case 'a':  ch = '\a'; break;
 801             case 'b':  ch = '\b'; break;
 802             case 'r':  ch = '\r'; break;
 803             case 'n':  ch = '\n'; break;
 804             case 't':  ch = '\t'; break;
 805             case 'f':  ch = '\f'; break;
 806             case 'v':  ch = '\v'; break;
 807             case 'x':
 808             case 'X':
 809                 /* same procedure as in fteqcc */
 810                 ch = 0;
 811                 nextch = lex_getch(lex);
 812                 if      (nextch >= '0' && nextch <= '9')
 813                     ch += nextch - '0';
 814                 else if (nextch >= 'a' && nextch <= 'f')
 815                     ch += nextch - 'a' + 10;
 816                 else if (nextch >= 'A' && nextch <= 'F')
 817                     ch += nextch - 'A' + 10;
 818                 else {
 819                     lexerror(lex, "bad character code");
 820                     lex_ungetch(lex, nextch);
 821                     return (lex->tok.ttype = TOKEN_ERROR);
 822                 }
 823
 824                 ch *= 0x10;
 825                 nextch = lex_getch(lex);
 826                 if      (nextch >= '0' && nextch <= '9')
 827                     ch += nextch - '0';
 828                 else if (nextch >= 'a' && nextch <= 'f')
 829                     ch += nextch - 'a' + 10;
 830                 else if (nextch >= 'A' && nextch <= 'F')
 831                     ch += nextch - 'A' + 10;
 832                 else {
 833                     lexerror(lex, "bad character code");
 834                     lex_ungetch(lex, nextch);
 835                     return (lex->tok.ttype = TOKEN_ERROR);
 836                 }
 837                 break;
 838
 839             /* fteqcc support */
 840             case '0': case '1': case '2': case '3':
 841             case '4': case '5': case '6': case '7':
 842             case '8': case '9':
 843                 ch = 18 + ch - '0';
 844                 break;
 845             case '<':  ch = 29; break;
 846             case '-':  ch = 30; break;
 847             case '>':  ch = 31; break;
 848             case '[':  ch = 16; break;
 849             case ']':  ch = 17; break;
 850             case '{':
 851                 chr = 0;
 852                 nextch = lex_getch(lex);
 853                 hex = (nextch == 'x');
 854                 if (!hex)
 855                     lex_ungetch(lex, nextch);
 856                 for (nextch = lex_getch(lex); nextch != '}'; nextch = lex_getch(lex)) {
 857                     if (!hex) {
 858                         if (nextch >= '0' && nextch <= '9')
 859                             chr = chr * 10 + nextch - '0';
 860                         else {
 861                             lexerror(lex, "bad character code");
 862                             return (lex->tok.ttype = TOKEN_ERROR);
 863                         }
 864                     } else {
 865                         if (nextch >= '0' && nextch <= '9')
 866                             chr = chr * 0x10 + nextch - '0';
 867                         else if (nextch >= 'a' && nextch <= 'f')
 868                             chr = chr * 0x10 + nextch - 'a' + 10;
 869                         else if (nextch >= 'A' && nextch <= 'F')
 870                             chr = chr * 0x10 + nextch - 'A' + 10;
 871                         else {
 872                             lexerror(lex, "bad character code");
 873                             return (lex->tok.ttype = TOKEN_ERROR);
 874                         }
 875                     }
 876                     if (chr > 0x10FFFF || (!OPTS_FLAG(UTF8) && chr > 255))
 877                     {
 878                         lexerror(lex, "character code out of range");
 879                         return (lex->tok.ttype = TOKEN_ERROR);
 880                     }
 881                 }
 882                 if (OPTS_FLAG(UTF8) && chr >= 128) {
 883                     u8len = utf8_from(u8buf, chr);
 884                     if (!u8len)
 885                         ch = 0;
 886                     else {
 887                         --u8len;
 888                         lex->column += u8len;
 889                         for (uc = 0; uc < u8len; ++uc)
 890                             lex_tokench(lex, u8buf[uc]);
 891                         /*
 892                          * the last character will be inserted with the tokench() call
 893                          * below the switch
 894                          */
 895                         ch = u8buf[uc];
 896                     }
 897                 }
 898                 else
 899                     ch = chr;
 900                 break;
 901             case '\n':  ch = '\n'; break;
 902
 903             default:
 904                 lexwarn(lex, WARN_UNKNOWN_CONTROL_SEQUENCE, "unrecognized control sequence: \\%c", ch);
 905                 /* so we just add the character plus backslash no matter what it actually is */
 906                 lex_tokench(lex, '\\');
 907             }
 908             /* add the character finally */
 909             lex_tokench(lex, ch);
 910         }
 911         else
 912             lex_tokench(lex, ch);
 913     }
 914     lexerror(lex, "unexpected end of file within string constant");
 915     lex_ungetch(lex, FS_FILE_EOF); /* next token to be TOKEN_EOF */
 916     return (lex->tok.ttype = TOKEN_ERROR);
 917 }
 918
 919 static int GMQCC_WARN lex_finish_digit(lex_file *lex, int lastch)
 920 {
 921     bool ishex = false;
 922
 923     int  ch = lastch;
 924
 925     /* parse a number... */
 926     if (ch == '.')
 927         lex->tok.ttype = TOKEN_FLOATCONST;
 928     else
 929         lex->tok.ttype = TOKEN_INTCONST;
 930
 931     lex_tokench(lex, ch);
 932
 933     ch = lex_getch(lex);
 934     if (ch != '.' && !util_isdigit(ch))
 935     {
 936         if (lastch != '0' || ch != 'x')
 937         {
 938             /* end of the number or EOF */
 939             lex_ungetch(lex, ch);
 940             lex_endtoken(lex);
 941
 942             lex->tok.constval.i = lastch - '0';
 943             return lex->tok.ttype;
 944         }
 945
 946         ishex = true;
 947     }
 948
 949     /* EOF would have been caught above */
 950
 951     if (ch != '.')
 952     {
 953         lex_tokench(lex, ch);
 954         ch = lex_getch(lex);
 955         while (util_isdigit(ch) || (ishex && isxdigit_only(ch)))
 956         {
 957             lex_tokench(lex, ch);
 958             ch = lex_getch(lex);
 959         }
 960     }
 961     /* NOT else, '.' can come from above as well */
 962     if (lex->tok.ttype != TOKEN_FLOATCONST && ch == '.' && !ishex)
 963     {
 964         /* Allow floating comma in non-hex mode */
 965         lex->tok.ttype = TOKEN_FLOATCONST;
 966         lex_tokench(lex, ch);
 967
 968         /* continue digits-only */
 969         ch = lex_getch(lex);
 970         while (util_isdigit(ch))
 971         {
 972             lex_tokench(lex, ch);
 973             ch = lex_getch(lex);
 974         }
 975     }
 976     /* put back the last character */
 977     /* but do not put back the trailing 'f' or a float */
 978     if (lex->tok.ttype == TOKEN_FLOATCONST && ch == 'f')
 979         ch = lex_getch(lex);
 980
 981     /* generally we don't want words to follow numbers: */
 982     if (isident(ch)) {
 983         lexerror(lex, "unexpected trailing characters after number");
 984         return (lex->tok.ttype = TOKEN_ERROR);
 985     }
 986     lex_ungetch(lex, ch);
 987
 988     lex_endtoken(lex);
 989     if (lex->tok.ttype == TOKEN_FLOATCONST)
 990         lex->tok.constval.f = strtod(lex->tok.value, NULL);
 991     else
 992         lex->tok.constval.i = strtol(lex->tok.value, NULL, 0);
 993     return lex->tok.ttype;
 994 }
 995
 996 int lex_do(lex_file *lex)
 997 {
 998     int ch, nextch, thirdch;
 999     bool hadwhite = false;
1000
1001     lex_token_new(lex);
1002 #if 0
1003     if (!lex->tok)
1004         return TOKEN_FATAL;
1005 #endif
1006
1007     while (true) {
1008         ch = lex_skipwhite(lex, hadwhite);
1009         hadwhite = true;
1010         if (!lex->flags.mergelines || ch != '\\')
1011             break;
1012         ch = lex_getch(lex);
1013         if (ch == '\r')
1014             ch = lex_getch(lex);
1015         if (ch != '\n') {
1016             lex_ungetch(lex, ch);
1017             ch = '\\';
1018             break;
1019         }
1020         /* we reached a linemerge */
1021         lex_tokench(lex, '\n');
1022         continue;
1023     }
1024
1025     if (lex->flags.preprocessing && (ch == TOKEN_WHITE || ch == TOKEN_EOL || ch == TOKEN_FATAL)) {
1026         return (lex->tok.ttype = ch);
1027     }
1028
1029     lex->sline = lex->line;
1030     lex->tok.ctx.line = lex->sline;
1031     lex->tok.ctx.file = lex->name;
1032
1033     if (lex->eof)
1034         return (lex->tok.ttype = TOKEN_FATAL);
1035
1036     if (ch == FS_FILE_EOF) {
1037         lex->eof = true;
1038         return (lex->tok.ttype = TOKEN_EOF);
1039     }
1040
1041     /* modelgen / spiritgen commands */
1042     if (ch == '$' && !lex->flags.preprocessing) {
1043         const char *v;
1044         size_t frame;
1045
1046         ch = lex_getch(lex);
1047         if (!isident_start(ch)) {
1048             lexerror(lex, "hanging '$' modelgen/spritegen command line");
1049             return lex_do(lex);
1050         }
1051         lex_tokench(lex, ch);
1052         if (!lex_finish_ident(lex))
1053             return (lex->tok.ttype = TOKEN_ERROR);
1054         lex_endtoken(lex);
1055         /* skip the known commands */
1056         v = lex->tok.value;
1057
1058         if (!strcmp(v, "frame") || !strcmp(v, "framesave"))
1059         {
1060             /* frame/framesave command works like an enum
1061              * similar to fteqcc we handle this in the lexer.
1062              * The reason for this is that it is sensitive to newlines,
1063              * which the parser is unaware of
1064              */
1065             if (!lex_finish_frames(lex))
1066                  return (lex->tok.ttype = TOKEN_ERROR);
1067             return lex_do(lex);
1068         }
1069
1070         if (!strcmp(v, "framevalue"))
1071         {
1072             ch = lex_getch(lex);
1073             while (ch != FS_FILE_EOF && util_isspace(ch) && ch != '\n')
1074                 ch = lex_getch(lex);
1075
1076             if (!util_isdigit(ch)) {
1077                 lexerror(lex, "$framevalue requires an integer parameter");
1078                 return lex_do(lex);
1079             }
1080
1081             lex_token_new(lex);
1082             lex->tok.ttype = lex_finish_digit(lex, ch);
1083             lex_endtoken(lex);
1084             if (lex->tok.ttype != TOKEN_INTCONST) {
1085                 lexerror(lex, "$framevalue requires an integer parameter");
1086                 return lex_do(lex);
1087             }
1088             lex->framevalue = lex->tok.constval.i;
1089             return lex_do(lex);
1090         }
1091
1092         if (!strcmp(v, "framerestore"))
1093         {
1094             int rc;
1095
1096             lex_token_new(lex);
1097
1098             rc = lex_parse_frame(lex);
1099
1100             if (rc > 0) {
1101                 lexerror(lex, "$framerestore requires a framename parameter");
1102                 return lex_do(lex);
1103             }
1104             if (rc < 0)
1105                 return (lex->tok.ttype = TOKEN_FATAL);
1106
1107             v = lex->tok.value;
1108             for (frame = 0; frame < vec_size(lex->frames); ++frame) {
1109                 if (!strcmp(v, lex->frames[frame].name)) {
1110                     lex->framevalue = lex->frames[frame].value;
1111                     return lex_do(lex);
1112                 }
1113             }
1114             lexerror(lex, "unknown framename `%s`", v);
1115             return lex_do(lex);
1116         }
1117
1118         if (!strcmp(v, "modelname"))
1119         {
1120             int rc;
1121
1122             lex_token_new(lex);
1123
1124             rc = lex_parse_frame(lex);
1125
1126             if (rc > 0) {
1127                 lexerror(lex, "$modelname requires a parameter");
1128                 return lex_do(lex);
1129             }
1130             if (rc < 0)
1131                 return (lex->tok.ttype = TOKEN_FATAL);
1132
1133             if (lex->modelname) {
1134                 frame_macro m;
1135                 m.value = lex->framevalue;
1136                 m.name = lex->modelname;
1137                 lex->modelname = NULL;
1138                 vec_push(lex->frames, m);
1139             }
1140             lex->modelname = lex->tok.value;
1141             lex->tok.value = NULL;
1142             return lex_do(lex);
1143         }
1144
1145         if (!strcmp(v, "flush"))
1146         {
1147             size_t fi;
1148             for (fi = 0; fi < vec_size(lex->frames); ++fi)
1149                 mem_d(lex->frames[fi].name);
1150             vec_free(lex->frames);
1151             /* skip line (fteqcc does it too) */
1152             ch = lex_getch(lex);
1153             while (ch != FS_FILE_EOF && ch != '\n')
1154                 ch = lex_getch(lex);
1155             return lex_do(lex);
1156         }
1157
1158         if (!strcmp(v, "cd") ||
1159             !strcmp(v, "origin") ||
1160             !strcmp(v, "base") ||
1161             !strcmp(v, "flags") ||
1162             !strcmp(v, "scale") ||
1163             !strcmp(v, "skin"))
1164         {
1165             /* skip line */
1166             ch = lex_getch(lex);
1167             while (ch != FS_FILE_EOF && ch != '\n')
1168                 ch = lex_getch(lex);
1169             return lex_do(lex);
1170         }
1171
1172         for (frame = 0; frame < vec_size(lex->frames); ++frame) {
1173             if (!strcmp(v, lex->frames[frame].name)) {
1174                 lex->tok.constval.i = lex->frames[frame].value;
1175                 return (lex->tok.ttype = TOKEN_INTCONST);
1176             }
1177         }
1178
1179         lexerror(lex, "invalid frame macro");
1180         return lex_do(lex);
1181     }
1182
1183     /* single-character tokens */
1184     switch (ch)
1185     {
1186         case '[':
1187             nextch = lex_getch(lex);
1188             if (nextch == '[') {
1189                 lex_tokench(lex, ch);
1190                 lex_tokench(lex, nextch);
1191                 lex_endtoken(lex);
1192                 return (lex->tok.ttype = TOKEN_ATTRIBUTE_OPEN);
1193             }
1194             lex_ungetch(lex, nextch);
1195             /* FALL THROUGH */
1196         case '(':
1197         case ':':
1198         case '?':
1199             lex_tokench(lex, ch);
1200             lex_endtoken(lex);
1201             if (lex->flags.noops)
1202                 return (lex->tok.ttype = ch);
1203             else
1204                 return (lex->tok.ttype = TOKEN_OPERATOR);
1205
1206         case ']':
1207             if (lex->flags.noops) {
1208                 nextch = lex_getch(lex);
1209                 if (nextch == ']') {
1210                     lex_tokench(lex, ch);
1211                     lex_tokench(lex, nextch);
1212                     lex_endtoken(lex);
1213                     return (lex->tok.ttype = TOKEN_ATTRIBUTE_CLOSE);
1214                 }
1215                 lex_ungetch(lex, nextch);
1216             }
1217             /* FALL THROUGH */
1218         case ')':
1219         case ';':
1220         case '{':
1221         case '}':
1222
1223         case '#':
1224             lex_tokench(lex, ch);
1225             lex_endtoken(lex);
1226             return (lex->tok.ttype = ch);
1227         default:
1228             break;
1229     }
1230
1231     if (ch == '.') {
1232         nextch = lex_getch(lex);
1233         /* digits starting with a dot */
1234         if (util_isdigit(nextch)) {
1235             lex_ungetch(lex, nextch);
1236             lex->tok.ttype = lex_finish_digit(lex, ch);
1237             lex_endtoken(lex);
1238             return lex->tok.ttype;
1239         }
1240         lex_ungetch(lex, nextch);
1241     }
1242
1243     if (lex->flags.noops)
1244     {
1245         /* Detect characters early which are normally
1246          * operators OR PART of an operator.
1247          */
1248         switch (ch)
1249         {
1250             /*
1251             case '+':
1252             case '-':
1253             */
1254             case '*':
1255             case '/':
1256             case '<':
1257             case '>':
1258             case '=':
1259             case '&':
1260             case '|':
1261             case '^':
1262             case '~':
1263             case ',':
1264             case '!':
1265                 lex_tokench(lex, ch);
1266                 lex_endtoken(lex);
1267                 return (lex->tok.ttype = ch);
1268             default:
1269                 break;
1270         }
1271     }
1272
1273     if (ch == '.')
1274     {
1275         lex_tokench(lex, ch);
1276         /* peak ahead once */
1277         nextch = lex_getch(lex);
1278         if (nextch != '.') {
1279             lex_ungetch(lex, nextch);
1280             lex_endtoken(lex);
1281             if (lex->flags.noops)
1282                 return (lex->tok.ttype = ch);
1283             else
1284                 return (lex->tok.ttype = TOKEN_OPERATOR);
1285         }
1286         /* peak ahead again */
1287         nextch = lex_getch(lex);
1288         if (nextch != '.') {
1289             lex_ungetch(lex, nextch);
1290             lex_ungetch(lex, '.');
1291             lex_endtoken(lex);
1292             if (lex->flags.noops)
1293                 return (lex->tok.ttype = ch);
1294             else
1295                 return (lex->tok.ttype = TOKEN_OPERATOR);
1296         }
1297         /* fill the token to be "..." */
1298         lex_tokench(lex, ch);
1299         lex_tokench(lex, ch);
1300         lex_endtoken(lex);
1301         return (lex->tok.ttype = TOKEN_DOTS);
1302     }
1303
1304     if (ch == ',' || ch == '.') {
1305         lex_tokench(lex, ch);
1306         lex_endtoken(lex);
1307         return (lex->tok.ttype = TOKEN_OPERATOR);
1308     }
1309
1310     if (ch == '+' || ch == '-' || /* ++, --, +=, -=  and -> as well! */
1311         ch == '>' || ch == '<' || /* <<, >>, <=, >=  and >< as well! */
1312         ch == '=' || ch == '!' || /* <=>, ==, !=                     */
1313         ch == '&' || ch == '|' || /* &&, ||, &=, |=                  */
1314         ch == '~' || ch == '^'    /* ~=, ~, ^                        */
1315     )  {
1316         lex_tokench(lex, ch);
1317
1318         nextch = lex_getch(lex);
1319         if ((nextch == '=' && ch != '<') ||
1320             (nextch == ch  && ch != '!') ||
1321             (nextch == '<' && ch == '>')) {
1322             lex_tokench(lex, nextch);
1323         } else if (ch == '<' && nextch == '=') {
1324             lex_tokench(lex, nextch);
1325             if ((thirdch = lex_getch(lex)) == '>')
1326                 lex_tokench(lex, thirdch);
1327             else
1328                 lex_ungetch(lex, thirdch);
1329
1330         } else if (ch == '-' && nextch == '>') {
1331             lex_tokench(lex, nextch);
1332         } else if (ch == '&' && nextch == '~') {
1333             thirdch = lex_getch(lex);
1334             if (thirdch != '=') {
1335                 lex_ungetch(lex, thirdch);
1336                 lex_ungetch(lex, nextch);
1337             }
1338             else {
1339                 lex_tokench(lex, nextch);
1340                 lex_tokench(lex, thirdch);
1341             }
1342         }
1343         else if (lex->flags.preprocessing &&
1344                  ch == '-' && util_isdigit(nextch))
1345         {
1346             lex->tok.ttype = lex_finish_digit(lex, nextch);
1347             if (lex->tok.ttype == TOKEN_INTCONST)
1348                 lex->tok.constval.i = -lex->tok.constval.i;
1349             else
1350                 lex->tok.constval.f = -lex->tok.constval.f;
1351             lex_endtoken(lex);
1352             return lex->tok.ttype;
1353         } else {
1354             lex_ungetch(lex, nextch);
1355         }
1356
1357         lex_endtoken(lex);
1358         return (lex->tok.ttype = TOKEN_OPERATOR);
1359     }
1360
1361     /*
1362     if (ch == '^' || ch == '~' || ch == '!')
1363     {
1364         lex_tokench(lex, ch);
1365         lex_endtoken(lex);
1366         return (lex->tok.ttype = TOKEN_OPERATOR);
1367     }
1368     */
1369
1370     if (ch == '*' || ch == '/') /* *=, /= */
1371     {
1372         lex_tokench(lex, ch);
1373
1374         nextch = lex_getch(lex);
1375         if (nextch == '=' || nextch == '*') {
1376             lex_tokench(lex, nextch);
1377         } else
1378             lex_ungetch(lex, nextch);
1379
1380         lex_endtoken(lex);
1381         return (lex->tok.ttype = TOKEN_OPERATOR);
1382     }
1383
1384     if (ch == '%') {
1385         lex_tokench(lex, ch);
1386         lex_endtoken(lex);
1387         return (lex->tok.ttype = TOKEN_OPERATOR);
1388     }
1389
1390     if (isident_start(ch))
1391     {
1392         const char *v;
1393
1394         lex_tokench(lex, ch);
1395         if (!lex_finish_ident(lex)) {
1396             /* error? */
1397             return (lex->tok.ttype = TOKEN_ERROR);
1398         }
1399         lex_endtoken(lex);
1400         lex->tok.ttype = TOKEN_IDENT;
1401
1402         v = lex->tok.value;
1403         if (!strcmp(v, "void")) {
1404             lex->tok.ttype = TOKEN_TYPENAME;
1405             lex->tok.constval.t = TYPE_VOID;
1406         } else if (!strcmp(v, "int")) {
1407             lex->tok.ttype = TOKEN_TYPENAME;
1408             lex->tok.constval.t = TYPE_INTEGER;
1409         } else if (!strcmp(v, "float")) {
1410             lex->tok.ttype = TOKEN_TYPENAME;
1411             lex->tok.constval.t = TYPE_FLOAT;
1412         } else if (!strcmp(v, "string")) {
1413             lex->tok.ttype = TOKEN_TYPENAME;
1414             lex->tok.constval.t = TYPE_STRING;
1415         } else if (!strcmp(v, "entity")) {
1416             lex->tok.ttype = TOKEN_TYPENAME;
1417             lex->tok.constval.t = TYPE_ENTITY;
1418         } else if (!strcmp(v, "vector")) {
1419             lex->tok.ttype = TOKEN_TYPENAME;
1420             lex->tok.constval.t = TYPE_VECTOR;
1421         } else {
1422             size_t kw;
1423             for (kw = 0; kw < GMQCC_ARRAY_COUNT(keywords_qc); ++kw) {
1424                 if (!strcmp(v, keywords_qc[kw]))
1425                     return (lex->tok.ttype = TOKEN_KEYWORD);
1426             }
1427             if (OPTS_OPTION_U32(OPTION_STANDARD) != COMPILER_QCC) {
1428                 for (kw = 0; kw < GMQCC_ARRAY_COUNT(keywords_fg); ++kw) {
1429                     if (!strcmp(v, keywords_fg[kw]))
1430                         return (lex->tok.ttype = TOKEN_KEYWORD);
1431                 }
1432             }
1433         }
1434
1435         return lex->tok.ttype;
1436     }
1437
1438     if (ch == '"')
1439     {
1440         lex->flags.nodigraphs = true;
1441         if (lex->flags.preprocessing)
1442             lex_tokench(lex, ch);
1443         lex->tok.ttype = lex_finish_string(lex, '"');
1444         if (lex->flags.preprocessing)
1445             lex_tokench(lex, ch);
1446         while (!lex->flags.preprocessing && lex->tok.ttype == TOKEN_STRINGCONST)
1447         {
1448             /* Allow c style "string" "continuation" */
1449             ch = lex_skipwhite(lex, false);
1450             if (ch != '"') {
1451                 lex_ungetch(lex, ch);
1452                 break;
1453             }
1454
1455             lex->tok.ttype = lex_finish_string(lex, '"');
1456         }
1457         lex->flags.nodigraphs = false;
1458         lex_endtoken(lex);
1459         return lex->tok.ttype;
1460     }
1461
1462     if (ch == '\'')
1463     {
1464         /* we parse character constants like string,
1465          * but return TOKEN_CHARCONST, or a vector type if it fits...
1466          * Likewise actual unescaping has to be done by the parser.
1467          * The difference is we don't allow 'char' 'continuation'.
1468          */
1469         if (lex->flags.preprocessing)
1470             lex_tokench(lex, ch);
1471         lex->tok.ttype = lex_finish_string(lex, '\'');
1472         if (lex->flags.preprocessing)
1473             lex_tokench(lex, ch);
1474         lex_endtoken(lex);
1475
1476         lex->tok.ttype = TOKEN_CHARCONST;
1477
1478         /* It's a vector if we can successfully scan 3 floats */
1479         if (util_sscanf(lex->tok.value, " %f %f %f ",
1480                    &lex->tok.constval.v.x, &lex->tok.constval.v.y, &lex->tok.constval.v.z) == 3)
1481
1482         {
1483              lex->tok.ttype = TOKEN_VECTORCONST;
1484         }
1485         else
1486         {
1487             if (!lex->flags.preprocessing && strlen(lex->tok.value) > 1) {
1488                 utf8ch_t u8char;
1489                 /* check for a valid utf8 character */
1490                 if (!OPTS_FLAG(UTF8) || !utf8_to(&u8char, (const unsigned char *)lex->tok.value, 8)) {
1491                     if (lexwarn(lex, WARN_MULTIBYTE_CHARACTER,
1492                                 ( OPTS_FLAG(UTF8) ? "invalid multibyte character sequence `%s`"
1493                                                   : "multibyte character: `%s`" ),
1494                                 lex->tok.value))
1495                         return (lex->tok.ttype = TOKEN_ERROR);
1496                 }
1497                 else
1498                     lex->tok.constval.i = u8char;
1499             }
1500             else
1501                 lex->tok.constval.i = lex->tok.value[0];
1502         }
1503
1504         return lex->tok.ttype;
1505     }
1506
1507     if (util_isdigit(ch))
1508     {
1509         lex->tok.ttype = lex_finish_digit(lex, ch);
1510         lex_endtoken(lex);
1511         return lex->tok.ttype;
1512     }
1513
1514     if (lex->flags.preprocessing) {
1515         lex_tokench(lex, ch);
1516         lex_endtoken(lex);
1517         return (lex->tok.ttype = ch);
1518     }
1519
1520     lexerror(lex, "unknown token: `%c`", ch);
1521     return (lex->tok.ttype = TOKEN_ERROR);
1522 }