lexer.c

   1 /*
   2  * Copyright (C) 2012, 2013
   3  *     Wolfgang Bumiller
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a copy of
   6  * this software and associated documentation files (the "Software"), to deal in
   7  * the Software without restriction, including without limitation the rights to
   8  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
   9  * of the Software, and to permit persons to whom the Software is furnished to do
  10  * so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be included in all
  13  * copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23 #include <string.h>
  24 #include <stdlib.h>
  25
  26 #include "gmqcc.h"
  27 #include "lexer.h"
  28
  29 /*
  30  * List of Keywords
  31  */
  32
  33 /* original */
  34 static const char *keywords_qc[] = {
  35     "for", "do", "while",
  36     "if", "else",
  37     "local",
  38     "return",
  39     "const"
  40 };
/* For fte/gmqcc */
  42 static const char *keywords_fg[] = {
  43     "switch", "case", "default",
  44     "struct", "union",
  45     "break", "continue",
  46     "typedef",
  47     "goto",
  48
  49     "__builtin_debug_printtype"
  50 };
  51
  52 /*
  53  * Lexer code
  54  */
  55 static char* *lex_filenames;
  56
  57 static void lexerror(lex_file *lex, const char *fmt, ...)
  58 {
  59     va_list ap;
  60
  61     va_start(ap, fmt);
  62     if (lex)
  63         con_vprintmsg(LVL_ERROR, lex->name, lex->sline, lex->column, "parse error", fmt, ap);
  64     else
  65         con_vprintmsg(LVL_ERROR, "", 0, 0, "parse error", fmt, ap);
  66     va_end(ap);
  67 }
  68
  69 static bool lexwarn(lex_file *lex, int warntype, const char *fmt, ...)
  70 {
  71     bool      r;
  72     lex_ctx_t ctx;
  73     va_list   ap;
  74
  75     ctx.file   = lex->name;
  76     ctx.line   = lex->sline;
  77     ctx.column = lex->column;
  78
  79     va_start(ap, fmt);
  80     r = vcompile_warning(ctx, warntype, fmt, ap);
  81     va_end(ap);
  82     return r;
  83 }
  84
  85
  86 #if 0
  87 token* token_new()
  88 {
  89     token *tok = (token*)mem_a(sizeof(token));
  90     if (!tok)
  91         return NULL;
  92     memset(tok, 0, sizeof(*tok));
  93     return tok;
  94 }
  95
  96 void token_delete(token *self)
  97 {
  98     if (self->next && self->next->prev == self)
  99         self->next->prev = self->prev;
 100     if (self->prev && self->prev->next == self)
 101         self->prev->next = self->next;
 102     MEM_VECTOR_CLEAR(self, value);
 103     mem_d(self);
 104 }
 105
 106 token* token_copy(const token *cp)
 107 {
 108     token* self = token_new();
 109     if (!self)
 110         return NULL;
 111     /* copy the value */
 112     self->value_alloc = cp->value_count + 1;
 113     self->value_count = cp->value_count;
 114     self->value = (char*)mem_a(self->value_alloc);
 115     if (!self->value) {
 116         mem_d(self);
 117         return NULL;
 118     }
 119     memcpy(self->value, cp->value, cp->value_count);
 120     self->value[self->value_alloc-1] = 0;
 121
 122     /* rest */
 123     self->ctx = cp->ctx;
 124     self->ttype = cp->ttype;
 125     memcpy(&self->constval, &cp->constval, sizeof(self->constval));
 126     return self;
 127 }
 128
 129 void token_delete_all(token *t)
 130 {
 131     token *n;
 132
 133     do {
 134         n = t->next;
 135         token_delete(t);
 136         t = n;
 137     } while(t);
 138 }
 139
 140 token* token_copy_all(const token *cp)
 141 {
 142     token *cur;
 143     token *out;
 144
 145     out = cur = token_copy(cp);
 146     if (!out)
 147         return NULL;
 148
 149     while (cp->next) {
 150         cp = cp->next;
 151         cur->next = token_copy(cp);
 152         if (!cur->next) {
 153             token_delete_all(out);
 154             return NULL;
 155         }
 156         cur->next->prev = cur;
 157         cur = cur->next;
 158     }
 159
 160     return out;
 161 }
 162 #else
 163 static void lex_token_new(lex_file *lex)
 164 {
 165 #if 0
 166     if (lex->tok)
 167         token_delete(lex->tok);
 168     lex->tok = token_new();
 169 #else
 170     if (lex->tok.value)
 171         vec_shrinkto(lex->tok.value, 0);
 172
 173     lex->tok.constval.t  = 0;
 174     lex->tok.ctx.line    = lex->sline;
 175     lex->tok.ctx.file    = lex->name;
 176     lex->tok.ctx.column  = lex->column;
 177 #endif
 178 }
 179 #endif
 180
 181 lex_file* lex_open(const char *file)
 182 {
 183     lex_file  *lex;
 184     fs_file_t *in = fs_file_open(file, "rb");
 185
 186     if (!in) {
 187         lexerror(NULL, "open failed: '%s'\n", file);
 188         return NULL;
 189     }
 190
 191     lex = (lex_file*)mem_a(sizeof(*lex));
 192     if (!lex) {
 193         fs_file_close(in);
 194         lexerror(NULL, "out of memory\n");
 195         return NULL;
 196     }
 197
 198     memset(lex, 0, sizeof(*lex));
 199
 200     lex->file    = in;
 201     lex->name    = util_strdup(file);
 202     lex->line    = 1; /* we start counting at 1 */
 203     lex->column  = 0;
 204     lex->peekpos = 0;
 205     lex->eof     = false;
 206
 207     vec_push(lex_filenames, lex->name);
 208     return lex;
 209 }
 210
 211 lex_file* lex_open_string(const char *str, size_t len, const char *name)
 212 {
 213     lex_file *lex;
 214
 215     lex = (lex_file*)mem_a(sizeof(*lex));
 216     if (!lex) {
 217         lexerror(NULL, "out of memory\n");
 218         return NULL;
 219     }
 220
 221     memset(lex, 0, sizeof(*lex));
 222
 223     lex->file = NULL;
 224     lex->open_string        = str;
 225     lex->open_string_length = len;
 226     lex->open_string_pos    = 0;
 227
 228     lex->name    = util_strdup(name ? name : "<string-source>");
 229     lex->line    = 1; /* we start counting at 1 */
 230     lex->peekpos = 0;
 231     lex->eof     = false;
 232     lex->column  = 0;
 233
 234     vec_push(lex_filenames, lex->name);
 235
 236     return lex;
 237 }
 238
 239 void lex_cleanup(void)
 240 {
 241     size_t i;
 242     for (i = 0; i < vec_size(lex_filenames); ++i)
 243         mem_d(lex_filenames[i]);
 244     vec_free(lex_filenames);
 245 }
 246
 247 void lex_close(lex_file *lex)
 248 {
 249     size_t i;
 250     for (i = 0; i < vec_size(lex->frames); ++i)
 251         mem_d(lex->frames[i].name);
 252     vec_free(lex->frames);
 253
 254     if (lex->modelname)
 255         vec_free(lex->modelname);
 256
 257     if (lex->file)
 258         fs_file_close(lex->file);
 259 #if 0
 260     if (lex->tok)
 261         token_delete(lex->tok);
 262 #else
 263     vec_free(lex->tok.value);
 264 #endif
 265     /* mem_d(lex->name); collected in lex_filenames */
 266     mem_d(lex);
 267 }
 268
 269 static int lex_fgetc(lex_file *lex)
 270 {
 271     if (lex->file) {
 272         lex->column++;
 273         return fs_file_getc(lex->file);
 274     }
 275     if (lex->open_string) {
 276         if (lex->open_string_pos >= lex->open_string_length)
 277             return FS_FILE_EOF;
 278         lex->column++;
 279         return lex->open_string[lex->open_string_pos++];
 280     }
 281     return FS_FILE_EOF;
 282 }
 283
 284 /* Get or put-back data
 * The following two functions do NOT understand what kind of data they
 * are working on.
 * They are merely wrapping get/put in order to count line numbers.
 288  */
 289 static void lex_ungetch(lex_file *lex, int ch);
 290 static int lex_try_trigraph(lex_file *lex, int old)
 291 {
 292     int c2, c3;
 293     c2 = lex_fgetc(lex);
 294     if (!lex->push_line && c2 == '\n') {
 295         lex->line++;
 296         lex->column = 0;
 297     }
 298
 299     if (c2 != '?') {
 300         lex_ungetch(lex, c2);
 301         return old;
 302     }
 303
 304     c3 = lex_fgetc(lex);
 305     if (!lex->push_line && c3 == '\n') {
 306         lex->line++;
 307         lex->column = 0;
 308     }
 309
 310     switch (c3) {
 311         case '=': return '#';
 312         case '/': return '\\';
 313         case '\'': return '^';
 314         case '(': return '[';
 315         case ')': return ']';
 316         case '!': return '|';
 317         case '<': return '{';
 318         case '>': return '}';
 319         case '-': return '~';
 320         default:
 321             lex_ungetch(lex, c3);
 322             lex_ungetch(lex, c2);
 323             return old;
 324     }
 325 }
 326
 327 static int lex_try_digraph(lex_file *lex, int ch)
 328 {
 329     int c2;
 330     c2 = lex_fgetc(lex);
 331     /* we just used fgetc() so count lines
 332      * need to offset a \n the ungetch would recognize
 333      */
 334     if (!lex->push_line && c2 == '\n')
 335         lex->line++;
 336     if      (ch == '<' && c2 == ':')
 337         return '[';
 338     else if (ch == ':' && c2 == '>')
 339         return ']';
 340     else if (ch == '<' && c2 == '%')
 341         return '{';
 342     else if (ch == '%' && c2 == '>')
 343         return '}';
 344     else if (ch == '%' && c2 == ':')
 345         return '#';
 346     lex_ungetch(lex, c2);
 347     return ch;
 348 }
 349
 350 static int lex_getch(lex_file *lex)
 351 {
 352     int ch;
 353
 354     if (lex->peekpos) {
 355         lex->peekpos--;
 356         if (!lex->push_line && lex->peek[lex->peekpos] == '\n') {
 357             lex->line++;
 358             lex->column = 0;
 359         }
 360         return lex->peek[lex->peekpos];
 361     }
 362
 363     ch = lex_fgetc(lex);
 364     if (!lex->push_line && ch == '\n') {
 365         lex->line++;
 366         lex->column = 0;
 367     }
 368     else if (ch == '?')
 369         return lex_try_trigraph(lex, ch);
 370     else if (!lex->flags.nodigraphs && (ch == '<' || ch == ':' || ch == '%'))
 371         return lex_try_digraph(lex, ch);
 372     return ch;
 373 }
 374
 375 static void lex_ungetch(lex_file *lex, int ch)
 376 {
 377     lex->peek[lex->peekpos++] = ch;
 378     lex->column--;
 379     if (!lex->push_line && ch == '\n') {
 380         lex->line--;
 381         lex->column = 0;
 382     }
 383 }
 384
 385 /* classify characters
 386  * some additions to the is*() functions of ctype.h
 387  */
 388
 389 /* Idents are alphanumberic, but they start with alpha or _ */
 390 static bool isident_start(int ch)
 391 {
 392     return util_isalpha(ch) || ch == '_';
 393 }
 394
 395 static bool isident(int ch)
 396 {
 397     return isident_start(ch) || util_isdigit(ch);
 398 }
 399
 400 /* isxdigit_only is used when we already know it's not a digit
 401  * and want to see if it's a hex digit anyway.
 402  */
 403 static bool isxdigit_only(int ch)
 404 {
 405     return (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
 406 }
 407
/* Append the character `ch` to the value buffer of the token currently
 * being built (lex->tok.value).
 */
static void lex_tokench(lex_file *lex, int ch)
{
    vec_push(lex->tok.value, ch);
}
 413
/* Terminate the token value with a null-byte so it can be used as a C
 * string: a 0 is pushed and the vector is then shrunk by one element again.
 * NOTE(review): this relies on vec_shrinkby() only lowering the element
 * count and leaving the stored 0 in place - confirm against the vec macros.
 */
static void lex_endtoken(lex_file *lex)
{
    vec_push(lex->tok.value, 0);
    vec_shrinkby(lex->tok.value, 1);
}
 420
 421 static bool lex_try_pragma(lex_file *lex)
 422 {
 423     int ch;
 424     char *pragma  = NULL;
 425     char *command = NULL;
 426     char *param   = NULL;
 427     size_t line;
 428
 429     if (lex->flags.preprocessing)
 430         return false;
 431
 432     line = lex->line;
 433
 434     ch = lex_getch(lex);
 435     if (ch != '#') {
 436         lex_ungetch(lex, ch);
 437         return false;
 438     }
 439
 440     for (ch = lex_getch(lex); vec_size(pragma) < 8 && ch >= 'a' && ch <= 'z'; ch = lex_getch(lex))
 441         vec_push(pragma, ch);
 442     vec_push(pragma, 0);
 443
 444     if (ch != ' ' || strcmp(pragma, "pragma")) {
 445         lex_ungetch(lex, ch);
 446         goto unroll;
 447     }
 448
 449     for (ch = lex_getch(lex); vec_size(command) < 32 && ch >= 'a' && ch <= 'z'; ch = lex_getch(lex))
 450         vec_push(command, ch);
 451     vec_push(command, 0);
 452
 453     if (ch != '(') {
 454         lex_ungetch(lex, ch);
 455         goto unroll;
 456     }
 457
 458     for (ch = lex_getch(lex); vec_size(param) < 1024 && ch != ')' && ch != '\n'; ch = lex_getch(lex))
 459         vec_push(param, ch);
 460     vec_push(param, 0);
 461
 462     if (ch != ')') {
 463         lex_ungetch(lex, ch);
 464         goto unroll;
 465     }
 466
 467     if (!strcmp(command, "push")) {
 468         if (!strcmp(param, "line")) {
 469             lex->push_line++;
 470             if (lex->push_line == 1)
 471                 --line;
 472         }
 473         else
 474             goto unroll;
 475     }
 476     else if (!strcmp(command, "pop")) {
 477         if (!strcmp(param, "line")) {
 478             if (lex->push_line)
 479                 lex->push_line--;
 480             if (lex->push_line == 0)
 481                 --line;
 482         }
 483         else
 484             goto unroll;
 485     }
 486     else if (!strcmp(command, "file")) {
 487         lex->name = util_strdup(param);
 488         vec_push(lex_filenames, lex->name);
 489     }
 490     else if (!strcmp(command, "line")) {
 491         line = strtol(param, NULL, 0)-1;
 492     }
 493     else
 494         goto unroll;
 495
 496     lex->line = line;
 497     while (ch != '\n' && ch != FS_FILE_EOF)
 498         ch = lex_getch(lex);
 499     vec_free(command);
 500     vec_free(param);
 501     vec_free(pragma);
 502     return true;
 503
 504 unroll:
 505     if (command) {
 506         vec_pop(command);
 507         while (vec_size(command)) {
 508             lex_ungetch(lex, (unsigned char)vec_last(command));
 509             vec_pop(command);
 510         }
 511         vec_free(command);
 512         lex_ungetch(lex, ' ');
 513     }
 514     if (param) {
 515         vec_pop(param);
 516         while (vec_size(param)) {
 517             lex_ungetch(lex, (unsigned char)vec_last(param));
 518             vec_pop(param);
 519         }
 520         vec_free(param);
 521         lex_ungetch(lex, ' ');
 522     }
 523     if (pragma) {
 524         vec_pop(pragma);
 525         while (vec_size(pragma)) {
 526             lex_ungetch(lex, (unsigned char)vec_last(pragma));
 527             vec_pop(pragma);
 528         }
 529         vec_free(pragma);
 530     }
 531     lex_ungetch(lex, '#');
 532
 533     lex->line = line;
 534     return false;
 535 }
 536
 537 /* Skip whitespace and comments and return the first
 538  * non-white character.
 539  * As this makes use of the above getch() ungetch() functions,
 540  * we don't need to care at all about line numbering anymore.
 541  *
 542  * In theory, this function should only be used at the beginning
 543  * of lexing, or when we *know* the next character is part of the token.
 544  * Otherwise, if the parser throws an error, the linenumber may not be
 545  * the line of the error, but the line of the next token AFTER the error.
 546  *
 547  * This is currently only problematic when using c-like string-continuation,
 548  * since comments and whitespaces are allowed between 2 such strings.
 549  * Example:
 550 printf(   "line one\n"
 551 // A comment
 552           "A continuation of the previous string"
 553 // This line is skipped
 554       , foo);
 555
 * In this case, if the parser decides it didn't actually want a string,
 557  * and uses lex->line to print an error, it will show the ', foo);' line's
 558  * linenumber.
 559  *
 560  * On the other hand, the parser is supposed to remember the line of the next
 561  * token's beginning. In this case we would want skipwhite() to be called
 562  * AFTER reading a token, so that the parser, before reading the NEXT token,
 * doesn't store the *comment's* linenumber, but the actual token's linenumber.
 564  *
 565  * THIS SOLUTION
 566  *    here is to store the line of the first character after skipping
 567  *    the initial whitespace in lex->sline, this happens in lex_do.
 568  */
/*
 * Consume whitespace and comments starting at the current read position.
 *
 * hadwhite: true when the caller already collected whitespace for the
 *           current token (lex_do passes true after a line merge).
 *
 * Normal mode (lex->flags.preprocessing unset):
 *   Whitespace, `//` comments and block comments are discarded.  After each
 *   newline a lexer pragma is tried (lex_try_pragma).  The first character
 *   that is neither whitespace nor part of a comment is returned;
 *   FS_FILE_EOF is returned at end of input.  If `hadwhite` was true that
 *   character is pushed back instead and TOKEN_WHITE is returned.
 *
 * Preprocessing mode:
 *   Whitespace is collected into the token value.  Comments are replaced
 *   by blanks, except that newlines inside block comments are kept.
 *   Returns TOKEN_EOL for a newline that was not preceded by whitespace,
 *   TOKEN_WHITE when whitespace or a comment was collected (the character
 *   ending it is pushed back), or the first non-white character when
 *   nothing was collected.
 */
static int lex_skipwhite(lex_file *lex, bool hadwhite)
{
    int ch = 0;
    bool haswhite = hadwhite;

    do
    {
        ch = lex_getch(lex);
        while (ch != FS_FILE_EOF && util_isspace(ch)) {
            if (ch == '\n') {
                /* a pragma may follow directly after a newline; when one was
                 * consumed, ch is still '\n' so the next line is tried too
                 */
                if (lex_try_pragma(lex))
                    continue;
            }
            if (lex->flags.preprocessing) {
                if (ch == '\n') {
                    /* end-of-line */
                    /* see if there was whitespace first */
                    if (haswhite) { /* (vec_size(lex->tok.value)) { */
                        /* report the whitespace now, the EOL on the next call */
                        lex_ungetch(lex, ch);
                        lex_endtoken(lex);
                        return TOKEN_WHITE;
                    }
                    /* otherwise return EOL */
                    return TOKEN_EOL;
                }
                haswhite = true;
                lex_tokench(lex, ch);
            }
            ch = lex_getch(lex);
        }

        if (ch == '/') {
            ch = lex_getch(lex);
            if (ch == '/')
            {
                /* one line comment */
                ch = lex_getch(lex);

                if (lex->flags.preprocessing) {
                    haswhite = true;
                    /*
                    lex_tokench(lex, '/');
                    lex_tokench(lex, '/');
                    */
                    /* the two slashes become two blanks */
                    lex_tokench(lex, ' ');
                    lex_tokench(lex, ' ');
                }

                while (ch != FS_FILE_EOF && ch != '\n') {
                    if (lex->flags.preprocessing)
                        lex_tokench(lex, ' '); /* ch); */
                    ch = lex_getch(lex);
                }
                if (lex->flags.preprocessing) {
                    /* a newline is pushed back even when the comment was
                     * ended by end of file
                     */
                    lex_ungetch(lex, '\n');
                    lex_endtoken(lex);
                    return TOKEN_WHITE;
                }
                /* ch is '\n' or EOF here; the loop condition decides */
                continue;
            }
            if (ch == '*')
            {
                /* multiline comment */
                if (lex->flags.preprocessing) {
                    haswhite = true;
                    /*
                    lex_tokench(lex, '/');
                    lex_tokench(lex, '*');
                    */
                    lex_tokench(lex, ' ');
                    lex_tokench(lex, ' ');
                }

                /* an unterminated comment silently runs to end of file */
                while (ch != FS_FILE_EOF)
                {
                    ch = lex_getch(lex);
                    if (ch == '*') {
                        ch = lex_getch(lex);
                        if (ch == '/') {
                            if (lex->flags.preprocessing) {
                                /*
                                lex_tokench(lex, '*');
                                lex_tokench(lex, '/');
                                */
                                lex_tokench(lex, ' ');
                                lex_tokench(lex, ' ');
                            }
                            break;
                        }
                        /* a lone '*': the following character is re-read by
                         * the next iteration
                         */
                        lex_ungetch(lex, ch);
                    }
                    if (lex->flags.preprocessing) {
                        /* NOTE(review): after a lone '*' ch holds the
                         * pushed-back character, so "*\n" appears to emit
                         * the newline twice - confirm whether intended
                         */
                        if (ch == '\n')
                            lex_tokench(lex, '\n');
                        else
                            lex_tokench(lex, ' '); /* ch); */
                    }
                }
                ch = ' '; /* cause TRUE in the isspace check */
                continue;
            }
            /* Otherwise roll back to the slash and break out of the loop */
            lex_ungetch(lex, ch);
            ch = '/';
            break;
        }
    } while (ch != FS_FILE_EOF && util_isspace(ch));

    if (haswhite) {
        /* hand out the whitespace first; ch is re-read by the next call */
        lex_endtoken(lex);
        lex_ungetch(lex, ch);
        return TOKEN_WHITE;
    }
    return ch;
}
 684
 685 /* Get a token */
 686 static bool GMQCC_WARN lex_finish_ident(lex_file *lex)
 687 {
 688     int ch;
 689
 690     ch = lex_getch(lex);
 691     while (ch != FS_FILE_EOF && isident(ch))
 692     {
 693         lex_tokench(lex, ch);
 694         ch = lex_getch(lex);
 695     }
 696
 697     /* last ch was not an ident ch: */
 698     lex_ungetch(lex, ch);
 699
 700     return true;
 701 }
 702
 703 /* read one ident for the frame list */
 704 static int lex_parse_frame(lex_file *lex)
 705 {
 706     int ch;
 707
 708     lex_token_new(lex);
 709
 710     ch = lex_getch(lex);
 711     while (ch != FS_FILE_EOF && ch != '\n' && util_isspace(ch))
 712         ch = lex_getch(lex);
 713
 714     if (ch == '\n')
 715         return 1;
 716
 717     if (!isident_start(ch)) {
 718         lexerror(lex, "invalid framename, must start with one of a-z or _, got %c", ch);
 719         return -1;
 720     }
 721
 722     lex_tokench(lex, ch);
 723     if (!lex_finish_ident(lex))
 724         return -1;
 725     lex_endtoken(lex);
 726     return 0;
 727 }
 728
 729 /* read a list of $frames */
 730 static bool lex_finish_frames(lex_file *lex)
 731 {
 732     do {
 733         size_t i;
 734         int    rc;
 735         frame_macro m;
 736
 737         rc = lex_parse_frame(lex);
 738         if (rc > 0) /* end of line */
 739             return true;
 740         if (rc < 0) /* error */
 741             return false;
 742
 743         for (i = 0; i < vec_size(lex->frames); ++i) {
 744             if (!strcmp(lex->tok.value, lex->frames[i].name)) {
 745                 lex->frames[i].value = lex->framevalue++;
 746                 if (lexwarn(lex, WARN_FRAME_MACROS, "duplicate frame macro defined: `%s`", lex->tok.value))
 747                     return false;
 748                 break;
 749             }
 750         }
 751         if (i < vec_size(lex->frames))
 752             continue;
 753
 754         m.value = lex->framevalue++;
 755         m.name = util_strdup(lex->tok.value);
 756         vec_shrinkto(lex->tok.value, 0);
 757         vec_push(lex->frames, m);
 758     } while (true);
 759
 760     return false;
 761 }
 762
 763 static int GMQCC_WARN lex_finish_string(lex_file *lex, int quote)
 764 {
 765     utf8ch_t chr = 0;
 766     int ch = 0;
 767     int nextch;
 768     bool hex;
 769     bool oct;
 770     char u8buf[8]; /* way more than enough */
 771     int  u8len, uc;
 772
 773     while (ch != FS_FILE_EOF)
 774     {
 775         ch = lex_getch(lex);
 776         if (ch == quote)
 777             return TOKEN_STRINGCONST;
 778
 779         if (lex->flags.preprocessing && ch == '\\') {
 780             lex_tokench(lex, ch);
 781             ch = lex_getch(lex);
 782             if (ch == FS_FILE_EOF) {
 783                 lexerror(lex, "unexpected end of file");
 784                 lex_ungetch(lex, FS_FILE_EOF); /* next token to be TOKEN_EOF */
 785                 return (lex->tok.ttype = TOKEN_ERROR);
 786             }
 787             lex_tokench(lex, ch);
 788         }
 789         else if (ch == '\\') {
 790             ch = lex_getch(lex);
 791             if (ch == FS_FILE_EOF) {
 792                 lexerror(lex, "unexpected end of file");
 793                 lex_ungetch(lex, FS_FILE_EOF); /* next token to be TOKEN_EOF */
 794                 return (lex->tok.ttype = TOKEN_ERROR);
 795             }
 796
 797             switch (ch) {
 798             case '\\': break;
 799             case '\'': break;
 800             case '"':  break;
 801             case 'a':  ch = '\a'; break;
 802             case 'b':  ch = '\b'; break;
 803             case 'r':  ch = '\r'; break;
 804             case 'n':  ch = '\n'; break;
 805             case 't':  ch = '\t'; break;
 806             case 'f':  ch = '\f'; break;
 807             case 'v':  ch = '\v'; break;
 808             case 'x':
 809             case 'X':
 810                 /* same procedure as in fteqcc */
 811                 ch = 0;
 812                 nextch = lex_getch(lex);
 813                 if      (nextch >= '0' && nextch <= '9')
 814                     ch += nextch - '0';
 815                 else if (nextch >= 'a' && nextch <= 'f')
 816                     ch += nextch - 'a' + 10;
 817                 else if (nextch >= 'A' && nextch <= 'F')
 818                     ch += nextch - 'A' + 10;
 819                 else {
 820                     lexerror(lex, "bad character code");
 821                     lex_ungetch(lex, nextch);
 822                     return (lex->tok.ttype = TOKEN_ERROR);
 823                 }
 824
 825                 ch *= 0x10;
 826                 nextch = lex_getch(lex);
 827                 if      (nextch >= '0' && nextch <= '9')
 828                     ch += nextch - '0';
 829                 else if (nextch >= 'a' && nextch <= 'f')
 830                     ch += nextch - 'a' + 10;
 831                 else if (nextch >= 'A' && nextch <= 'F')
 832                     ch += nextch - 'A' + 10;
 833                 else {
 834                     lexerror(lex, "bad character code");
 835                     lex_ungetch(lex, nextch);
 836                     return (lex->tok.ttype = TOKEN_ERROR);
 837                 }
 838                 break;
 839
 840             /* fteqcc support */
 841             case '0': case '1': case '2': case '3':
 842             case '4': case '5': case '6': case '7':
 843             case '8': case '9':
 844                 ch = 18 + ch - '0';
 845                 break;
 846             case '<':  ch = 29; break;
 847             case '-':  ch = 30; break;
 848             case '>':  ch = 31; break;
 849             case '[':  ch = 16; break;
 850             case ']':  ch = 17; break;
 851             case '{':
 852                 chr = 0;
 853                 nextch = lex_getch(lex);
 854                 hex = (nextch == 'x');
 855                 oct = (nextch == '0');
 856                 if (!hex && !oct)
 857                     lex_ungetch(lex, nextch);
 858                 for (nextch = lex_getch(lex); nextch != '}'; nextch = lex_getch(lex)) {
 859                     if (!hex && !oct) {
 860                         if (nextch >= '0' && nextch <= '9')
 861                             chr = chr * 10 + nextch - '0';
 862                         else {
 863                             lexerror(lex, "bad character code");
 864                             return (lex->tok.ttype = TOKEN_ERROR);
 865                         }
 866                     } else if (!oct) {
 867                         if (nextch >= '0' && nextch <= '9')
 868                             chr = chr * 0x10 + nextch - '0';
 869                         else if (nextch >= 'a' && nextch <= 'f')
 870                             chr = chr * 0x10 + nextch - 'a' + 10;
 871                         else if (nextch >= 'A' && nextch <= 'F')
 872                             chr = chr * 0x10 + nextch - 'A' + 10;
 873                         else {
 874                             lexerror(lex, "bad character code");
 875                             return (lex->tok.ttype = TOKEN_ERROR);
 876                         }
 877                     } else {
 878                         if (nextch >= '0' && nextch <= '9')
 879                             chr = chr * 8 + chr - '0';
 880                         else {
 881                             lexerror(lex, "bad character code");
 882                             return (lex->tok.ttype = TOKEN_ERROR);
 883                         }
 884                     }
 885                     if (chr > 0x10FFFF || (!OPTS_FLAG(UTF8) && chr > 255))
 886                     {
 887                         lexerror(lex, "character code out of range");
 888                         return (lex->tok.ttype = TOKEN_ERROR);
 889                     }
 890                 }
 891                 if (OPTS_FLAG(UTF8) && chr >= 128) {
 892                     u8len = utf8_from(u8buf, chr);
 893                     if (!u8len)
 894                         ch = 0;
 895                     else {
 896                         --u8len;
 897                         lex->column += u8len;
 898                         for (uc = 0; uc < u8len; ++uc)
 899                             lex_tokench(lex, u8buf[uc]);
 900                         /*
 901                          * the last character will be inserted with the tokench() call
 902                          * below the switch
 903                          */
 904                         ch = u8buf[uc];
 905                     }
 906                 }
 907                 else
 908                     ch = chr;
 909                 break;
 910             case '\n':  ch = '\n'; break;
 911
 912             default:
 913                 lexwarn(lex, WARN_UNKNOWN_CONTROL_SEQUENCE, "unrecognized control sequence: \\%c", ch);
 914                 /* so we just add the character plus backslash no matter what it actually is */
 915                 lex_tokench(lex, '\\');
 916             }
 917             /* add the character finally */
 918             lex_tokench(lex, ch);
 919         }
 920         else
 921             lex_tokench(lex, ch);
 922     }
 923     lexerror(lex, "unexpected end of file within string constant");
 924     lex_ungetch(lex, FS_FILE_EOF); /* next token to be TOKEN_EOF */
 925     return (lex->tok.ttype = TOKEN_ERROR);
 926 }
 927
 928 static int GMQCC_WARN lex_finish_digit(lex_file *lex, int lastch)
 929 {
 930     bool ishex = false;
 931
 932     int  ch = lastch;
 933
 934     /* parse a number... */
 935     if (ch == '.')
 936         lex->tok.ttype = TOKEN_FLOATCONST;
 937     else
 938         lex->tok.ttype = TOKEN_INTCONST;
 939
 940     lex_tokench(lex, ch);
 941
 942     ch = lex_getch(lex);
 943     if (ch != '.' && !util_isdigit(ch))
 944     {
 945         if (lastch != '0' || ch != 'x')
 946         {
 947             /* end of the number or EOF */
 948             lex_ungetch(lex, ch);
 949             lex_endtoken(lex);
 950
 951             lex->tok.constval.i = lastch - '0';
 952             return lex->tok.ttype;
 953         }
 954
 955         ishex = true;
 956     }
 957
 958     /* EOF would have been caught above */
 959
 960     if (ch != '.')
 961     {
 962         lex_tokench(lex, ch);
 963         ch = lex_getch(lex);
 964         while (util_isdigit(ch) || (ishex && isxdigit_only(ch)))
 965         {
 966             lex_tokench(lex, ch);
 967             ch = lex_getch(lex);
 968         }
 969     }
 970     /* NOT else, '.' can come from above as well */
 971     if (lex->tok.ttype != TOKEN_FLOATCONST && ch == '.' && !ishex)
 972     {
 973         /* Allow floating comma in non-hex mode */
 974         lex->tok.ttype = TOKEN_FLOATCONST;
 975         lex_tokench(lex, ch);
 976
 977         /* continue digits-only */
 978         ch = lex_getch(lex);
 979         while (util_isdigit(ch))
 980         {
 981             lex_tokench(lex, ch);
 982             ch = lex_getch(lex);
 983         }
 984     }
 985     /* put back the last character */
 986     /* but do not put back the trailing 'f' or a float */
 987     if (lex->tok.ttype == TOKEN_FLOATCONST && ch == 'f')
 988         ch = lex_getch(lex);
 989
 990     /* generally we don't want words to follow numbers: */
 991     if (isident(ch)) {
 992         lexerror(lex, "unexpected trailing characters after number");
 993         return (lex->tok.ttype = TOKEN_ERROR);
 994     }
 995     lex_ungetch(lex, ch);
 996
 997     lex_endtoken(lex);
 998     if (lex->tok.ttype == TOKEN_FLOATCONST)
 999         lex->tok.constval.f = strtod(lex->tok.value, NULL);
1000     else
1001         lex->tok.constval.i = strtol(lex->tok.value, NULL, 0);
1002     return lex->tok.ttype;
1003 }
1004
1005 int lex_do(lex_file *lex)
1006 {
1007     int ch, nextch, thirdch;
1008     bool hadwhite = false;
1009
1010     lex_token_new(lex);
1011 #if 0
1012     if (!lex->tok)
1013         return TOKEN_FATAL;
1014 #endif
1015
1016     while (true) {
1017         ch = lex_skipwhite(lex, hadwhite);
1018         hadwhite = true;
1019         if (!lex->flags.mergelines || ch != '\\')
1020             break;
1021         ch = lex_getch(lex);
1022         if (ch == '\r')
1023             ch = lex_getch(lex);
1024         if (ch != '\n') {
1025             lex_ungetch(lex, ch);
1026             ch = '\\';
1027             break;
1028         }
1029         /* we reached a linemerge */
1030         lex_tokench(lex, '\n');
1031         continue;
1032     }
1033
1034     if (lex->flags.preprocessing && (ch == TOKEN_WHITE || ch == TOKEN_EOL || ch == TOKEN_FATAL)) {
1035         return (lex->tok.ttype = ch);
1036     }
1037
1038     lex->sline = lex->line;
1039     lex->tok.ctx.line = lex->sline;
1040     lex->tok.ctx.file = lex->name;
1041
1042     if (lex->eof)
1043         return (lex->tok.ttype = TOKEN_FATAL);
1044
1045     if (ch == FS_FILE_EOF) {
1046         lex->eof = true;
1047         return (lex->tok.ttype = TOKEN_EOF);
1048     }
1049
1050     /* modelgen / spiritgen commands */
1051     if (ch == '$' && !lex->flags.preprocessing) {
1052         const char *v;
1053         size_t frame;
1054
1055         ch = lex_getch(lex);
1056         if (!isident_start(ch)) {
1057             lexerror(lex, "hanging '$' modelgen/spritegen command line");
1058             return lex_do(lex);
1059         }
1060         lex_tokench(lex, ch);
1061         if (!lex_finish_ident(lex))
1062             return (lex->tok.ttype = TOKEN_ERROR);
1063         lex_endtoken(lex);
1064         /* skip the known commands */
1065         v = lex->tok.value;
1066
1067         if (!strcmp(v, "frame") || !strcmp(v, "framesave"))
1068         {
1069             /* frame/framesave command works like an enum
1070              * similar to fteqcc we handle this in the lexer.
1071              * The reason for this is that it is sensitive to newlines,
1072              * which the parser is unaware of
1073              */
1074             if (!lex_finish_frames(lex))
1075                  return (lex->tok.ttype = TOKEN_ERROR);
1076             return lex_do(lex);
1077         }
1078
1079         if (!strcmp(v, "framevalue"))
1080         {
1081             ch = lex_getch(lex);
1082             while (ch != FS_FILE_EOF && util_isspace(ch) && ch != '\n')
1083                 ch = lex_getch(lex);
1084
1085             if (!util_isdigit(ch)) {
1086                 lexerror(lex, "$framevalue requires an integer parameter");
1087                 return lex_do(lex);
1088             }
1089
1090             lex_token_new(lex);
1091             lex->tok.ttype = lex_finish_digit(lex, ch);
1092             lex_endtoken(lex);
1093             if (lex->tok.ttype != TOKEN_INTCONST) {
1094                 lexerror(lex, "$framevalue requires an integer parameter");
1095                 return lex_do(lex);
1096             }
1097             lex->framevalue = lex->tok.constval.i;
1098             return lex_do(lex);
1099         }
1100
1101         if (!strcmp(v, "framerestore"))
1102         {
1103             int rc;
1104
1105             lex_token_new(lex);
1106
1107             rc = lex_parse_frame(lex);
1108
1109             if (rc > 0) {
1110                 lexerror(lex, "$framerestore requires a framename parameter");
1111                 return lex_do(lex);
1112             }
1113             if (rc < 0)
1114                 return (lex->tok.ttype = TOKEN_FATAL);
1115
1116             v = lex->tok.value;
1117             for (frame = 0; frame < vec_size(lex->frames); ++frame) {
1118                 if (!strcmp(v, lex->frames[frame].name)) {
1119                     lex->framevalue = lex->frames[frame].value;
1120                     return lex_do(lex);
1121                 }
1122             }
1123             lexerror(lex, "unknown framename `%s`", v);
1124             return lex_do(lex);
1125         }
1126
1127         if (!strcmp(v, "modelname"))
1128         {
1129             int rc;
1130
1131             lex_token_new(lex);
1132
1133             rc = lex_parse_frame(lex);
1134
1135             if (rc > 0) {
1136                 lexerror(lex, "$modelname requires a parameter");
1137                 return lex_do(lex);
1138             }
1139             if (rc < 0)
1140                 return (lex->tok.ttype = TOKEN_FATAL);
1141
1142             if (lex->modelname) {
1143                 frame_macro m;
1144                 m.value = lex->framevalue;
1145                 m.name = lex->modelname;
1146                 lex->modelname = NULL;
1147                 vec_push(lex->frames, m);
1148             }
1149             lex->modelname = lex->tok.value;
1150             lex->tok.value = NULL;
1151             return lex_do(lex);
1152         }
1153
1154         if (!strcmp(v, "flush"))
1155         {
1156             size_t fi;
1157             for (fi = 0; fi < vec_size(lex->frames); ++fi)
1158                 mem_d(lex->frames[fi].name);
1159             vec_free(lex->frames);
1160             /* skip line (fteqcc does it too) */
1161             ch = lex_getch(lex);
1162             while (ch != FS_FILE_EOF && ch != '\n')
1163                 ch = lex_getch(lex);
1164             return lex_do(lex);
1165         }
1166
1167         if (!strcmp(v, "cd") ||
1168             !strcmp(v, "origin") ||
1169             !strcmp(v, "base") ||
1170             !strcmp(v, "flags") ||
1171             !strcmp(v, "scale") ||
1172             !strcmp(v, "skin"))
1173         {
1174             /* skip line */
1175             ch = lex_getch(lex);
1176             while (ch != FS_FILE_EOF && ch != '\n')
1177                 ch = lex_getch(lex);
1178             return lex_do(lex);
1179         }
1180
1181         for (frame = 0; frame < vec_size(lex->frames); ++frame) {
1182             if (!strcmp(v, lex->frames[frame].name)) {
1183                 lex->tok.constval.i = lex->frames[frame].value;
1184                 return (lex->tok.ttype = TOKEN_INTCONST);
1185             }
1186         }
1187
1188         lexerror(lex, "invalid frame macro");
1189         return lex_do(lex);
1190     }
1191
1192     /* single-character tokens */
1193     switch (ch)
1194     {
1195         case '[':
1196             nextch = lex_getch(lex);
1197             if (nextch == '[') {
1198                 lex_tokench(lex, ch);
1199                 lex_tokench(lex, nextch);
1200                 lex_endtoken(lex);
1201                 return (lex->tok.ttype = TOKEN_ATTRIBUTE_OPEN);
1202             }
1203             lex_ungetch(lex, nextch);
1204             /* FALL THROUGH */
1205         case '(':
1206         case ':':
1207         case '?':
1208             lex_tokench(lex, ch);
1209             lex_endtoken(lex);
1210             if (lex->flags.noops)
1211                 return (lex->tok.ttype = ch);
1212             else
1213                 return (lex->tok.ttype = TOKEN_OPERATOR);
1214
1215         case ']':
1216             if (lex->flags.noops) {
1217                 nextch = lex_getch(lex);
1218                 if (nextch == ']') {
1219                     lex_tokench(lex, ch);
1220                     lex_tokench(lex, nextch);
1221                     lex_endtoken(lex);
1222                     return (lex->tok.ttype = TOKEN_ATTRIBUTE_CLOSE);
1223                 }
1224                 lex_ungetch(lex, nextch);
1225             }
1226             /* FALL THROUGH */
1227         case ')':
1228         case ';':
1229         case '{':
1230         case '}':
1231
1232         case '#':
1233             lex_tokench(lex, ch);
1234             lex_endtoken(lex);
1235             return (lex->tok.ttype = ch);
1236         default:
1237             break;
1238     }
1239
1240     if (ch == '.') {
1241         nextch = lex_getch(lex);
1242         /* digits starting with a dot */
1243         if (util_isdigit(nextch)) {
1244             lex_ungetch(lex, nextch);
1245             lex->tok.ttype = lex_finish_digit(lex, ch);
1246             lex_endtoken(lex);
1247             return lex->tok.ttype;
1248         }
1249         lex_ungetch(lex, nextch);
1250     }
1251
1252     if (lex->flags.noops)
1253     {
1254         /* Detect characters early which are normally
1255          * operators OR PART of an operator.
1256          */
1257         switch (ch)
1258         {
1259             /*
1260             case '+':
1261             case '-':
1262             */
1263             case '*':
1264             case '/':
1265             case '<':
1266             case '>':
1267             case '=':
1268             case '&':
1269             case '|':
1270             case '^':
1271             case '~':
1272             case ',':
1273             case '!':
1274                 lex_tokench(lex, ch);
1275                 lex_endtoken(lex);
1276                 return (lex->tok.ttype = ch);
1277             default:
1278                 break;
1279         }
1280     }
1281
1282     if (ch == '.')
1283     {
1284         lex_tokench(lex, ch);
1285         /* peak ahead once */
1286         nextch = lex_getch(lex);
1287         if (nextch != '.') {
1288             lex_ungetch(lex, nextch);
1289             lex_endtoken(lex);
1290             if (lex->flags.noops)
1291                 return (lex->tok.ttype = ch);
1292             else
1293                 return (lex->tok.ttype = TOKEN_OPERATOR);
1294         }
1295         /* peak ahead again */
1296         nextch = lex_getch(lex);
1297         if (nextch != '.') {
1298             lex_ungetch(lex, nextch);
1299             lex_ungetch(lex, '.');
1300             lex_endtoken(lex);
1301             if (lex->flags.noops)
1302                 return (lex->tok.ttype = ch);
1303             else
1304                 return (lex->tok.ttype = TOKEN_OPERATOR);
1305         }
1306         /* fill the token to be "..." */
1307         lex_tokench(lex, ch);
1308         lex_tokench(lex, ch);
1309         lex_endtoken(lex);
1310         return (lex->tok.ttype = TOKEN_DOTS);
1311     }
1312
1313     if (ch == ',' || ch == '.') {
1314         lex_tokench(lex, ch);
1315         lex_endtoken(lex);
1316         return (lex->tok.ttype = TOKEN_OPERATOR);
1317     }
1318
1319     if (ch == '+' || ch == '-' || /* ++, --, +=, -=  and -> as well! */
1320         ch == '>' || ch == '<' || /* <<, >>, <=, >=  and >< as well! */
1321         ch == '=' || ch == '!' || /* <=>, ==, !=                     */
1322         ch == '&' || ch == '|' || /* &&, ||, &=, |=                  */
1323         ch == '~' || ch == '^'    /* ~=, ~, ^                        */
1324     )  {
1325         lex_tokench(lex, ch);
1326
1327         nextch = lex_getch(lex);
1328         if ((nextch == '=' && ch != '<') ||
1329             (nextch == ch  && ch != '!') ||
1330             (nextch == '<' && ch == '>')) {
1331             lex_tokench(lex, nextch);
1332         } else if (ch == '<' && nextch == '=') {
1333             lex_tokench(lex, nextch);
1334             if ((thirdch = lex_getch(lex)) == '>')
1335                 lex_tokench(lex, thirdch);
1336             else
1337                 lex_ungetch(lex, thirdch);
1338
1339         } else if (ch == '-' && nextch == '>') {
1340             lex_tokench(lex, nextch);
1341         } else if (ch == '&' && nextch == '~') {
1342             thirdch = lex_getch(lex);
1343             if (thirdch != '=') {
1344                 lex_ungetch(lex, thirdch);
1345                 lex_ungetch(lex, nextch);
1346             }
1347             else {
1348                 lex_tokench(lex, nextch);
1349                 lex_tokench(lex, thirdch);
1350             }
1351         }
1352         else if (lex->flags.preprocessing &&
1353                  ch == '-' && util_isdigit(nextch))
1354         {
1355             lex->tok.ttype = lex_finish_digit(lex, nextch);
1356             if (lex->tok.ttype == TOKEN_INTCONST)
1357                 lex->tok.constval.i = -lex->tok.constval.i;
1358             else
1359                 lex->tok.constval.f = -lex->tok.constval.f;
1360             lex_endtoken(lex);
1361             return lex->tok.ttype;
1362         } else {
1363             lex_ungetch(lex, nextch);
1364         }
1365
1366         lex_endtoken(lex);
1367         return (lex->tok.ttype = TOKEN_OPERATOR);
1368     }
1369
1370     /*
1371     if (ch == '^' || ch == '~' || ch == '!')
1372     {
1373         lex_tokench(lex, ch);
1374         lex_endtoken(lex);
1375         return (lex->tok.ttype = TOKEN_OPERATOR);
1376     }
1377     */
1378
1379     if (ch == '*' || ch == '/') /* *=, /= */
1380     {
1381         lex_tokench(lex, ch);
1382
1383         nextch = lex_getch(lex);
1384         if (nextch == '=' || nextch == '*') {
1385             lex_tokench(lex, nextch);
1386         } else
1387             lex_ungetch(lex, nextch);
1388
1389         lex_endtoken(lex);
1390         return (lex->tok.ttype = TOKEN_OPERATOR);
1391     }
1392
1393     if (ch == '%') {
1394         lex_tokench(lex, ch);
1395         lex_endtoken(lex);
1396         return (lex->tok.ttype = TOKEN_OPERATOR);
1397     }
1398
1399     if (isident_start(ch))
1400     {
1401         const char *v;
1402
1403         lex_tokench(lex, ch);
1404         if (!lex_finish_ident(lex)) {
1405             /* error? */
1406             return (lex->tok.ttype = TOKEN_ERROR);
1407         }
1408         lex_endtoken(lex);
1409         lex->tok.ttype = TOKEN_IDENT;
1410
1411         v = lex->tok.value;
1412         if (!strcmp(v, "void")) {
1413             lex->tok.ttype = TOKEN_TYPENAME;
1414             lex->tok.constval.t = TYPE_VOID;
1415         } else if (!strcmp(v, "int")) {
1416             lex->tok.ttype = TOKEN_TYPENAME;
1417             lex->tok.constval.t = TYPE_INTEGER;
1418         } else if (!strcmp(v, "float")) {
1419             lex->tok.ttype = TOKEN_TYPENAME;
1420             lex->tok.constval.t = TYPE_FLOAT;
1421         } else if (!strcmp(v, "string")) {
1422             lex->tok.ttype = TOKEN_TYPENAME;
1423             lex->tok.constval.t = TYPE_STRING;
1424         } else if (!strcmp(v, "entity")) {
1425             lex->tok.ttype = TOKEN_TYPENAME;
1426             lex->tok.constval.t = TYPE_ENTITY;
1427         } else if (!strcmp(v, "vector")) {
1428             lex->tok.ttype = TOKEN_TYPENAME;
1429             lex->tok.constval.t = TYPE_VECTOR;
1430         } else {
1431             size_t kw;
1432             for (kw = 0; kw < GMQCC_ARRAY_COUNT(keywords_qc); ++kw) {
1433                 if (!strcmp(v, keywords_qc[kw]))
1434                     return (lex->tok.ttype = TOKEN_KEYWORD);
1435             }
1436             if (OPTS_OPTION_U32(OPTION_STANDARD) != COMPILER_QCC) {
1437                 for (kw = 0; kw < GMQCC_ARRAY_COUNT(keywords_fg); ++kw) {
1438                     if (!strcmp(v, keywords_fg[kw]))
1439                         return (lex->tok.ttype = TOKEN_KEYWORD);
1440                 }
1441             }
1442         }
1443
1444         return lex->tok.ttype;
1445     }
1446
1447     if (ch == '"')
1448     {
1449         lex->flags.nodigraphs = true;
1450         if (lex->flags.preprocessing)
1451             lex_tokench(lex, ch);
1452         lex->tok.ttype = lex_finish_string(lex, '"');
1453         if (lex->flags.preprocessing)
1454             lex_tokench(lex, ch);
1455         while (!lex->flags.preprocessing && lex->tok.ttype == TOKEN_STRINGCONST)
1456         {
1457             /* Allow c style "string" "continuation" */
1458             ch = lex_skipwhite(lex, false);
1459             if (ch != '"') {
1460                 lex_ungetch(lex, ch);
1461                 break;
1462             }
1463
1464             lex->tok.ttype = lex_finish_string(lex, '"');
1465         }
1466         lex->flags.nodigraphs = false;
1467         lex_endtoken(lex);
1468         return lex->tok.ttype;
1469     }
1470
1471     if (ch == '\'')
1472     {
1473         /* we parse character constants like string,
1474          * but return TOKEN_CHARCONST, or a vector type if it fits...
1475          * Likewise actual unescaping has to be done by the parser.
1476          * The difference is we don't allow 'char' 'continuation'.
1477          */
1478         if (lex->flags.preprocessing)
1479             lex_tokench(lex, ch);
1480         lex->tok.ttype = lex_finish_string(lex, '\'');
1481         if (lex->flags.preprocessing)
1482             lex_tokench(lex, ch);
1483         lex_endtoken(lex);
1484
1485         lex->tok.ttype = TOKEN_CHARCONST;
1486
1487         /* It's a vector if we can successfully scan 3 floats */
1488         if (util_sscanf(lex->tok.value, " %f %f %f ",
1489                    &lex->tok.constval.v.x, &lex->tok.constval.v.y, &lex->tok.constval.v.z) == 3)
1490
1491         {
1492              lex->tok.ttype = TOKEN_VECTORCONST;
1493         }
1494         else
1495         {
1496             if (!lex->flags.preprocessing && strlen(lex->tok.value) > 1) {
1497                 utf8ch_t u8char;
1498                 /* check for a valid utf8 character */
1499                 if (!OPTS_FLAG(UTF8) || !utf8_to(&u8char, (const unsigned char *)lex->tok.value, 8)) {
1500                     if (lexwarn(lex, WARN_MULTIBYTE_CHARACTER,
1501                                 ( OPTS_FLAG(UTF8) ? "invalid multibyte character sequence `%s`"
1502                                                   : "multibyte character: `%s`" ),
1503                                 lex->tok.value))
1504                         return (lex->tok.ttype = TOKEN_ERROR);
1505                 }
1506                 else
1507                     lex->tok.constval.i = u8char;
1508             }
1509             else
1510                 lex->tok.constval.i = lex->tok.value[0];
1511         }
1512
1513         return lex->tok.ttype;
1514     }
1515
1516     if (util_isdigit(ch))
1517     {
1518         lex->tok.ttype = lex_finish_digit(lex, ch);
1519         lex_endtoken(lex);
1520         return lex->tok.ttype;
1521     }
1522
1523     if (lex->flags.preprocessing) {
1524         lex_tokench(lex, ch);
1525         lex_endtoken(lex);
1526         return (lex->tok.ttype = ch);
1527     }
1528
1529     lexerror(lex, "unknown token: `%c`", ch);
1530     return (lex->tok.ttype = TOKEN_ERROR);
1531 }