lexer.c

   1 /*
   2  * Copyright (C) 2012, 2013
   3  *     Wolfgang Bumiller
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a copy of
   6  * this software and associated documentation files (the "Software"), to deal in
   7  * the Software without restriction, including without limitation the rights to
   8  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
   9  * of the Software, and to permit persons to whom the Software is furnished to do
  10  * so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be included in all
  13  * copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23 #include <string.h>
  24 #include <stdlib.h>
  25
  26 #include "gmqcc.h"
  27 #include "lexer.h"
  28
  29 /*
  30  * List of Keywords
  31  */
  32
  33 /* original */
  34 static const char *keywords_qc[] = {
  35     "for", "do", "while",
  36     "if", "else",
  37     "local",
  38     "return",
  39     "const"
  40 };
  41 /* For fte/gmgqcc */
  42 static const char *keywords_fg[] = {
  43     "switch", "case", "default",
  44     "struct", "union",
  45     "break", "continue",
  46     "typedef",
  47     "goto",
  48
  49     "__builtin_debug_printtype"
  50 };
  51
  52 /*
  53  * Lexer code
  54  */
/*
 * Vector of every file-name string handed out as lex->name.
 * lex_open(), lex_open_string() and the "file" pragma push their duplicated
 * name here; lex_close() deliberately does not free lex->name, the strings
 * are released together in lex_cleanup().
 */
static char* *lex_filenames;
  56
  57 static void lexerror(lex_file *lex, const char *fmt, ...)
  58 {
  59     va_list ap;
  60
  61     va_start(ap, fmt);
  62     if (lex)
  63         con_vprintmsg(LVL_ERROR, lex->name, lex->sline, lex->column, "parse error", fmt, ap);
  64     else
  65         con_vprintmsg(LVL_ERROR, "", 0, 0, "parse error", fmt, ap);
  66     va_end(ap);
  67 }
  68
  69 static bool lexwarn(lex_file *lex, int warntype, const char *fmt, ...)
  70 {
  71     bool      r;
  72     lex_ctx_t ctx;
  73     va_list   ap;
  74
  75     ctx.file   = lex->name;
  76     ctx.line   = lex->sline;
  77     ctx.column = lex->column;
  78
  79     va_start(ap, fmt);
  80     r = vcompile_warning(ctx, warntype, fmt, ap);
  81     va_end(ap);
  82     return r;
  83 }
  84
  85
  86 #if 0
/*
 * (Disabled: compiled out by the enclosing `#if 0`.)
 * Allocate a zero-filled token with mem_a().
 * Returns NULL when the allocation fails.
 */
token* token_new()
{
    token *tok = (token*)mem_a(sizeof(token));
    if (!tok)
        return NULL;
    memset(tok, 0, sizeof(*tok));
    return tok;
}
  95
/*
 * (Disabled: compiled out by the enclosing `#if 0`.)
 * Unlink `self` from its prev/next neighbours, release its value buffer
 * and free the token itself. `self` must not be NULL.
 */
void token_delete(token *self)
{
    /* only patch a neighbour that actually points back at this token */
    if (self->next && self->next->prev == self)
        self->next->prev = self->prev;
    if (self->prev && self->prev->next == self)
        self->prev->next = self->next;
    MEM_VECTOR_CLEAR(self, value);
    mem_d(self);
}
 105
/*
 * (Disabled: compiled out by the enclosing `#if 0`.)
 * Create a copy of a single token: value bytes, context, type and constant
 * value. The prev/next links of the copy stay zeroed (from token_new()).
 * Returns NULL when an allocation fails.
 */
token* token_copy(const token *cp)
{
    token* self = token_new();
    if (!self)
        return NULL;
    /* copy the value */
    /* one extra byte is reserved for the terminating 0 written below */
    self->value_alloc = cp->value_count + 1;
    self->value_count = cp->value_count;
    self->value = (char*)mem_a(self->value_alloc);
    if (!self->value) {
        mem_d(self);
        return NULL;
    }
    memcpy(self->value, cp->value, cp->value_count);
    self->value[self->value_alloc-1] = 0;

    /* rest */
    self->ctx = cp->ctx;
    self->ttype = cp->ttype;
    memcpy(&self->constval, &cp->constval, sizeof(self->constval));
    return self;
}
 128
/*
 * (Disabled: compiled out by the enclosing `#if 0`.)
 * Delete `t` and every token reachable from it through `next`.
 * `t` must not be NULL: the do/while dereferences it before testing it.
 */
void token_delete_all(token *t)
{
    token *n;

    do {
        /* fetch the successor first, token_delete() frees `t` */
        n = t->next;
        token_delete(t);
        t = n;
    } while(t);
}
 139
/*
 * (Disabled: compiled out by the enclosing `#if 0`.)
 * Copy the chain of tokens starting at `cp` (following `next`), linking the
 * copies with both next and prev pointers.
 * Returns the head of the new chain, or NULL if any copy fails; in that case
 * the partially built chain is deleted again.
 */
token* token_copy_all(const token *cp)
{
    token *cur;
    token *out;

    out = cur = token_copy(cp);
    if (!out)
        return NULL;

    while (cp->next) {
        cp = cp->next;
        cur->next = token_copy(cp);
        if (!cur->next) {
            token_delete_all(out);
            return NULL;
        }
        cur->next->prev = cur;
        cur = cur->next;
    }

    return out;
}
 162 #else
 163 static void lex_token_new(lex_file *lex)
 164 {
 165 #if 0
 166     if (lex->tok)
 167         token_delete(lex->tok);
 168     lex->tok = token_new();
 169 #else
 170     if (lex->tok.value)
 171         vec_shrinkto(lex->tok.value, 0);
 172
 173     lex->tok.constval.t  = 0;
 174     lex->tok.ctx.line    = lex->sline;
 175     lex->tok.ctx.file    = lex->name;
 176     lex->tok.ctx.column  = lex->column;
 177 #endif
 178 }
 179 #endif
 180
/* Forward declarations: lex_open() and the tri/digraph helpers use these
 * before their definitions further down in this file.
 */
static void lex_ungetch(lex_file *lex, int ch);
static int lex_getch(lex_file *lex);
 183
 184 lex_file* lex_open(const char *file)
 185 {
 186     lex_file  *lex;
 187     fs_file_t *in = fs_file_open(file, "rb");
 188     uint32_t   read;
 189
 190     if (!in) {
 191         lexerror(NULL, "open failed: '%s'\n", file);
 192         return NULL;
 193     }
 194
 195     lex = (lex_file*)mem_a(sizeof(*lex));
 196     if (!lex) {
 197         fs_file_close(in);
 198         lexerror(NULL, "out of memory\n");
 199         return NULL;
 200     }
 201
 202     memset(lex, 0, sizeof(*lex));
 203
 204     lex->file    = in;
 205     lex->name    = util_strdup(file);
 206     lex->line    = 1; /* we start counting at 1 */
 207     lex->column  = 0;
 208     lex->peekpos = 0;
 209     lex->eof     = false;
 210
 211     /* handle BOM */
 212     if ((read = (lex_getch(lex) << 16) | (lex_getch(lex) << 8) | lex_getch(lex)) != 0xEFBBBF) {
 213         lex_ungetch(lex, (read & 0x0000FF));
 214         lex_ungetch(lex, (read & 0x00FF00) >> 8);
 215         lex_ungetch(lex, (read & 0xFF0000) >> 16);
 216     } else {
 217         /*
 218          * otherwise the lexer has advanced 3 bytes for the BOM, we need
 219          * to set the column back to 0
 220          */
 221         lex->column = 0;
 222     }
 223
 224     vec_push(lex_filenames, lex->name);
 225     return lex;
 226 }
 227
 228 lex_file* lex_open_string(const char *str, size_t len, const char *name)
 229 {
 230     lex_file *lex;
 231
 232     lex = (lex_file*)mem_a(sizeof(*lex));
 233     if (!lex) {
 234         lexerror(NULL, "out of memory\n");
 235         return NULL;
 236     }
 237
 238     memset(lex, 0, sizeof(*lex));
 239
 240     lex->file = NULL;
 241     lex->open_string        = str;
 242     lex->open_string_length = len;
 243     lex->open_string_pos    = 0;
 244
 245     lex->name    = util_strdup(name ? name : "<string-source>");
 246     lex->line    = 1; /* we start counting at 1 */
 247     lex->peekpos = 0;
 248     lex->eof     = false;
 249     lex->column  = 0;
 250
 251     vec_push(lex_filenames, lex->name);
 252
 253     return lex;
 254 }
 255
 256 void lex_cleanup(void)
 257 {
 258     size_t i;
 259     for (i = 0; i < vec_size(lex_filenames); ++i)
 260         mem_d(lex_filenames[i]);
 261     vec_free(lex_filenames);
 262 }
 263
 264 void lex_close(lex_file *lex)
 265 {
 266     size_t i;
 267     for (i = 0; i < vec_size(lex->frames); ++i)
 268         mem_d(lex->frames[i].name);
 269     vec_free(lex->frames);
 270
 271     if (lex->modelname)
 272         vec_free(lex->modelname);
 273
 274     if (lex->file)
 275         fs_file_close(lex->file);
 276 #if 0
 277     if (lex->tok)
 278         token_delete(lex->tok);
 279 #else
 280     vec_free(lex->tok.value);
 281 #endif
 282     /* mem_d(lex->name); collected in lex_filenames */
 283     mem_d(lex);
 284 }
 285
 286
 287
 288 static int lex_fgetc(lex_file *lex)
 289 {
 290     if (lex->file) {
 291         lex->column++;
 292         return fs_file_getc(lex->file);
 293     }
 294     if (lex->open_string) {
 295         if (lex->open_string_pos >= lex->open_string_length)
 296             return FS_FILE_EOF;
 297         lex->column++;
 298         return lex->open_string[lex->open_string_pos++];
 299     }
 300     return FS_FILE_EOF;
 301 }
 302
/* Get or put-back data
 * The following two functions do NOT understand what kind of data they
 * are working on.
 * They are merely wrapping get/put in order to count line numbers.
 */
 308 static int lex_try_trigraph(lex_file *lex, int old)
 309 {
 310     int c2, c3;
 311     c2 = lex_fgetc(lex);
 312     if (!lex->push_line && c2 == '\n') {
 313         lex->line++;
 314         lex->column = 0;
 315     }
 316
 317     if (c2 != '?') {
 318         lex_ungetch(lex, c2);
 319         return old;
 320     }
 321
 322     c3 = lex_fgetc(lex);
 323     if (!lex->push_line && c3 == '\n') {
 324         lex->line++;
 325         lex->column = 0;
 326     }
 327
 328     switch (c3) {
 329         case '=': return '#';
 330         case '/': return '\\';
 331         case '\'': return '^';
 332         case '(': return '[';
 333         case ')': return ']';
 334         case '!': return '|';
 335         case '<': return '{';
 336         case '>': return '}';
 337         case '-': return '~';
 338         default:
 339             lex_ungetch(lex, c3);
 340             lex_ungetch(lex, c2);
 341             return old;
 342     }
 343 }
 344
 345 static int lex_try_digraph(lex_file *lex, int ch)
 346 {
 347     int c2;
 348     c2 = lex_fgetc(lex);
 349     /* we just used fgetc() so count lines
 350      * need to offset a \n the ungetch would recognize
 351      */
 352     if (!lex->push_line && c2 == '\n')
 353         lex->line++;
 354     if      (ch == '<' && c2 == ':')
 355         return '[';
 356     else if (ch == ':' && c2 == '>')
 357         return ']';
 358     else if (ch == '<' && c2 == '%')
 359         return '{';
 360     else if (ch == '%' && c2 == '>')
 361         return '}';
 362     else if (ch == '%' && c2 == ':')
 363         return '#';
 364     lex_ungetch(lex, c2);
 365     return ch;
 366 }
 367
 368 static int lex_getch(lex_file *lex)
 369 {
 370     int ch;
 371
 372     if (lex->peekpos) {
 373         lex->peekpos--;
 374         if (!lex->push_line && lex->peek[lex->peekpos] == '\n') {
 375             lex->line++;
 376             lex->column = 0;
 377         }
 378         return lex->peek[lex->peekpos];
 379     }
 380
 381     ch = lex_fgetc(lex);
 382     if (!lex->push_line && ch == '\n') {
 383         lex->line++;
 384         lex->column = 0;
 385     }
 386     else if (ch == '?')
 387         return lex_try_trigraph(lex, ch);
 388     else if (!lex->flags.nodigraphs && (ch == '<' || ch == ':' || ch == '%'))
 389         return lex_try_digraph(lex, ch);
 390     return ch;
 391 }
 392
 393 static void lex_ungetch(lex_file *lex, int ch)
 394 {
 395     lex->peek[lex->peekpos++] = ch;
 396     lex->column--;
 397     if (!lex->push_line && ch == '\n') {
 398         lex->line--;
 399         lex->column = 0;
 400     }
 401 }
 402
 403 /* classify characters
 404  * some additions to the is*() functions of ctype.h
 405  */
 406
 407 /* Idents are alphanumberic, but they start with alpha or _ */
 408 static bool isident_start(int ch)
 409 {
 410     return util_isalpha(ch) || ch == '_';
 411 }
 412
 413 static bool isident(int ch)
 414 {
 415     return isident_start(ch) || util_isdigit(ch);
 416 }
 417
 418 /* isxdigit_only is used when we already know it's not a digit
 419  * and want to see if it's a hex digit anyway.
 420  */
 421 static bool isxdigit_only(int ch)
 422 {
 423     return (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
 424 }
 425
/* Append a character to the token buffer (lex->tok.value).
 * The buffer is not terminated by this call; see lex_endtoken().
 */
static void lex_tokench(lex_file *lex, int ch)
{
    vec_push(lex->tok.value, ch);
}
 431
/* Append a trailing null-byte.
 * The 0 is pushed and the vector is immediately shrunk by one again --
 * presumably this leaves the byte in storage so the value reads as a C
 * string while it is not counted in vec_size(); the vec_* macros are
 * defined elsewhere, TODO confirm.
 */
static void lex_endtoken(lex_file *lex)
{
    vec_push(lex->tok.value, 0);
    vec_shrinkby(lex->tok.value, 1);
}
 438
 439 static bool lex_try_pragma(lex_file *lex)
 440 {
 441     int ch;
 442     char *pragma  = NULL;
 443     char *command = NULL;
 444     char *param   = NULL;
 445     size_t line;
 446
 447     if (lex->flags.preprocessing)
 448         return false;
 449
 450     line = lex->line;
 451
 452     ch = lex_getch(lex);
 453     if (ch != '#') {
 454         lex_ungetch(lex, ch);
 455         return false;
 456     }
 457
 458     for (ch = lex_getch(lex); vec_size(pragma) < 8 && ch >= 'a' && ch <= 'z'; ch = lex_getch(lex))
 459         vec_push(pragma, ch);
 460     vec_push(pragma, 0);
 461
 462     if (ch != ' ' || strcmp(pragma, "pragma")) {
 463         lex_ungetch(lex, ch);
 464         goto unroll;
 465     }
 466
 467     for (ch = lex_getch(lex); vec_size(command) < 32 && ch >= 'a' && ch <= 'z'; ch = lex_getch(lex))
 468         vec_push(command, ch);
 469     vec_push(command, 0);
 470
 471     if (ch != '(') {
 472         lex_ungetch(lex, ch);
 473         goto unroll;
 474     }
 475
 476     for (ch = lex_getch(lex); vec_size(param) < 1024 && ch != ')' && ch != '\n'; ch = lex_getch(lex))
 477         vec_push(param, ch);
 478     vec_push(param, 0);
 479
 480     if (ch != ')') {
 481         lex_ungetch(lex, ch);
 482         goto unroll;
 483     }
 484
 485     if (!strcmp(command, "push")) {
 486         if (!strcmp(param, "line")) {
 487             lex->push_line++;
 488             if (lex->push_line == 1)
 489                 --line;
 490         }
 491         else
 492             goto unroll;
 493     }
 494     else if (!strcmp(command, "pop")) {
 495         if (!strcmp(param, "line")) {
 496             if (lex->push_line)
 497                 lex->push_line--;
 498             if (lex->push_line == 0)
 499                 --line;
 500         }
 501         else
 502             goto unroll;
 503     }
 504     else if (!strcmp(command, "file")) {
 505         lex->name = util_strdup(param);
 506         vec_push(lex_filenames, lex->name);
 507     }
 508     else if (!strcmp(command, "line")) {
 509         line = strtol(param, NULL, 0)-1;
 510     }
 511     else
 512         goto unroll;
 513
 514     lex->line = line;
 515     while (ch != '\n' && ch != FS_FILE_EOF)
 516         ch = lex_getch(lex);
 517     vec_free(command);
 518     vec_free(param);
 519     vec_free(pragma);
 520     return true;
 521
 522 unroll:
 523     if (command) {
 524         vec_pop(command);
 525         while (vec_size(command)) {
 526             lex_ungetch(lex, (unsigned char)vec_last(command));
 527             vec_pop(command);
 528         }
 529         vec_free(command);
 530         lex_ungetch(lex, ' ');
 531     }
 532     if (param) {
 533         vec_pop(param);
 534         while (vec_size(param)) {
 535             lex_ungetch(lex, (unsigned char)vec_last(param));
 536             vec_pop(param);
 537         }
 538         vec_free(param);
 539         lex_ungetch(lex, ' ');
 540     }
 541     if (pragma) {
 542         vec_pop(pragma);
 543         while (vec_size(pragma)) {
 544             lex_ungetch(lex, (unsigned char)vec_last(pragma));
 545             vec_pop(pragma);
 546         }
 547         vec_free(pragma);
 548     }
 549     lex_ungetch(lex, '#');
 550
 551     lex->line = line;
 552     return false;
 553 }
 554
/* Skip whitespace and comments and return the first
 * non-white character.
 * As this makes use of the above getch() ungetch() functions,
 * we don't need to care at all about line numbering anymore.
 *
 * In theory, this function should only be used at the beginning
 * of lexing, or when we *know* the next character is part of the token.
 * Otherwise, if the parser throws an error, the linenumber may not be
 * the line of the error, but the line of the next token AFTER the error.
 *
 * This is currently only problematic when using c-like string-continuation,
 * since comments and whitespaces are allowed between 2 such strings.
 * Example:
printf(   "line one\n"
// A comment
          "A continuation of the previous string"
// This line is skipped
      , foo);

 * In this case, if the parse decides it didn't actually want a string,
 * and uses lex->line to print an error, it will show the ', foo);' line's
 * linenumber.
 *
 * On the other hand, the parser is supposed to remember the line of the next
 * token's beginning. In this case we would want skipwhite() to be called
 * AFTER reading a token, so that the parser, before reading the NEXT token,
 * doesn't store the *comment's* linenumber, but the actual token's linenumber.
 *
 * THIS SOLUTION
 *    here is to store the line of the first character after skipping
 *    the initial whitespace in lex->sline, this happens in lex_do.
 *
 * Parameters:
 *    hadwhite - initial value of the "whitespace was seen" state.
 *
 * Return value:
 *    - the first non-white character (possibly FS_FILE_EOF) when no
 *      whitespace was recorded,
 *    - TOKEN_WHITE when whitespace was recorded (always the case if
 *      `hadwhite` is true); the character that ended the run is pushed back,
 *    - TOKEN_EOL in preprocessing mode for a newline that was not preceded
 *      by recorded whitespace.
 *
 * In preprocessing mode (lex->flags.preprocessing) the skipped text is
 * collected in the token buffer, with comment characters replaced by spaces
 * (newlines inside block comments are kept).
 */
static int lex_skipwhite(lex_file *lex, bool hadwhite)
{
    int ch = 0;
    bool haswhite = hadwhite;

    do
    {
        ch = lex_getch(lex);
        while (ch != FS_FILE_EOF && util_isspace(ch)) {
            if (ch == '\n') {
                /* a pragma may follow a newline; when one was consumed,
                 * `ch` is still '\n' and the check is repeated for the
                 * next line
                 */
                if (lex_try_pragma(lex))
                    continue;
            }
            if (lex->flags.preprocessing) {
                if (ch == '\n') {
                    /* end-of-line */
                    /* see if there was whitespace first */
                    if (haswhite) { /* (vec_size(lex->tok.value)) { */
                        /* the newline is pushed back so it is seen again
                         * after the whitespace token
                         */
                        lex_ungetch(lex, ch);
                        lex_endtoken(lex);
                        return TOKEN_WHITE;
                    }
                    /* otherwise return EOL */
                    return TOKEN_EOL;
                }
                haswhite = true;
                lex_tokench(lex, ch);
            }
            ch = lex_getch(lex);
        }

        if (ch == '/') {
            ch = lex_getch(lex);
            if (ch == '/')
            {
                /* one line comment */
                ch = lex_getch(lex);

                if (lex->flags.preprocessing) {
                    haswhite = true;
                    /*
                    lex_tokench(lex, '/');
                    lex_tokench(lex, '/');
                    */
                    /* the two slashes are recorded as two spaces */
                    lex_tokench(lex, ' ');
                    lex_tokench(lex, ' ');
                }

                while (ch != FS_FILE_EOF && ch != '\n') {
                    if (lex->flags.preprocessing)
                        lex_tokench(lex, ' '); /* ch); */
                    ch = lex_getch(lex);
                }
                if (lex->flags.preprocessing) {
                    /* a newline is pushed back even when the comment was
                     * ended by end-of-file
                     */
                    lex_ungetch(lex, '\n');
                    lex_endtoken(lex);
                    return TOKEN_WHITE;
                }
                /* `continue` in a do/while jumps to the loop condition:
                 * a newline keeps the loop going, EOF ends it
                 */
                continue;
            }
            if (ch == '*')
            {
                /* multiline comment */
                if (lex->flags.preprocessing) {
                    haswhite = true;
                    /*
                    lex_tokench(lex, '/');
                    lex_tokench(lex, '*');
                    */
                    lex_tokench(lex, ' ');
                    lex_tokench(lex, ' ');
                }

                while (ch != FS_FILE_EOF)
                {
                    ch = lex_getch(lex);
                    if (ch == '*') {
                        ch = lex_getch(lex);
                        if (ch == '/') {
                            if (lex->flags.preprocessing) {
                                /*
                                lex_tokench(lex, '*');
                                lex_tokench(lex, '/');
                                */
                                lex_tokench(lex, ' ');
                                lex_tokench(lex, ' ');
                            }
                            break;
                        }
                        /* not the end of the comment: re-examine the
                         * character after the '*' in the next iteration
                         */
                        lex_ungetch(lex, ch);
                    }
                    if (lex->flags.preprocessing) {
                        if (ch == '\n')
                            lex_tokench(lex, '\n');
                        else
                            lex_tokench(lex, ' '); /* ch); */
                    }
                }
                ch = ' '; /* cause TRUE in the isspace check */
                continue;
            }
            /* Otherwise roll back to the slash and break out of the loop */
            lex_ungetch(lex, ch);
            ch = '/';
            break;
        }
    } while (ch != FS_FILE_EOF && util_isspace(ch));

    if (haswhite) {
        lex_endtoken(lex);
        lex_ungetch(lex, ch);
        return TOKEN_WHITE;
    }
    return ch;
}
 702
 703 /* Get a token */
 704 static bool GMQCC_WARN lex_finish_ident(lex_file *lex)
 705 {
 706     int ch;
 707
 708     ch = lex_getch(lex);
 709     while (ch != FS_FILE_EOF && isident(ch))
 710     {
 711         lex_tokench(lex, ch);
 712         ch = lex_getch(lex);
 713     }
 714
 715     /* last ch was not an ident ch: */
 716     lex_ungetch(lex, ch);
 717
 718     return true;
 719 }
 720
 721 /* read one ident for the frame list */
 722 static int lex_parse_frame(lex_file *lex)
 723 {
 724     int ch;
 725
 726     lex_token_new(lex);
 727
 728     ch = lex_getch(lex);
 729     while (ch != FS_FILE_EOF && ch != '\n' && util_isspace(ch))
 730         ch = lex_getch(lex);
 731
 732     if (ch == '\n')
 733         return 1;
 734
 735     if (!isident_start(ch)) {
 736         lexerror(lex, "invalid framename, must start with one of a-z or _, got %c", ch);
 737         return -1;
 738     }
 739
 740     lex_tokench(lex, ch);
 741     if (!lex_finish_ident(lex))
 742         return -1;
 743     lex_endtoken(lex);
 744     return 0;
 745 }
 746
 747 /* read a list of $frames */
 748 static bool lex_finish_frames(lex_file *lex)
 749 {
 750     do {
 751         size_t i;
 752         int    rc;
 753         frame_macro m;
 754
 755         rc = lex_parse_frame(lex);
 756         if (rc > 0) /* end of line */
 757             return true;
 758         if (rc < 0) /* error */
 759             return false;
 760
 761         for (i = 0; i < vec_size(lex->frames); ++i) {
 762             if (!strcmp(lex->tok.value, lex->frames[i].name)) {
 763                 lex->frames[i].value = lex->framevalue++;
 764                 if (lexwarn(lex, WARN_FRAME_MACROS, "duplicate frame macro defined: `%s`", lex->tok.value))
 765                     return false;
 766                 break;
 767             }
 768         }
 769         if (i < vec_size(lex->frames))
 770             continue;
 771
 772         m.value = lex->framevalue++;
 773         m.name = util_strdup(lex->tok.value);
 774         vec_shrinkto(lex->tok.value, 0);
 775         vec_push(lex->frames, m);
 776     } while (true);
 777
 778     return false;
 779 }
 780
 781 static int GMQCC_WARN lex_finish_string(lex_file *lex, int quote)
 782 {
 783     utf8ch_t chr = 0;
 784     int ch = 0;
 785     int nextch;
 786     bool hex;
 787     bool oct;
 788     char u8buf[8]; /* way more than enough */
 789     int  u8len, uc;
 790
 791     while (ch != FS_FILE_EOF)
 792     {
 793         ch = lex_getch(lex);
 794         if (ch == quote)
 795             return TOKEN_STRINGCONST;
 796
 797         if (lex->flags.preprocessing && ch == '\\') {
 798             lex_tokench(lex, ch);
 799             ch = lex_getch(lex);
 800             if (ch == FS_FILE_EOF) {
 801                 lexerror(lex, "unexpected end of file");
 802                 lex_ungetch(lex, FS_FILE_EOF); /* next token to be TOKEN_EOF */
 803                 return (lex->tok.ttype = TOKEN_ERROR);
 804             }
 805             lex_tokench(lex, ch);
 806         }
 807         else if (ch == '\\') {
 808             ch = lex_getch(lex);
 809             if (ch == FS_FILE_EOF) {
 810                 lexerror(lex, "unexpected end of file");
 811                 lex_ungetch(lex, FS_FILE_EOF); /* next token to be TOKEN_EOF */
 812                 return (lex->tok.ttype = TOKEN_ERROR);
 813             }
 814
 815             switch (ch) {
 816             case '\\': break;
 817             case '\'': break;
 818             case '"':  break;
 819             case 'a':  ch = '\a'; break;
 820             case 'b':  ch = '\b'; break;
 821             case 'r':  ch = '\r'; break;
 822             case 'n':  ch = '\n'; break;
 823             case 't':  ch = '\t'; break;
 824             case 'f':  ch = '\f'; break;
 825             case 'v':  ch = '\v'; break;
 826             case 'x':
 827             case 'X':
 828                 /* same procedure as in fteqcc */
 829                 ch = 0;
 830                 nextch = lex_getch(lex);
 831                 if      (nextch >= '0' && nextch <= '9')
 832                     ch += nextch - '0';
 833                 else if (nextch >= 'a' && nextch <= 'f')
 834                     ch += nextch - 'a' + 10;
 835                 else if (nextch >= 'A' && nextch <= 'F')
 836                     ch += nextch - 'A' + 10;
 837                 else {
 838                     lexerror(lex, "bad character code");
 839                     lex_ungetch(lex, nextch);
 840                     return (lex->tok.ttype = TOKEN_ERROR);
 841                 }
 842
 843                 ch *= 0x10;
 844                 nextch = lex_getch(lex);
 845                 if      (nextch >= '0' && nextch <= '9')
 846                     ch += nextch - '0';
 847                 else if (nextch >= 'a' && nextch <= 'f')
 848                     ch += nextch - 'a' + 10;
 849                 else if (nextch >= 'A' && nextch <= 'F')
 850                     ch += nextch - 'A' + 10;
 851                 else {
 852                     lexerror(lex, "bad character code");
 853                     lex_ungetch(lex, nextch);
 854                     return (lex->tok.ttype = TOKEN_ERROR);
 855                 }
 856                 break;
 857
 858             /* fteqcc support */
 859             case '0': case '1': case '2': case '3':
 860             case '4': case '5': case '6': case '7':
 861             case '8': case '9':
 862                 ch = 18 + ch - '0';
 863                 break;
 864             case '<':  ch = 29; break;
 865             case '-':  ch = 30; break;
 866             case '>':  ch = 31; break;
 867             case '[':  ch = 16; break;
 868             case ']':  ch = 17; break;
 869             case '{':
 870                 chr = 0;
 871                 nextch = lex_getch(lex);
 872                 hex = (nextch == 'x');
 873                 oct = (nextch == '0');
 874                 if (!hex && !oct)
 875                     lex_ungetch(lex, nextch);
 876                 for (nextch = lex_getch(lex); nextch != '}'; nextch = lex_getch(lex)) {
 877                     if (!hex && !oct) {
 878                         if (nextch >= '0' && nextch <= '9')
 879                             chr = chr * 10 + nextch - '0';
 880                         else {
 881                             lexerror(lex, "bad character code");
 882                             return (lex->tok.ttype = TOKEN_ERROR);
 883                         }
 884                     } else if (!oct) {
 885                         if (nextch >= '0' && nextch <= '9')
 886                             chr = chr * 0x10 + nextch - '0';
 887                         else if (nextch >= 'a' && nextch <= 'f')
 888                             chr = chr * 0x10 + nextch - 'a' + 10;
 889                         else if (nextch >= 'A' && nextch <= 'F')
 890                             chr = chr * 0x10 + nextch - 'A' + 10;
 891                         else {
 892                             lexerror(lex, "bad character code");
 893                             return (lex->tok.ttype = TOKEN_ERROR);
 894                         }
 895                     } else {
 896                         if (nextch >= '0' && nextch <= '9')
 897                             chr = chr * 8 + chr - '0';
 898                         else {
 899                             lexerror(lex, "bad character code");
 900                             return (lex->tok.ttype = TOKEN_ERROR);
 901                         }
 902                     }
 903                     if (chr > 0x10FFFF || (!OPTS_FLAG(UTF8) && chr > 255))
 904                     {
 905                         lexerror(lex, "character code out of range");
 906                         return (lex->tok.ttype = TOKEN_ERROR);
 907                     }
 908                 }
 909                 if (OPTS_FLAG(UTF8) && chr >= 128) {
 910                     u8len = utf8_from(u8buf, chr);
 911                     if (!u8len)
 912                         ch = 0;
 913                     else {
 914                         --u8len;
 915                         lex->column += u8len;
 916                         for (uc = 0; uc < u8len; ++uc)
 917                             lex_tokench(lex, u8buf[uc]);
 918                         /*
 919                          * the last character will be inserted with the tokench() call
 920                          * below the switch
 921                          */
 922                         ch = u8buf[uc];
 923                     }
 924                 }
 925                 else
 926                     ch = chr;
 927                 break;
 928             case '\n':  ch = '\n'; break;
 929
 930             default:
 931                 lexwarn(lex, WARN_UNKNOWN_CONTROL_SEQUENCE, "unrecognized control sequence: \\%c", ch);
 932                 /* so we just add the character plus backslash no matter what it actually is */
 933                 lex_tokench(lex, '\\');
 934             }
 935             /* add the character finally */
 936             lex_tokench(lex, ch);
 937         }
 938         else
 939             lex_tokench(lex, ch);
 940     }
 941     lexerror(lex, "unexpected end of file within string constant");
 942     lex_ungetch(lex, FS_FILE_EOF); /* next token to be TOKEN_EOF */
 943     return (lex->tok.ttype = TOKEN_ERROR);
 944 }
 945
 946 static int GMQCC_WARN lex_finish_digit(lex_file *lex, int lastch)
 947 {
 948     bool ishex = false;
 949
 950     int  ch = lastch;
 951
 952     /* parse a number... */
 953     if (ch == '.')
 954         lex->tok.ttype = TOKEN_FLOATCONST;
 955     else
 956         lex->tok.ttype = TOKEN_INTCONST;
 957
 958     lex_tokench(lex, ch);
 959
 960     ch = lex_getch(lex);
 961     if (ch != '.' && !util_isdigit(ch))
 962     {
 963         if (lastch != '0' || ch != 'x')
 964         {
 965             /* end of the number or EOF */
 966             lex_ungetch(lex, ch);
 967             lex_endtoken(lex);
 968
 969             lex->tok.constval.i = lastch - '0';
 970             return lex->tok.ttype;
 971         }
 972
 973         ishex = true;
 974     }
 975
 976     /* EOF would have been caught above */
 977
 978     if (ch != '.')
 979     {
 980         lex_tokench(lex, ch);
 981         ch = lex_getch(lex);
 982         while (util_isdigit(ch) || (ishex && isxdigit_only(ch)))
 983         {
 984             lex_tokench(lex, ch);
 985             ch = lex_getch(lex);
 986         }
 987     }
 988     /* NOT else, '.' can come from above as well */
 989     if (lex->tok.ttype != TOKEN_FLOATCONST && ch == '.' && !ishex)
 990     {
 991         /* Allow floating comma in non-hex mode */
 992         lex->tok.ttype = TOKEN_FLOATCONST;
 993         lex_tokench(lex, ch);
 994
 995         /* continue digits-only */
 996         ch = lex_getch(lex);
 997         while (util_isdigit(ch))
 998         {
 999             lex_tokench(lex, ch);
1000             ch = lex_getch(lex);
1001         }
1002     }
1003     /* put back the last character */
1004     /* but do not put back the trailing 'f' or a float */
1005     if (lex->tok.ttype == TOKEN_FLOATCONST && ch == 'f')
1006         ch = lex_getch(lex);
1007
1008     /* generally we don't want words to follow numbers: */
1009     if (isident(ch)) {
1010         lexerror(lex, "unexpected trailing characters after number");
1011         return (lex->tok.ttype = TOKEN_ERROR);
1012     }
1013     lex_ungetch(lex, ch);
1014
1015     lex_endtoken(lex);
1016     if (lex->tok.ttype == TOKEN_FLOATCONST)
1017         lex->tok.constval.f = strtod(lex->tok.value, NULL);
1018     else
1019         lex->tok.constval.i = strtol(lex->tok.value, NULL, 0);
1020     return lex->tok.ttype;
1021 }
1022
1023 int lex_do(lex_file *lex)
1024 {
1025     int ch, nextch, thirdch;
1026     bool hadwhite = false;
1027
1028     lex_token_new(lex);
1029 #if 0
1030     if (!lex->tok)
1031         return TOKEN_FATAL;
1032 #endif
1033
1034     while (true) {
1035         ch = lex_skipwhite(lex, hadwhite);
1036         hadwhite = true;
1037         if (!lex->flags.mergelines || ch != '\\')
1038             break;
1039         ch = lex_getch(lex);
1040         if (ch == '\r')
1041             ch = lex_getch(lex);
1042         if (ch != '\n') {
1043             lex_ungetch(lex, ch);
1044             ch = '\\';
1045             break;
1046         }
1047         /* we reached a linemerge */
1048         lex_tokench(lex, '\n');
1049         continue;
1050     }
1051
1052     if (lex->flags.preprocessing && (ch == TOKEN_WHITE || ch == TOKEN_EOL || ch == TOKEN_FATAL)) {
1053         return (lex->tok.ttype = ch);
1054     }
1055
1056     lex->sline = lex->line;
1057     lex->tok.ctx.line = lex->sline;
1058     lex->tok.ctx.file = lex->name;
1059
1060     if (lex->eof)
1061         return (lex->tok.ttype = TOKEN_FATAL);
1062
1063     if (ch == FS_FILE_EOF) {
1064         lex->eof = true;
1065         return (lex->tok.ttype = TOKEN_EOF);
1066     }
1067
1068     /* modelgen / spiritgen commands */
1069     if (ch == '$' && !lex->flags.preprocessing) {
1070         const char *v;
1071         size_t frame;
1072
1073         ch = lex_getch(lex);
1074         if (!isident_start(ch)) {
1075             lexerror(lex, "hanging '$' modelgen/spritegen command line");
1076             return lex_do(lex);
1077         }
1078         lex_tokench(lex, ch);
1079         if (!lex_finish_ident(lex))
1080             return (lex->tok.ttype = TOKEN_ERROR);
1081         lex_endtoken(lex);
1082         /* skip the known commands */
1083         v = lex->tok.value;
1084
1085         if (!strcmp(v, "frame") || !strcmp(v, "framesave"))
1086         {
1087             /* frame/framesave command works like an enum
1088              * similar to fteqcc we handle this in the lexer.
1089              * The reason for this is that it is sensitive to newlines,
1090              * which the parser is unaware of
1091              */
1092             if (!lex_finish_frames(lex))
1093                  return (lex->tok.ttype = TOKEN_ERROR);
1094             return lex_do(lex);
1095         }
1096
1097         if (!strcmp(v, "framevalue"))
1098         {
1099             ch = lex_getch(lex);
1100             while (ch != FS_FILE_EOF && util_isspace(ch) && ch != '\n')
1101                 ch = lex_getch(lex);
1102
1103             if (!util_isdigit(ch)) {
1104                 lexerror(lex, "$framevalue requires an integer parameter");
1105                 return lex_do(lex);
1106             }
1107
1108             lex_token_new(lex);
1109             lex->tok.ttype = lex_finish_digit(lex, ch);
1110             lex_endtoken(lex);
1111             if (lex->tok.ttype != TOKEN_INTCONST) {
1112                 lexerror(lex, "$framevalue requires an integer parameter");
1113                 return lex_do(lex);
1114             }
1115             lex->framevalue = lex->tok.constval.i;
1116             return lex_do(lex);
1117         }
1118
1119         if (!strcmp(v, "framerestore"))
1120         {
1121             int rc;
1122
1123             lex_token_new(lex);
1124
1125             rc = lex_parse_frame(lex);
1126
1127             if (rc > 0) {
1128                 lexerror(lex, "$framerestore requires a framename parameter");
1129                 return lex_do(lex);
1130             }
1131             if (rc < 0)
1132                 return (lex->tok.ttype = TOKEN_FATAL);
1133
1134             v = lex->tok.value;
1135             for (frame = 0; frame < vec_size(lex->frames); ++frame) {
1136                 if (!strcmp(v, lex->frames[frame].name)) {
1137                     lex->framevalue = lex->frames[frame].value;
1138                     return lex_do(lex);
1139                 }
1140             }
1141             lexerror(lex, "unknown framename `%s`", v);
1142             return lex_do(lex);
1143         }
1144
1145         if (!strcmp(v, "modelname"))
1146         {
1147             int rc;
1148
1149             lex_token_new(lex);
1150
1151             rc = lex_parse_frame(lex);
1152
1153             if (rc > 0) {
1154                 lexerror(lex, "$modelname requires a parameter");
1155                 return lex_do(lex);
1156             }
1157             if (rc < 0)
1158                 return (lex->tok.ttype = TOKEN_FATAL);
1159
1160             if (lex->modelname) {
1161                 frame_macro m;
1162                 m.value = lex->framevalue;
1163                 m.name = lex->modelname;
1164                 lex->modelname = NULL;
1165                 vec_push(lex->frames, m);
1166             }
1167             lex->modelname = lex->tok.value;
1168             lex->tok.value = NULL;
1169             return lex_do(lex);
1170         }
1171
1172         if (!strcmp(v, "flush"))
1173         {
1174             size_t fi;
1175             for (fi = 0; fi < vec_size(lex->frames); ++fi)
1176                 mem_d(lex->frames[fi].name);
1177             vec_free(lex->frames);
1178             /* skip line (fteqcc does it too) */
1179             ch = lex_getch(lex);
1180             while (ch != FS_FILE_EOF && ch != '\n')
1181                 ch = lex_getch(lex);
1182             return lex_do(lex);
1183         }
1184
1185         if (!strcmp(v, "cd") ||
1186             !strcmp(v, "origin") ||
1187             !strcmp(v, "base") ||
1188             !strcmp(v, "flags") ||
1189             !strcmp(v, "scale") ||
1190             !strcmp(v, "skin"))
1191         {
1192             /* skip line */
1193             ch = lex_getch(lex);
1194             while (ch != FS_FILE_EOF && ch != '\n')
1195                 ch = lex_getch(lex);
1196             return lex_do(lex);
1197         }
1198
1199         for (frame = 0; frame < vec_size(lex->frames); ++frame) {
1200             if (!strcmp(v, lex->frames[frame].name)) {
1201                 lex->tok.constval.i = lex->frames[frame].value;
1202                 return (lex->tok.ttype = TOKEN_INTCONST);
1203             }
1204         }
1205
1206         lexerror(lex, "invalid frame macro");
1207         return lex_do(lex);
1208     }
1209
1210     /* single-character tokens */
1211     switch (ch)
1212     {
1213         case '[':
1214             nextch = lex_getch(lex);
1215             if (nextch == '[') {
1216                 lex_tokench(lex, ch);
1217                 lex_tokench(lex, nextch);
1218                 lex_endtoken(lex);
1219                 return (lex->tok.ttype = TOKEN_ATTRIBUTE_OPEN);
1220             }
1221             lex_ungetch(lex, nextch);
1222             /* FALL THROUGH */
1223         case '(':
1224         case ':':
1225         case '?':
1226             lex_tokench(lex, ch);
1227             lex_endtoken(lex);
1228             if (lex->flags.noops)
1229                 return (lex->tok.ttype = ch);
1230             else
1231                 return (lex->tok.ttype = TOKEN_OPERATOR);
1232
1233         case ']':
1234             if (lex->flags.noops) {
1235                 nextch = lex_getch(lex);
1236                 if (nextch == ']') {
1237                     lex_tokench(lex, ch);
1238                     lex_tokench(lex, nextch);
1239                     lex_endtoken(lex);
1240                     return (lex->tok.ttype = TOKEN_ATTRIBUTE_CLOSE);
1241                 }
1242                 lex_ungetch(lex, nextch);
1243             }
1244             /* FALL THROUGH */
1245         case ')':
1246         case ';':
1247         case '{':
1248         case '}':
1249
1250         case '#':
1251             lex_tokench(lex, ch);
1252             lex_endtoken(lex);
1253             return (lex->tok.ttype = ch);
1254         default:
1255             break;
1256     }
1257
1258     if (ch == '.') {
1259         nextch = lex_getch(lex);
1260         /* digits starting with a dot */
1261         if (util_isdigit(nextch)) {
1262             lex_ungetch(lex, nextch);
1263             lex->tok.ttype = lex_finish_digit(lex, ch);
1264             lex_endtoken(lex);
1265             return lex->tok.ttype;
1266         }
1267         lex_ungetch(lex, nextch);
1268     }
1269
1270     if (lex->flags.noops)
1271     {
1272         /* Detect characters early which are normally
1273          * operators OR PART of an operator.
1274          */
1275         switch (ch)
1276         {
1277             /*
1278             case '+':
1279             case '-':
1280             */
1281             case '*':
1282             case '/':
1283             case '<':
1284             case '>':
1285             case '=':
1286             case '&':
1287             case '|':
1288             case '^':
1289             case '~':
1290             case ',':
1291             case '!':
1292                 lex_tokench(lex, ch);
1293                 lex_endtoken(lex);
1294                 return (lex->tok.ttype = ch);
1295             default:
1296                 break;
1297         }
1298     }
1299
1300     if (ch == '.')
1301     {
1302         lex_tokench(lex, ch);
1303         /* peak ahead once */
1304         nextch = lex_getch(lex);
1305         if (nextch != '.') {
1306             lex_ungetch(lex, nextch);
1307             lex_endtoken(lex);
1308             if (lex->flags.noops)
1309                 return (lex->tok.ttype = ch);
1310             else
1311                 return (lex->tok.ttype = TOKEN_OPERATOR);
1312         }
1313         /* peak ahead again */
1314         nextch = lex_getch(lex);
1315         if (nextch != '.') {
1316             lex_ungetch(lex, nextch);
1317             lex_ungetch(lex, '.');
1318             lex_endtoken(lex);
1319             if (lex->flags.noops)
1320                 return (lex->tok.ttype = ch);
1321             else
1322                 return (lex->tok.ttype = TOKEN_OPERATOR);
1323         }
1324         /* fill the token to be "..." */
1325         lex_tokench(lex, ch);
1326         lex_tokench(lex, ch);
1327         lex_endtoken(lex);
1328         return (lex->tok.ttype = TOKEN_DOTS);
1329     }
1330
1331     if (ch == ',' || ch == '.') {
1332         lex_tokench(lex, ch);
1333         lex_endtoken(lex);
1334         return (lex->tok.ttype = TOKEN_OPERATOR);
1335     }
1336
1337     if (ch == '+' || ch == '-' || /* ++, --, +=, -=  and -> as well! */
1338         ch == '>' || ch == '<' || /* <<, >>, <=, >=  and >< as well! */
1339         ch == '=' || ch == '!' || /* <=>, ==, !=                     */
1340         ch == '&' || ch == '|' || /* &&, ||, &=, |=                  */
1341         ch == '~' || ch == '^'    /* ~=, ~, ^                        */
1342     )  {
1343         lex_tokench(lex, ch);
1344
1345         nextch = lex_getch(lex);
1346         if ((nextch == '=' && ch != '<') ||
1347             (nextch == ch  && ch != '!') ||
1348             (nextch == '<' && ch == '>')) {
1349             lex_tokench(lex, nextch);
1350         } else if (ch == '<' && nextch == '=') {
1351             lex_tokench(lex, nextch);
1352             if ((thirdch = lex_getch(lex)) == '>')
1353                 lex_tokench(lex, thirdch);
1354             else
1355                 lex_ungetch(lex, thirdch);
1356
1357         } else if (ch == '-' && nextch == '>') {
1358             lex_tokench(lex, nextch);
1359         } else if (ch == '&' && nextch == '~') {
1360             thirdch = lex_getch(lex);
1361             if (thirdch != '=') {
1362                 lex_ungetch(lex, thirdch);
1363                 lex_ungetch(lex, nextch);
1364             }
1365             else {
1366                 lex_tokench(lex, nextch);
1367                 lex_tokench(lex, thirdch);
1368             }
1369         }
1370         else if (lex->flags.preprocessing &&
1371                  ch == '-' && util_isdigit(nextch))
1372         {
1373             lex->tok.ttype = lex_finish_digit(lex, nextch);
1374             if (lex->tok.ttype == TOKEN_INTCONST)
1375                 lex->tok.constval.i = -lex->tok.constval.i;
1376             else
1377                 lex->tok.constval.f = -lex->tok.constval.f;
1378             lex_endtoken(lex);
1379             return lex->tok.ttype;
1380         } else {
1381             lex_ungetch(lex, nextch);
1382         }
1383
1384         lex_endtoken(lex);
1385         return (lex->tok.ttype = TOKEN_OPERATOR);
1386     }
1387
1388     /*
1389     if (ch == '^' || ch == '~' || ch == '!')
1390     {
1391         lex_tokench(lex, ch);
1392         lex_endtoken(lex);
1393         return (lex->tok.ttype = TOKEN_OPERATOR);
1394     }
1395     */
1396
1397     if (ch == '*' || ch == '/') /* *=, /= */
1398     {
1399         lex_tokench(lex, ch);
1400
1401         nextch = lex_getch(lex);
1402         if (nextch == '=' || nextch == '*') {
1403             lex_tokench(lex, nextch);
1404         } else
1405             lex_ungetch(lex, nextch);
1406
1407         lex_endtoken(lex);
1408         return (lex->tok.ttype = TOKEN_OPERATOR);
1409     }
1410
1411     if (ch == '%') {
1412         lex_tokench(lex, ch);
1413         lex_endtoken(lex);
1414         return (lex->tok.ttype = TOKEN_OPERATOR);
1415     }
1416
1417     if (isident_start(ch))
1418     {
1419         const char *v;
1420
1421         lex_tokench(lex, ch);
1422         if (!lex_finish_ident(lex)) {
1423             /* error? */
1424             return (lex->tok.ttype = TOKEN_ERROR);
1425         }
1426         lex_endtoken(lex);
1427         lex->tok.ttype = TOKEN_IDENT;
1428
1429         v = lex->tok.value;
1430         if (!strcmp(v, "void")) {
1431             lex->tok.ttype = TOKEN_TYPENAME;
1432             lex->tok.constval.t = TYPE_VOID;
1433         } else if (!strcmp(v, "int")) {
1434             lex->tok.ttype = TOKEN_TYPENAME;
1435             lex->tok.constval.t = TYPE_INTEGER;
1436         } else if (!strcmp(v, "float")) {
1437             lex->tok.ttype = TOKEN_TYPENAME;
1438             lex->tok.constval.t = TYPE_FLOAT;
1439         } else if (!strcmp(v, "string")) {
1440             lex->tok.ttype = TOKEN_TYPENAME;
1441             lex->tok.constval.t = TYPE_STRING;
1442         } else if (!strcmp(v, "entity")) {
1443             lex->tok.ttype = TOKEN_TYPENAME;
1444             lex->tok.constval.t = TYPE_ENTITY;
1445         } else if (!strcmp(v, "vector")) {
1446             lex->tok.ttype = TOKEN_TYPENAME;
1447             lex->tok.constval.t = TYPE_VECTOR;
1448         } else {
1449             size_t kw;
1450             for (kw = 0; kw < GMQCC_ARRAY_COUNT(keywords_qc); ++kw) {
1451                 if (!strcmp(v, keywords_qc[kw]))
1452                     return (lex->tok.ttype = TOKEN_KEYWORD);
1453             }
1454             if (OPTS_OPTION_U32(OPTION_STANDARD) != COMPILER_QCC) {
1455                 for (kw = 0; kw < GMQCC_ARRAY_COUNT(keywords_fg); ++kw) {
1456                     if (!strcmp(v, keywords_fg[kw]))
1457                         return (lex->tok.ttype = TOKEN_KEYWORD);
1458                 }
1459             }
1460         }
1461
1462         return lex->tok.ttype;
1463     }
1464
1465     if (ch == '"')
1466     {
1467         lex->flags.nodigraphs = true;
1468         if (lex->flags.preprocessing)
1469             lex_tokench(lex, ch);
1470         lex->tok.ttype = lex_finish_string(lex, '"');
1471         if (lex->flags.preprocessing)
1472             lex_tokench(lex, ch);
1473         while (!lex->flags.preprocessing && lex->tok.ttype == TOKEN_STRINGCONST)
1474         {
1475             /* Allow c style "string" "continuation" */
1476             ch = lex_skipwhite(lex, false);
1477             if (ch != '"') {
1478                 lex_ungetch(lex, ch);
1479                 break;
1480             }
1481
1482             lex->tok.ttype = lex_finish_string(lex, '"');
1483         }
1484         lex->flags.nodigraphs = false;
1485         lex_endtoken(lex);
1486         return lex->tok.ttype;
1487     }
1488
1489     if (ch == '\'')
1490     {
1491         /* we parse character constants like string,
1492          * but return TOKEN_CHARCONST, or a vector type if it fits...
1493          * Likewise actual unescaping has to be done by the parser.
1494          * The difference is we don't allow 'char' 'continuation'.
1495          */
1496         if (lex->flags.preprocessing)
1497             lex_tokench(lex, ch);
1498         lex->tok.ttype = lex_finish_string(lex, '\'');
1499         if (lex->flags.preprocessing)
1500             lex_tokench(lex, ch);
1501         lex_endtoken(lex);
1502
1503         lex->tok.ttype = TOKEN_CHARCONST;
1504
1505         /* It's a vector if we can successfully scan 3 floats */
1506         if (util_sscanf(lex->tok.value, " %f %f %f ",
1507                    &lex->tok.constval.v.x, &lex->tok.constval.v.y, &lex->tok.constval.v.z) == 3)
1508
1509         {
1510              lex->tok.ttype = TOKEN_VECTORCONST;
1511         }
1512         else
1513         {
1514             if (!lex->flags.preprocessing && strlen(lex->tok.value) > 1) {
1515                 utf8ch_t u8char;
1516                 /* check for a valid utf8 character */
1517                 if (!OPTS_FLAG(UTF8) || !utf8_to(&u8char, (const unsigned char *)lex->tok.value, 8)) {
1518                     if (lexwarn(lex, WARN_MULTIBYTE_CHARACTER,
1519                                 ( OPTS_FLAG(UTF8) ? "invalid multibyte character sequence `%s`"
1520                                                   : "multibyte character: `%s`" ),
1521                                 lex->tok.value))
1522                         return (lex->tok.ttype = TOKEN_ERROR);
1523                 }
1524                 else
1525                     lex->tok.constval.i = u8char;
1526             }
1527             else
1528                 lex->tok.constval.i = lex->tok.value[0];
1529         }
1530
1531         return lex->tok.ttype;
1532     }
1533
1534     if (util_isdigit(ch))
1535     {
1536         lex->tok.ttype = lex_finish_digit(lex, ch);
1537         lex_endtoken(lex);
1538         return lex->tok.ttype;
1539     }
1540
1541     if (lex->flags.preprocessing) {
1542         lex_tokench(lex, ch);
1543         lex_endtoken(lex);
1544         return (lex->tok.ttype = ch);
1545     }
1546
1547     lexerror(lex, "unknown token: `%c`", ch);
1548     return (lex->tok.ttype = TOKEN_ERROR);
1549 }