lexer.c

   1 #include <stdio.h>
   2 #include <stdlib.h>
   3 #include <string.h>
   4 #include <stdarg.h>
   5
   6 #include "gmqcc.h"
   7 #include "lexer.h"
   8
   9 MEM_VEC_FUNCTIONS(token, char, value)
  10
  11 void lexerror(lex_file *lex, const char *fmt, ...)
  12 {
  13         va_list ap;
  14
  15         if (lex)
  16                 printf("error %s:%lu: ", lex->name, (unsigned long)lex->sline);
  17         else
  18                 printf("error: ");
  19
  20         va_start(ap, fmt);
  21         vprintf(fmt, ap);
  22         va_end(ap);
  23
  24         printf("\n");
  25 }
  26
  27 void lexwarn(lex_file *lex, int warn, const char *fmt, ...)
  28 {
  29         va_list ap;
  30
  31         if (!OPTS_WARN(warn))
  32             return;
  33
  34         if (lex)
  35                 printf("warning %s:%lu: ", lex->name, (unsigned long)lex->sline);
  36         else
  37                 printf("warning: ");
  38
  39         va_start(ap, fmt);
  40         vprintf(fmt, ap);
  41         va_end(ap);
  42
  43         printf("\n");
  44 }
  45
  46 token* token_new()
  47 {
  48         token *tok = (token*)mem_a(sizeof(token));
  49         if (!tok)
  50                 return NULL;
  51         memset(tok, 0, sizeof(*tok));
  52         return tok;
  53 }
  54
  55 void token_delete(token *self)
  56 {
  57         if (self->next && self->next->prev == self)
  58                 self->next->prev = self->prev;
  59         if (self->prev && self->prev->next == self)
  60                 self->prev->next = self->next;
  61         MEM_VECTOR_CLEAR(self, value);
  62         mem_d(self);
  63 }
  64
  65 token* token_copy(const token *cp)
  66 {
  67         token* self = token_new();
  68         if (!self)
  69                 return NULL;
  70         /* copy the value */
  71         self->value_alloc = cp->value_count + 1;
  72         self->value_count = cp->value_count;
  73         self->value = (char*)mem_a(self->value_alloc);
  74         if (!self->value) {
  75                 mem_d(self);
  76                 return NULL;
  77         }
  78         memcpy(self->value, cp->value, cp->value_count);
  79         self->value[self->value_alloc-1] = 0;
  80
  81         /* rest */
  82         self->ctx = cp->ctx;
  83         self->ttype = cp->ttype;
  84         memcpy(&self->constval, &cp->constval, sizeof(self->constval));
  85         return self;
  86 }
  87
  88 void token_delete_all(token *t)
  89 {
  90         token *n;
  91
  92         do {
  93                 n = t->next;
  94                 token_delete(t);
  95                 t = n;
  96         } while(t);
  97 }
  98
  99 token* token_copy_all(const token *cp)
 100 {
 101         token *cur;
 102         token *out;
 103
 104         out = cur = token_copy(cp);
 105         if (!out)
 106                 return NULL;
 107
 108         while (cp->next) {
 109                 cp = cp->next;
 110                 cur->next = token_copy(cp);
 111                 if (!cur->next) {
 112                         token_delete_all(out);
 113                         return NULL;
 114                 }
 115                 cur->next->prev = cur;
 116                 cur = cur->next;
 117         }
 118
 119         return out;
 120 }
 121
 122 lex_file* lex_open(const char *file)
 123 {
 124         lex_file *lex;
 125         FILE *in = fopen(file, "rb");
 126
 127         if (!in) {
 128                 lexerror(NULL, "open failed: '%s'\n", file);
 129                 return NULL;
 130         }
 131
 132         lex = (lex_file*)mem_a(sizeof(*lex));
 133         if (!lex) {
 134                 fclose(in);
 135                 lexerror(NULL, "out of memory\n");
 136                 return NULL;
 137         }
 138
 139         memset(lex, 0, sizeof(*lex));
 140
 141         lex->file = in;
 142         lex->name = util_strdup(file);
 143         lex->line = 1; /* we start counting at 1 */
 144
 145         lex->peekpos = 0;
 146
 147         return lex;
 148 }
 149
 150 void lex_close(lex_file *lex)
 151 {
 152         if (lex->file)
 153                 fclose(lex->file);
 154         if (lex->tok)
 155                 token_delete(lex->tok);
 156         mem_d(lex->name);
 157         mem_d(lex);
 158 }
 159
 160 /* Get or put-back data
  161  * The following two functions do NOT understand what kind of data they
  162  * are working on.
  163  * They are merely wrapping get/put in order to count line numbers.
 164  */
 165 static int lex_getch(lex_file *lex)
 166 {
 167         int ch;
 168
 169         if (lex->peekpos) {
 170                 lex->peekpos--;
 171                 if (lex->peek[lex->peekpos] == '\n')
 172                         lex->line++;
 173                 return lex->peek[lex->peekpos];
 174         }
 175
 176         ch = fgetc(lex->file);
 177         if (ch == '\n')
 178                 lex->line++;
 179         return ch;
 180 }
 181
 182 static void lex_ungetch(lex_file *lex, int ch)
 183 {
 184         lex->peek[lex->peekpos++] = ch;
 185         if (ch == '\n')
 186                 lex->line--;
 187 }
 188
 189 /* classify characters
 190  * some additions to the is*() functions of ctype.h
 191  */
 192
 193 /* Idents are alphanumberic, but they start with alpha or _ */
 194 static bool isident_start(int ch)
 195 {
 196         return isalpha(ch) || ch == '_';
 197 }
 198
 199 static bool isident(int ch)
 200 {
 201         return isident_start(ch) || isdigit(ch);
 202 }
 203
 204 /* isxdigit_only is used when we already know it's not a digit
 205  * and want to see if it's a hex digit anyway.
 206  */
 207 static bool isxdigit_only(int ch)
 208 {
 209         return (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
 210 }
 211
 212 /* Skip whitespace and comments and return the first
 213  * non-white character.
 214  * As this makes use of the above getch() ungetch() functions,
 215  * we don't need to care at all about line numbering anymore.
 216  *
 217  * In theory, this function should only be used at the beginning
 218  * of lexing, or when we *know* the next character is part of the token.
 219  * Otherwise, if the parser throws an error, the linenumber may not be
 220  * the line of the error, but the line of the next token AFTER the error.
 221  *
 222  * This is currently only problematic when using c-like string-continuation,
 223  * since comments and whitespaces are allowed between 2 such strings.
 224  * Example:
 225 printf(   "line one\n"
 226 // A comment
 227           "A continuation of the previous string"
 228 // This line is skipped
 229       , foo);
 230
  231  * In this case, if the parser decides it didn't actually want a string,
 232  * and uses lex->line to print an error, it will show the ', foo);' line's
 233  * linenumber.
 234  *
 235  * On the other hand, the parser is supposed to remember the line of the next
 236  * token's beginning. In this case we would want skipwhite() to be called
 237  * AFTER reading a token, so that the parser, before reading the NEXT token,
  238  * doesn't store the *comment's* linenumber, but the actual token's linenumber.
 239  *
 240  * THIS SOLUTION
 241  *    here is to store the line of the first character after skipping
 242  *    the initial whitespace in lex->sline, this happens in lex_do.
 243  */
 244 static int lex_skipwhite(lex_file *lex)
 245 {
 246         int ch = 0;
 247
 248         do
 249         {
 250                 ch = lex_getch(lex);
 251                 while (ch != EOF && isspace(ch)) ch = lex_getch(lex);
 252
 253                 if (ch == '/') {
 254                         ch = lex_getch(lex);
 255                         if (ch == '/')
 256                         {
 257                                 /* one line comment */
 258                                 ch = lex_getch(lex);
 259
 260                                 /* check for special: '/', '/', '*', '/' */
 261                                 if (ch == '*') {
 262                                         ch = lex_getch(lex);
 263                                         if (ch == '/') {
 264                                                 ch = ' ';
 265                                                 continue;
 266                                         }
 267                                 }
 268
 269                                 while (ch != EOF && ch != '\n') {
 270                                         ch = lex_getch(lex);
 271                                 }
 272                                 continue;
 273                         }
 274                         if (ch == '*')
 275                         {
 276                                 /* multiline comment */
 277                                 while (ch != EOF)
 278                                 {
 279                                         ch = lex_getch(lex);
 280                                         if (ch == '*') {
 281                                                 ch = lex_getch(lex);
 282                                                 if (ch == '/') {
 283                                                         ch = lex_getch(lex);
 284                                                         break;
 285                                                 }
 286                                         }
 287                                 }
 288                                 if (ch == '/') /* allow *//* direct following comment */
 289                                 {
 290                                         lex_ungetch(lex, ch);
 291                                         ch = ' '; /* cause TRUE in the isspace check */
 292                                 }
 293                                 continue;
 294                         }
 295                         /* Otherwise roll back to the slash and break out of the loop */
 296                         lex_ungetch(lex, ch);
 297                         ch = '/';
 298                         break;
 299                 }
 300         } while (ch != EOF && isspace(ch));
 301
 302         return ch;
 303 }
 304
 305 /* Append a character to the token buffer */
 306 static bool GMQCC_WARN lex_tokench(lex_file *lex, int ch)
 307 {
 308         if (!token_value_add(lex->tok, ch)) {
 309                 lexerror(lex, "out of memory");
 310                 return false;
 311         }
 312         return true;
 313 }
 314
 315 /* Append a trailing null-byte */
 316 static bool GMQCC_WARN lex_endtoken(lex_file *lex)
 317 {
 318         if (!token_value_add(lex->tok, 0)) {
 319                 lexerror(lex, "out of memory");
 320                 return false;
 321         }
 322         lex->tok->value_count--;
 323         return true;
 324 }
 325
 326 /* Get a token */
 327 static bool GMQCC_WARN lex_finish_ident(lex_file *lex)
 328 {
 329         int ch;
 330
 331         ch = lex_getch(lex);
 332         while (ch != EOF && isident(ch))
 333         {
 334                 if (!lex_tokench(lex, ch))
 335                         return (lex->tok->ttype = TOKEN_FATAL);
 336                 ch = lex_getch(lex);
 337         }
 338
 339         /* last ch was not an ident ch: */
 340         lex_ungetch(lex, ch);
 341
 342         return true;
 343 }
 344
 345 static int GMQCC_WARN lex_finish_string(lex_file *lex, int quote)
 346 {
 347         int ch = 0;
 348
 349         while (ch != EOF)
 350         {
 351                 ch = lex_getch(lex);
 352                 if (ch == quote)
 353                         return TOKEN_STRINGCONST;
 354
 355                 if (ch == '\\') {
 356                         ch = lex_getch(lex);
 357                         if (ch == EOF) {
 358                                 lexerror(lex, "unexpected end of file");
 359                                 lex_ungetch(lex, EOF); /* next token to be TOKEN_EOF */
 360                                 return (lex->tok->ttype = TOKEN_ERROR);
 361                         }
 362
 363             switch (ch) {
 364             case '\\': break;
 365             case 'a':  ch = '\a'; break;
 366             case 'b':  ch = '\b'; break;
 367             case 'r':  ch = '\r'; break;
 368             case 'n':  ch = '\n'; break;
 369             case 't':  ch = '\t'; break;
 370             case 'f':  ch = '\f'; break;
 371             case 'v':  ch = '\v'; break;
 372             default:
 373                 lexwarn(lex, WARN_UNKNOWN_CONTROL_SEQUENCE, "unrecognized control sequence: \\%c", ch);
 374                             /* so we just add the character plus backslash no matter what it actually is */
 375                             if (!lex_tokench(lex, '\\'))
 376                                     return (lex->tok->ttype = TOKEN_FATAL);
 377             }
 378             /* add the character finally */
 379                         if (!lex_tokench(lex, ch))
 380                                 return (lex->tok->ttype = TOKEN_FATAL);
 381                 }
 382                 else if (!lex_tokench(lex, ch))
 383                         return (lex->tok->ttype = TOKEN_FATAL);
 384         }
 385         lexerror(lex, "unexpected end of file within string constant");
 386         lex_ungetch(lex, EOF); /* next token to be TOKEN_EOF */
 387         return (lex->tok->ttype = TOKEN_ERROR);
 388 }
 389
 390 static int GMQCC_WARN lex_finish_digit(lex_file *lex, int lastch)
 391 {
 392         bool ishex = false;
 393
 394         int  ch = lastch;
 395
 396         /* parse a number... */
 397         lex->tok->ttype = TOKEN_INTCONST;
 398
 399         if (!lex_tokench(lex, ch))
 400                 return (lex->tok->ttype = TOKEN_FATAL);
 401
 402         ch = lex_getch(lex);
 403         if (ch != '.' && !isdigit(ch))
 404         {
 405                 if (lastch != '0' || ch != 'x')
 406                 {
 407                         /* end of the number or EOF */
 408                         lex_ungetch(lex, ch);
 409                         if (!lex_endtoken(lex))
 410                                 return (lex->tok->ttype = TOKEN_FATAL);
 411
 412                         lex->tok->constval.i = lastch - '0';
 413                         return lex->tok->ttype;
 414                 }
 415
 416                 ishex = true;
 417         }
 418
 419         /* EOF would have been caught above */
 420
 421         if (ch != '.')
 422         {
 423                 if (!lex_tokench(lex, ch))
 424                         return (lex->tok->ttype = TOKEN_FATAL);
 425                 ch = lex_getch(lex);
 426                 while (isdigit(ch) || (ishex && isxdigit_only(ch)))
 427                 {
 428                         if (!lex_tokench(lex, ch))
 429                                 return (lex->tok->ttype = TOKEN_FATAL);
 430                         ch = lex_getch(lex);
 431                 }
 432         }
 433         /* NOT else, '.' can come from above as well */
 434         if (ch == '.' && !ishex)
 435         {
 436                 /* Allow floating comma in non-hex mode */
 437                 lex->tok->ttype = TOKEN_FLOATCONST;
 438                 if (!lex_tokench(lex, ch))
 439                         return (lex->tok->ttype = TOKEN_FATAL);
 440
 441                 /* continue digits-only */
 442                 ch = lex_getch(lex);
 443                 while (isdigit(ch))
 444                 {
 445                         if (!lex_tokench(lex, ch))
 446                                 return (lex->tok->ttype = TOKEN_FATAL);
 447                         ch = lex_getch(lex);
 448                 }
 449         }
 450         /* put back the last character */
 451         /* but do not put back the trailing 'f' or a float */
 452         if (lex->tok->ttype == TOKEN_FLOATCONST && ch == 'f')
 453                 ch = lex_getch(lex);
 454
 455         /* generally we don't want words to follow numbers: */
 456         if (isident(ch)) {
 457                 lexerror(lex, "unexpected trailing characters after number");
 458                 return (lex->tok->ttype = TOKEN_ERROR);
 459         }
 460         lex_ungetch(lex, ch);
 461
 462         if (!lex_endtoken(lex))
 463                 return (lex->tok->ttype = TOKEN_FATAL);
 464         if (lex->tok->ttype == TOKEN_FLOATCONST)
 465                 lex->tok->constval.f = strtod(lex->tok->value, NULL);
 466         else
 467                 lex->tok->constval.i = strtol(lex->tok->value, NULL, 0);
 468         return lex->tok->ttype;
 469 }
 470
 471 int lex_do(lex_file *lex)
 472 {
 473         int ch, nextch;
 474
 475         if (lex->tok)
 476                 token_delete(lex->tok);
 477         lex->tok = token_new();
 478         if (!lex->tok)
 479                 return TOKEN_FATAL;
 480
 481         ch = lex_skipwhite(lex);
 482         lex->sline = lex->line;
 483         lex->tok->ctx.line = lex->sline;
 484         lex->tok->ctx.file = lex->name;
 485
 486         if (ch == EOF)
 487                 return (lex->tok->ttype = TOKEN_EOF);
 488
 489         /* single-character tokens */
 490         switch (ch)
 491         {
 492                 case ';':
 493                 case '(':
 494                 case ')':
 495                 case '{':
 496                 case '}':
 497                 case '[':
 498                 case ']':
 499
 500                 case '#':
 501
 502                         return (lex->tok->ttype = ch);
 503                 default:
 504                         break;
 505         }
 506
 507         if (lex->flags.noops)
 508         {
 509                 /* Detect characters early which are normally
 510                  * operators OR PART of an operator.
 511                  */
 512                 switch (ch)
 513                 {
 514                         case '+':
 515                         case '-':
 516                         case '*':
 517                         case '/':
 518                         case '<':
 519                         case '>':
 520                         case '=':
 521                         case '&':
 522                         case '|':
 523                         case '^':
 524                         case '~':
 525                         case ',':
 526                                 return ch;
 527                         default:
 528                                 break;
 529                 }
 530         }
 531
 532         if (ch == ',') {
 533             if (!lex_tokench(lex, ch) ||
 534                 !lex_endtoken(lex))
 535             {
 536                 return (lex->tok->ttype = TOKEN_FATAL);
 537             }
 538             return (lex->tok->ttype = TOKEN_OPERATOR);
 539         }
 540
 541         if (ch == '+' || ch == '-' || /* ++, --, +=, -=  and -> as well! */
 542             ch == '>' || ch == '<' || /* <<, >>, <=, >= */
 543             ch == '=' ||              /* == */
 544             ch == '&' || ch == '|')   /* &&, ||, &=, |= */
 545         {
 546                 if (!lex_tokench(lex, ch))
 547                         return (lex->tok->ttype = TOKEN_FATAL);
 548
 549                 nextch = lex_getch(lex);
 550                 if (nextch == ch || nextch == '=') {
 551                         if (!lex_tokench(lex, nextch))
 552                                 return (lex->tok->ttype = TOKEN_FATAL);
 553                 } else if (ch == '-' && nextch == '>') {
 554                         if (!lex_tokench(lex, nextch))
 555                                 return (lex->tok->ttype = TOKEN_FATAL);
 556                 } else
 557                         lex_ungetch(lex, nextch);
 558
 559                 if (!lex_endtoken(lex))
 560                         return (lex->tok->ttype = TOKEN_FATAL);
 561                 return (lex->tok->ttype = TOKEN_OPERATOR);
 562         }
 563
 564         if (ch == '^' || ch == '~' || ch == '!')
 565         {
 566                 if (!lex_tokench(lex, ch) ||
 567                         !lex_endtoken(lex))
 568                 {
 569                         return (lex->tok->ttype = TOKEN_FATAL);
 570                 }
 571                 return (lex->tok->ttype = TOKEN_OPERATOR);
 572         }
 573
 574         if (ch == '*' || ch == '/') /* *=, /= */
 575         {
 576                 if (!lex_tokench(lex, ch))
 577                         return (lex->tok->ttype = TOKEN_FATAL);
 578
 579                 nextch = lex_getch(lex);
 580                 if (nextch == '=') {
 581                         if (!lex_tokench(lex, nextch))
 582                                 return (lex->tok->ttype = TOKEN_FATAL);
 583                 } else
 584                         lex_ungetch(lex, nextch);
 585
 586                 if (!lex_endtoken(lex))
 587                         return (lex->tok->ttype = TOKEN_FATAL);
 588                 return (lex->tok->ttype = TOKEN_OPERATOR);
 589         }
 590
 591         if (isident_start(ch))
 592         {
 593                 const char *v;
 594                 if (!lex_tokench(lex, ch))
 595                         return (lex->tok->ttype = TOKEN_FATAL);
 596                 if (!lex_finish_ident(lex)) {
 597                         /* error? */
 598                         return (lex->tok->ttype = TOKEN_ERROR);
 599                 }
 600                 if (!lex_endtoken(lex))
 601                         return (lex->tok->ttype = TOKEN_FATAL);
 602                 lex->tok->ttype = TOKEN_IDENT;
 603
 604                 v = lex->tok->value;
 605                 if (!strcmp(v, "void")) {
 606                         lex->tok->ttype = TOKEN_TYPENAME;
 607                     lex->tok->constval.t = TYPE_VOID;
 608                 } else if (!strcmp(v, "int")) {
 609                         lex->tok->ttype = TOKEN_TYPENAME;
 610                     lex->tok->constval.t = TYPE_INTEGER;
 611                 } else if (!strcmp(v, "float")) {
 612                         lex->tok->ttype = TOKEN_TYPENAME;
 613                     lex->tok->constval.t = TYPE_FLOAT;
 614                 } else if (!strcmp(v, "string")) {
 615                         lex->tok->ttype = TOKEN_TYPENAME;
 616                     lex->tok->constval.t = TYPE_STRING;
 617                 } else if (!strcmp(v, "entity")) {
 618                         lex->tok->ttype = TOKEN_TYPENAME;
 619                     lex->tok->constval.t = TYPE_ENTITY;
 620                 } else if (!strcmp(v, "vector")) {
 621                         lex->tok->ttype = TOKEN_TYPENAME;
 622                     lex->tok->constval.t = TYPE_VECTOR;
 623                 } else if (!strcmp(v, "for")  ||
 624                          !strcmp(v, "while")  ||
 625                          !strcmp(v, "do")     ||
 626                          !strcmp(v, "if")     ||
 627                          !strcmp(v, "else")   ||
 628                          !strcmp(v, "var")    ||
 629                          !strcmp(v, "return") ||
 630                          !strcmp(v, "const"))
 631                         lex->tok->ttype = TOKEN_KEYWORD;
 632
 633                 return lex->tok->ttype;
 634         }
 635
 636         if (ch == '"')
 637         {
 638                 lex->tok->ttype = lex_finish_string(lex, '"');
 639                 while (lex->tok->ttype == TOKEN_STRINGCONST)
 640                 {
 641                         /* Allow c style "string" "continuation" */
 642                         ch = lex_skipwhite(lex);
 643                         if (ch != '"') {
 644                                 lex_ungetch(lex, ch);
 645                                 break;
 646                         }
 647
 648                         lex->tok->ttype = lex_finish_string(lex, '"');
 649                 }
 650                 if (!lex_endtoken(lex))
 651                         return (lex->tok->ttype = TOKEN_FATAL);
 652                 return lex->tok->ttype;
 653         }
 654
 655         if (ch == '\'')
 656         {
 657                 /* we parse character constants like string,
 658                  * but return TOKEN_CHARCONST, or a vector type if it fits...
 659                  * Likewise actual unescaping has to be done by the parser.
 660                  * The difference is we don't allow 'char' 'continuation'.
 661                  */
 662                  lex->tok->ttype = lex_finish_string(lex, '\'');
 663                  if (!lex_endtoken(lex))
 664                          return (lex->tok->ttype = TOKEN_FATAL);
 665
 666                  /* It's a vector if we can successfully scan 3 floats */
 667                  if (sscanf(lex->tok->value, " %f %f %f ", &lex->tok->constval.v.x, &lex->tok->constval.v.y, &lex->tok->constval.v.z) == 3)
 668                  {
 669                          lex->tok->ttype = TOKEN_VECTORCONST;
 670                  }
 671
 672                  return lex->tok->ttype;
 673         }
 674
 675         if (isdigit(ch))
 676         {
 677                 lex->tok->ttype = lex_finish_digit(lex, ch);
 678                 if (!lex_endtoken(lex))
 679                         return (lex->tok->ttype = TOKEN_FATAL);
 680                 return lex->tok->ttype;
 681         }
 682
 683         lexerror(lex, "unknown token");
 684         return (lex->tok->ttype = TOKEN_ERROR);
 685 }