lexer.c

   1 #include <stdio.h>
   2 #include <stdlib.h>
   3 #include <string.h>
   4 #include <stdarg.h>
   5
   6 #include "gmqcc.h"
   7 #include "lexer.h"
   8
   9 MEM_VEC_FUNCTIONS(token, char, value) /* NOTE(review): macro is defined outside this file; presumably it generates the token_value_* helpers (token_value_add is called below) for the char vector 'value' of token - confirm */
  10
  11 void lexerror(lex_file *lex, const char *fmt, ...)
  12 {
  13         va_list ap;
  14
  15         if (lex)
  16                 printf("error %s:%lu: ", lex->name, (unsigned long)lex->sline);
  17         else
  18                 printf("error: ");
  19
  20         va_start(ap, fmt);
  21         vprintf(fmt, ap);
  22         va_end(ap);
  23
  24         printf("\n");
  25 }
  26
  27 token* token_new()
  28 {
  29         token *tok = (token*)mem_a(sizeof(token));
  30         if (!tok)
  31                 return NULL;
  32         memset(tok, 0, sizeof(*tok));
  33         return tok;
  34 }
  35
  36 void token_delete(token *self)
  37 {
  38         if (self->next && self->next->prev == self)
  39                 self->next->prev = self->prev;
  40         if (self->prev && self->prev->next == self)
  41                 self->prev->next = self->next;
  42         MEM_VECTOR_CLEAR(self, value);
  43         mem_d(self);
  44 }
  45
  46 token* token_copy(const token *cp)
  47 {
  48         token* self = token_new();
  49         if (!self)
  50                 return NULL;
  51         /* copy the value */
  52         self->value_alloc = cp->value_count + 1;
  53         self->value_count = cp->value_count;
  54         self->value = (char*)mem_a(self->value_alloc);
  55         if (!self->value) {
  56                 mem_d(self);
  57                 return NULL;
  58         }
  59         memcpy(self->value, cp->value, cp->value_count);
  60         self->value[self->value_alloc-1] = 0;
  61
  62         /* rest */
  63         self->ctx = cp->ctx;
  64         self->ttype = cp->ttype;
  65         memcpy(&self->constval, &cp->constval, sizeof(self->constval));
  66         return self;
  67 }
  68
  69 void token_delete_all(token *t)
  70 {
  71         token *n;
  72
  73         do {
  74                 n = t->next;
  75                 token_delete(t);
  76                 t = n;
  77         } while(t);
  78 }
  79
  80 token* token_copy_all(const token *cp)
  81 {
  82         token *cur;
  83         token *out;
  84
  85         out = cur = token_copy(cp);
  86         if (!out)
  87                 return NULL;
  88
  89         while (cp->next) {
  90                 cp = cp->next;
  91                 cur->next = token_copy(cp);
  92                 if (!cur->next) {
  93                         token_delete_all(out);
  94                         return NULL;
  95                 }
  96                 cur->next->prev = cur;
  97                 cur = cur->next;
  98         }
  99
 100         return out;
 101 }
 102
 103 lex_file* lex_open(const char *file)
 104 {
 105         lex_file *lex;
 106         FILE *in = fopen(file, "rb");
 107
 108         if (!in) {
 109                 lexerror(NULL, "open failed: '%s'\n", file);
 110                 return NULL;
 111         }
 112
 113         lex = (lex_file*)mem_a(sizeof(*lex));
 114         if (!lex) {
 115                 fclose(in);
 116                 lexerror(NULL, "out of memory\n");
 117                 return NULL;
 118         }
 119
 120         memset(lex, 0, sizeof(*lex));
 121
 122         lex->file = in;
 123         lex->name = util_strdup(file);
 124         lex->line = 1; /* we start counting at 1 */
 125
 126         lex->peekpos = 0;
 127
 128         return lex;
 129 }
 130
 131 void lex_close(lex_file *lex)
 132 {
 133         if (lex->file)
 134                 fclose(lex->file);
 135         if (lex->tok)
 136                 token_delete(lex->tok);
 137         mem_d(lex->name);
 138         mem_d(lex);
 139 }
 140
 141 /* Get or put-back data
 142  * The following two functions do NOT understand what kind of data they
 143  * are working on.
 144  * They are merely wrapping get/put in order to count line numbers.
 145  */
 146 static int lex_getch(lex_file *lex)
 147 {
 148         int ch;
 149
 150         if (lex->peekpos) {
 151                 lex->peekpos--;
 152                 if (lex->peek[lex->peekpos] == '\n')
 153                         lex->line++;
 154                 return lex->peek[lex->peekpos];
 155         }
 156
 157         ch = fgetc(lex->file);
 158         if (ch == '\n')
 159                 lex->line++;
 160         return ch;
 161 }
 162
 163 static void lex_ungetch(lex_file *lex, int ch)
 164 {
 165         lex->peek[lex->peekpos++] = ch;
 166         if (ch == '\n')
 167                 lex->line--;
 168 }
 169
 170 /* classify characters
 171  * some additions to the is*() functions of ctype.h
 172  */
 173
 174 /* Idents are alphanumberic, but they start with alpha or _ */
 175 static bool isident_start(int ch)
 176 {
 177         return isalpha(ch) || ch == '_';
 178 }
 179
 180 static bool isident(int ch)
 181 {
 182         return isident_start(ch) || isdigit(ch);
 183 }
 184
 185 /* isxdigit_only is used when we already know it's not a digit
 186  * and want to see if it's a hex digit anyway.
 187  */
 188 static bool isxdigit_only(int ch)
 189 {
 190         return (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
 191 }
 192
 193 /* Skip whitespace and comments and return the first
 194  * non-white character.
 195  * As this makes use of the above getch() ungetch() functions,
 196  * we don't need to care at all about line numbering anymore.
 197  *
 198  * In theory, this function should only be used at the beginning
 199  * of lexing, or when we *know* the next character is part of the token.
 200  * Otherwise, if the parser throws an error, the linenumber may not be
 201  * the line of the error, but the line of the next token AFTER the error.
 202  *
 203  * This is currently only problematic when using c-like string-continuation,
 204  * since comments and whitespaces are allowed between 2 such strings.
 205  * Example:
 206 printf(   "line one\n"
 207 // A comment
 208           "A continuation of the previous string"
 209 // This line is skipped
 210       , foo);
 211
 212  * In this case, if the parser decides it didn't actually want a string,
 213  * and uses lex->line to print an error, it will show the ', foo);' line's
 214  * linenumber.
 215  *
 216  * On the other hand, the parser is supposed to remember the line of the next
 217  * token's beginning. In this case we would want skipwhite() to be called
 218  * AFTER reading a token, so that the parser, before reading the NEXT token,
 219  * doesn't store the *comment's* linenumber, but the actual token's linenumber.
 220  *
 221  * THIS SOLUTION
 222  *    here is to store the line of the first character after skipping
 223  *    the initial whitespace in lex->sline, this happens in lex_do.
 224  */
 225 static int lex_skipwhite(lex_file *lex)
 226 {
 227         int ch = 0;
 228
 229         do
 230         {
 231                 ch = lex_getch(lex);
 232                 while (ch != EOF && isspace(ch)) ch = lex_getch(lex);
 233
 234                 if (ch == '/') {
 235                         ch = lex_getch(lex);
 236                         if (ch == '/')
 237                         {
 238                                 /* one line comment */
 239                                 ch = lex_getch(lex);
 240
 241                                 /* check for special: '/', '/', '*', '/' */
 242                                 if (ch == '*') {
 243                                         ch = lex_getch(lex);
 244                                         if (ch == '/') {
 245                                                 ch = ' ';
 246                                                 continue;
 247                                         }
 248                                 }
 249
 250                                 while (ch != EOF && ch != '\n') {
 251                                         ch = lex_getch(lex);
 252                                 }
 253                                 continue;
 254                         }
 255                         if (ch == '*')
 256                         {
 257                                 /* multiline comment */
 258                                 while (ch != EOF)
 259                                 {
 260                                         ch = lex_getch(lex);
 261                                         if (ch == '*') {
 262                                                 ch = lex_getch(lex);
 263                                                 if (ch == '/') {
 264                                                         ch = lex_getch(lex);
 265                                                         break;
 266                                                 }
 267                                         }
 268                                 }
 269                                 if (ch == '/') /* allow *//* direct following comment */
 270                                 {
 271                                         lex_ungetch(lex, ch);
 272                                         ch = ' '; /* cause TRUE in the isspace check */
 273                                 }
 274                                 continue;
 275                         }
 276                         /* Otherwise roll back to the slash and break out of the loop */
 277                         lex_ungetch(lex, ch);
 278                         ch = '/';
 279                         break;
 280                 }
 281         } while (ch != EOF && isspace(ch));
 282
 283         return ch;
 284 }
 285
 286 /* Append a character to the token buffer */
 287 static bool GMQCC_WARN lex_tokench(lex_file *lex, int ch)
 288 {
 289         if (!token_value_add(lex->tok, ch)) {
 290                 lexerror(lex, "out of memory");
 291                 return false;
 292         }
 293         return true;
 294 }
 295
 296 /* Append a trailing null-byte */
 297 static bool GMQCC_WARN lex_endtoken(lex_file *lex)
 298 {
 299         if (!token_value_add(lex->tok, 0)) {
 300                 lexerror(lex, "out of memory");
 301                 return false;
 302         }
 303         lex->tok->value_count--;
 304         return true;
 305 }
 306
 307 /* Get a token */
 308 static bool GMQCC_WARN lex_finish_ident(lex_file *lex)
 309 {
 310         int ch;
 311
 312         ch = lex_getch(lex);
 313         while (ch != EOF && isident(ch))
 314         {
 315                 if (!lex_tokench(lex, ch))
 316                         return (lex->tok->ttype = TOKEN_FATAL);
 317                 ch = lex_getch(lex);
 318         }
 319
 320         /* last ch was not an ident ch: */
 321         lex_ungetch(lex, ch);
 322
 323         return true;
 324 }
 325
 326 static int GMQCC_WARN lex_finish_string(lex_file *lex, int quote)
 327 {
 328         int ch = 0;
 329
 330         while (ch != EOF)
 331         {
 332                 ch = lex_getch(lex);
 333                 if (ch == quote)
 334                         return TOKEN_STRINGCONST;
 335
 336                 if (!lex_tokench(lex, ch))
 337                         return (lex->tok->ttype = TOKEN_FATAL);
 338
 339                 /* as lexer we only care about \" to not terminate the string prematurely */
 340                 if (ch == '\\') {
 341                         ch = lex_getch(lex);
 342                         if (ch == EOF) {
 343                                 lexerror(lex, "unexpected end of file");
 344                                 lex_ungetch(lex, EOF); /* next token to be TOKEN_EOF */
 345                                 return (lex->tok->ttype = TOKEN_ERROR);
 346                         }
 347                         /* so we just add the next character no matter what it actually is */
 348                         if (!lex_tokench(lex, ch))
 349                                 return (lex->tok->ttype = TOKEN_FATAL);
 350                 }
 351         }
 352         lexerror(lex, "unexpected end of file within string constant");
 353         lex_ungetch(lex, EOF); /* next token to be TOKEN_EOF */
 354         return (lex->tok->ttype = TOKEN_ERROR);
 355 }
 356
 357 static int GMQCC_WARN lex_finish_digit(lex_file *lex, int lastch)
 358 {
 359         bool ishex = false;
 360
 361         int  ch = lastch;
 362
 363         /* parse a number... */
 364         lex->tok->ttype = TOKEN_INTCONST;
 365
 366         if (!lex_tokench(lex, ch))
 367                 return (lex->tok->ttype = TOKEN_FATAL);
 368
 369         ch = lex_getch(lex);
 370         if (ch != '.' && !isdigit(ch))
 371         {
 372                 if (lastch != '0' || ch != 'x')
 373                 {
 374                         /* end of the number or EOF */
 375                         lex_ungetch(lex, ch);
 376                         if (!lex_endtoken(lex))
 377                                 return (lex->tok->ttype = TOKEN_FATAL);
 378
 379                         lex->tok->constval.i = lastch - '0';
 380                         return lex->tok->ttype;
 381                 }
 382
 383                 ishex = true;
 384         }
 385
 386         /* EOF would have been caught above */
 387
 388         if (ch != '.')
 389         {
 390                 if (!lex_tokench(lex, ch))
 391                         return (lex->tok->ttype = TOKEN_FATAL);
 392                 ch = lex_getch(lex);
 393                 while (isdigit(ch) || (ishex && isxdigit_only(ch)))
 394                 {
 395                         if (!lex_tokench(lex, ch))
 396                                 return (lex->tok->ttype = TOKEN_FATAL);
 397                         ch = lex_getch(lex);
 398                 }
 399         }
 400         /* NOT else, '.' can come from above as well */
 401         if (ch == '.' && !ishex)
 402         {
 403                 /* Allow floating comma in non-hex mode */
 404                 lex->tok->ttype = TOKEN_FLOATCONST;
 405                 if (!lex_tokench(lex, ch))
 406                         return (lex->tok->ttype = TOKEN_FATAL);
 407
 408                 /* continue digits-only */
 409                 ch = lex_getch(lex);
 410                 while (isdigit(ch))
 411                 {
 412                         if (!lex_tokench(lex, ch))
 413                                 return (lex->tok->ttype = TOKEN_FATAL);
 414                         ch = lex_getch(lex);
 415                 }
 416         }
 417         /* put back the last character */
 418         /* but do not put back the trailing 'f' or a float */
 419         if (lex->tok->ttype == TOKEN_FLOATCONST && ch == 'f')
 420                 ch = lex_getch(lex);
 421
 422         /* generally we don't want words to follow numbers: */
 423         if (isident(ch)) {
 424                 lexerror(lex, "unexpected trailing characters after number");
 425                 return (lex->tok->ttype = TOKEN_ERROR);
 426         }
 427         lex_ungetch(lex, ch);
 428
 429         if (!lex_endtoken(lex))
 430                 return (lex->tok->ttype = TOKEN_FATAL);
 431         if (lex->tok->ttype == TOKEN_FLOATCONST)
 432                 lex->tok->constval.f = strtod(lex->tok->value, NULL);
 433         else
 434                 lex->tok->constval.i = strtol(lex->tok->value, NULL, 0);
 435         return lex->tok->ttype;
 436 }
 437
 438 int lex_do(lex_file *lex)
 439 {
 440         int ch, nextch;
 441
 442         if (lex->tok)
 443                 token_delete(lex->tok);
 444         lex->tok = token_new();
 445         if (!lex->tok)
 446                 return TOKEN_FATAL;
 447
 448         ch = lex_skipwhite(lex);
 449         lex->sline = lex->line;
 450         lex->tok->ctx.line = lex->sline;
 451         lex->tok->ctx.file = lex->name;
 452
 453         if (ch == EOF)
 454                 return (lex->tok->ttype = TOKEN_EOF);
 455
 456         /* single-character tokens */
 457         switch (ch)
 458         {
 459                 case ';':
 460                 case '(':
 461                 case ')':
 462                 case '{':
 463                 case '}':
 464                 case '[':
 465                 case ']':
 466
 467                 case '#':
 468
 469                         return (lex->tok->ttype = ch);
 470                 default:
 471                         break;
 472         }
 473
 474         if (lex->flags.noops)
 475         {
 476                 /* Detect characters early which are normally
 477                  * operators OR PART of an operator.
 478                  */
 479                 switch (ch)
 480                 {
 481                         case '+':
 482                         case '-':
 483                         case '*':
 484                         case '/':
 485                         case '<':
 486                         case '>':
 487                         case '=':
 488                         case '&':
 489                         case '|':
 490                         case '^':
 491                         case '~':
 492                         case ',':
 493                                 return ch;
 494                         default:
 495                                 break;
 496                 }
 497         }
 498
 499         if (ch == ',') {
 500             if (!lex_tokench(lex, ch) ||
 501                 !lex_endtoken(lex))
 502             {
 503                 return (lex->tok->ttype = TOKEN_FATAL);
 504             }
 505             return (lex->tok->ttype = TOKEN_OPERATOR);
 506         }
 507
 508         if (ch == '+' || ch == '-' || /* ++, --, +=, -=  and -> as well! */
 509             ch == '>' || ch == '<' || /* <<, >>, <=, >= */
 510             ch == '=' ||              /* == */
 511             ch == '&' || ch == '|')   /* &&, ||, &=, |= */
 512         {
 513                 if (!lex_tokench(lex, ch))
 514                         return (lex->tok->ttype = TOKEN_FATAL);
 515
 516                 nextch = lex_getch(lex);
 517                 if (nextch == ch || nextch == '=') {
 518                         if (!lex_tokench(lex, nextch))
 519                                 return (lex->tok->ttype = TOKEN_FATAL);
 520                 } else if (ch == '-' && nextch == '>') {
 521                         if (!lex_tokench(lex, nextch))
 522                                 return (lex->tok->ttype = TOKEN_FATAL);
 523                 } else
 524                         lex_ungetch(lex, nextch);
 525
 526                 if (!lex_endtoken(lex))
 527                         return (lex->tok->ttype = TOKEN_FATAL);
 528                 return (lex->tok->ttype = TOKEN_OPERATOR);
 529         }
 530
 531         if (ch == '^' || ch == '~' || ch == '!')
 532         {
 533                 if (!lex_tokench(lex, ch) ||
 534                         !lex_endtoken(lex))
 535                 {
 536                         return (lex->tok->ttype = TOKEN_FATAL);
 537                 }
 538                 return (lex->tok->ttype = TOKEN_OPERATOR);
 539         }
 540
 541         if (ch == '*' || ch == '/') /* *=, /= */
 542         {
 543                 if (!lex_tokench(lex, ch))
 544                         return (lex->tok->ttype = TOKEN_FATAL);
 545
 546                 nextch = lex_getch(lex);
 547                 if (nextch == '=') {
 548                         if (!lex_tokench(lex, nextch))
 549                                 return (lex->tok->ttype = TOKEN_FATAL);
 550                 } else
 551                         lex_ungetch(lex, nextch);
 552
 553                 if (!lex_endtoken(lex))
 554                         return (lex->tok->ttype = TOKEN_FATAL);
 555                 return (lex->tok->ttype = TOKEN_OPERATOR);
 556         }
 557
 558         if (isident_start(ch))
 559         {
 560                 const char *v;
 561                 if (!lex_tokench(lex, ch))
 562                         return (lex->tok->ttype = TOKEN_FATAL);
 563                 if (!lex_finish_ident(lex)) {
 564                         /* error? */
 565                         return (lex->tok->ttype = TOKEN_ERROR);
 566                 }
 567                 if (!lex_endtoken(lex))
 568                         return (lex->tok->ttype = TOKEN_FATAL);
 569                 lex->tok->ttype = TOKEN_IDENT;
 570
 571                 v = lex->tok->value;
 572                 if (!strcmp(v, "void")) {
 573                         lex->tok->ttype = TOKEN_TYPENAME;
 574                     lex->tok->constval.t = TYPE_VOID;
 575                 } else if (!strcmp(v, "int")) {
 576                         lex->tok->ttype = TOKEN_TYPENAME;
 577                     lex->tok->constval.t = TYPE_INTEGER;
 578                 } else if (!strcmp(v, "float")) {
 579                         lex->tok->ttype = TOKEN_TYPENAME;
 580                     lex->tok->constval.t = TYPE_FLOAT;
 581                 } else if (!strcmp(v, "string")) {
 582                         lex->tok->ttype = TOKEN_TYPENAME;
 583                     lex->tok->constval.t = TYPE_STRING;
 584                 } else if (!strcmp(v, "entity")) {
 585                         lex->tok->ttype = TOKEN_TYPENAME;
 586                     lex->tok->constval.t = TYPE_ENTITY;
 587                 } else if (!strcmp(v, "vector")) {
 588                         lex->tok->ttype = TOKEN_TYPENAME;
 589                     lex->tok->constval.t = TYPE_VECTOR;
 590                 } else if (!strcmp(v, "for")  ||
 591                          !strcmp(v, "while")  ||
 592                          !strcmp(v, "do")     ||
 593                          !strcmp(v, "var")    ||
 594                          !strcmp(v, "return") ||
 595                          !strcmp(v, "const"))
 596                         lex->tok->ttype = TOKEN_KEYWORD;
 597
 598                 return lex->tok->ttype;
 599         }
 600
 601         if (ch == '"')
 602         {
 603                 lex->tok->ttype = lex_finish_string(lex, '"');
 604                 while (lex->tok->ttype == TOKEN_STRINGCONST)
 605                 {
 606                         /* Allow c style "string" "continuation" */
 607                         ch = lex_skipwhite(lex);
 608                         if (ch != '"') {
 609                                 lex_ungetch(lex, ch);
 610                                 break;
 611                         }
 612
 613                         lex->tok->ttype = lex_finish_string(lex, '"');
 614                 }
 615                 if (!lex_endtoken(lex))
 616                         return (lex->tok->ttype = TOKEN_FATAL);
 617                 return lex->tok->ttype;
 618         }
 619
 620         if (ch == '\'')
 621         {
 622                 /* we parse character constants like string,
 623                  * but return TOKEN_CHARCONST, or a vector type if it fits...
 624                  * Likewise actual unescaping has to be done by the parser.
 625                  * The difference is we don't allow 'char' 'continuation'.
 626                  */
 627                  lex->tok->ttype = lex_finish_string(lex, '\'');
 628                  if (!lex_endtoken(lex))
 629                          return (lex->tok->ttype = TOKEN_FATAL);
 630
 631                  /* It's a vector if we can successfully scan 3 floats */
 632                  if (sscanf(lex->tok->value, " %f %f %f ", &lex->tok->constval.v.x, &lex->tok->constval.v.y, &lex->tok->constval.v.z) == 3)
 633                  {
 634                          lex->tok->ttype = TOKEN_VECTORCONST;
 635                  }
 636
 637                  return lex->tok->ttype;
 638         }
 639
 640         if (isdigit(ch))
 641         {
 642                 lex->tok->ttype = lex_finish_digit(lex, ch);
 643                 if (!lex_endtoken(lex))
 644                         return (lex->tok->ttype = TOKEN_FATAL);
 645                 return lex->tok->ttype;
 646         }
 647
 648         lexerror(lex, "unknown token");
 649         return (lex->tok->ttype = TOKEN_ERROR);
 650 }