]> git.xonotic.org Git - xonotic/gmqcc.git/blob - lexer.c
Reorganizing expression parsing to allow prefix-operators like unary minus
[xonotic/gmqcc.git] / lexer.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <stdarg.h>
5
6 #include "gmqcc.h"
7 #include "lexer.h"
8
/* Instantiate the vector helper functions for the `value` member (element
 * type char) of `token`.  Presumably this is where token_value_add(), used
 * by the lexer below, comes from - confirm against the macro definition.
 */
9 MEM_VEC_FUNCTIONS(token, char, value)
10
11 void lexerror(lex_file *lex, const char *fmt, ...)
12 {
13         va_list ap;
14
15         if (lex)
16                 printf("error %s:%lu: ", lex->name, (unsigned long)lex->sline);
17         else
18                 printf("error: ");
19
20         va_start(ap, fmt);
21         vprintf(fmt, ap);
22         va_end(ap);
23
24         printf("\n");
25 }
26
27 void lexwarn(lex_file *lex, int warn, const char *fmt, ...)
28 {
29         va_list ap;
30
31         if (!OPTS_WARN(warn))
32             return;
33
34         if (lex)
35                 printf("warning %s:%lu: ", lex->name, (unsigned long)lex->sline);
36         else
37                 printf("warning: ");
38
39         va_start(ap, fmt);
40         vprintf(fmt, ap);
41         va_end(ap);
42
43         printf("\n");
44 }
45
46 token* token_new()
47 {
48         token *tok = (token*)mem_a(sizeof(token));
49         if (!tok)
50                 return NULL;
51         memset(tok, 0, sizeof(*tok));
52         return tok;
53 }
54
55 void token_delete(token *self)
56 {
57         if (self->next && self->next->prev == self)
58                 self->next->prev = self->prev;
59         if (self->prev && self->prev->next == self)
60                 self->prev->next = self->next;
61         MEM_VECTOR_CLEAR(self, value);
62         mem_d(self);
63 }
64
65 token* token_copy(const token *cp)
66 {
67         token* self = token_new();
68         if (!self)
69                 return NULL;
70         /* copy the value */
71         self->value_alloc = cp->value_count + 1;
72         self->value_count = cp->value_count;
73         self->value = (char*)mem_a(self->value_alloc);
74         if (!self->value) {
75                 mem_d(self);
76                 return NULL;
77         }
78         memcpy(self->value, cp->value, cp->value_count);
79         self->value[self->value_alloc-1] = 0;
80
81         /* rest */
82         self->ctx = cp->ctx;
83         self->ttype = cp->ttype;
84         memcpy(&self->constval, &cp->constval, sizeof(self->constval));
85         return self;
86 }
87
88 void token_delete_all(token *t)
89 {
90         token *n;
91
92         do {
93                 n = t->next;
94                 token_delete(t);
95                 t = n;
96         } while(t);
97 }
98
99 token* token_copy_all(const token *cp)
100 {
101         token *cur;
102         token *out;
103
104         out = cur = token_copy(cp);
105         if (!out)
106                 return NULL;
107
108         while (cp->next) {
109                 cp = cp->next;
110                 cur->next = token_copy(cp);
111                 if (!cur->next) {
112                         token_delete_all(out);
113                         return NULL;
114                 }
115                 cur->next->prev = cur;
116                 cur = cur->next;
117         }
118
119         return out;
120 }
121
122 lex_file* lex_open(const char *file)
123 {
124         lex_file *lex;
125         FILE *in = util_fopen(file, "rb");
126
127         if (!in) {
128                 lexerror(NULL, "open failed: '%s'\n", file);
129                 return NULL;
130         }
131
132         lex = (lex_file*)mem_a(sizeof(*lex));
133         if (!lex) {
134                 fclose(in);
135                 lexerror(NULL, "out of memory\n");
136                 return NULL;
137         }
138
139         memset(lex, 0, sizeof(*lex));
140
141         lex->file = in;
142         lex->name = util_strdup(file);
143         lex->line = 1; /* we start counting at 1 */
144
145         lex->peekpos = 0;
146
147         return lex;
148 }
149
150 void lex_close(lex_file *lex)
151 {
152         if (lex->file)
153                 fclose(lex->file);
154         if (lex->tok)
155                 token_delete(lex->tok);
156         mem_d(lex->name);
157         mem_d(lex);
158 }
159
160 /* Get or put-back data
161  * The following to functions do NOT understand what kind of data they
162  * are working on.
163  * The are merely wrapping get/put in order to count line numbers.
164  */
165 static int lex_getch(lex_file *lex)
166 {
167         int ch;
168
169         if (lex->peekpos) {
170                 lex->peekpos--;
171                 if (lex->peek[lex->peekpos] == '\n')
172                         lex->line++;
173                 return lex->peek[lex->peekpos];
174         }
175
176         ch = fgetc(lex->file);
177         if (ch == '\n')
178                 lex->line++;
179         return ch;
180 }
181
182 static void lex_ungetch(lex_file *lex, int ch)
183 {
184         lex->peek[lex->peekpos++] = ch;
185         if (ch == '\n')
186                 lex->line--;
187 }
188
189 /* classify characters
190  * some additions to the is*() functions of ctype.h
191  */
192
/* Idents are alphanumeric, but they start with a letter or '_' */
static bool isident_start(int ch)
{
    if (ch == '_')
        return true;
    return isalpha(ch) != 0;
}
198
/* True for every character that may appear inside an identifier:
 * anything isident_start accepts, plus decimal digits.
 */
static bool isident(int ch)
{
    if (isdigit(ch))
        return true;
    return isident_start(ch);
}
203
/* isxdigit_only is used when we already know the character is not a
 * decimal digit and want to see whether it is a hex digit anyway,
 * i.e. one of a-f or A-F.
 */
static bool isxdigit_only(int ch)
{
    switch (ch)
    {
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
            return true;
        default:
            return false;
    }
}
211
212 /* Skip whitespace and comments and return the first
213  * non-white character.
214  * As this makes use of the above getch() ungetch() functions,
215  * we don't need to care at all about line numbering anymore.
216  *
217  * In theory, this function should only be used at the beginning
218  * of lexing, or when we *know* the next character is part of the token.
219  * Otherwise, if the parser throws an error, the linenumber may not be
220  * the line of the error, but the line of the next token AFTER the error.
221  *
222  * This is currently only problematic when using c-like string-continuation,
223  * since comments and whitespaces are allowed between 2 such strings.
224  * Example:
225 printf(   "line one\n"
226 // A comment
227           "A continuation of the previous string"
228 // This line is skipped
229       , foo);
230
231  * In this case, if the parse decides it didn't actually want a string,
232  * and uses lex->line to print an error, it will show the ', foo);' line's
233  * linenumber.
234  *
235  * On the other hand, the parser is supposed to remember the line of the next
236  * token's beginning. In this case we would want skipwhite() to be called
237  * AFTER reading a token, so that the parser, before reading the NEXT token,
238  * doesn't store teh *comment's* linenumber, but the actual token's linenumber.
239  *
240  * THIS SOLUTION
241  *    here is to store the line of the first character after skipping
242  *    the initial whitespace in lex->sline, this happens in lex_do.
243  */
244 static int lex_skipwhite(lex_file *lex)
245 {
246         int ch = 0;
247
248         do
249         {
250                 ch = lex_getch(lex);
251                 while (ch != EOF && isspace(ch)) ch = lex_getch(lex);
252
253                 if (ch == '/') {
254                         ch = lex_getch(lex);
255                         if (ch == '/')
256                         {
257                                 /* one line comment */
258                                 ch = lex_getch(lex);
259
260                                 /* check for special: '/', '/', '*', '/' */
261                                 if (ch == '*') {
262                                         ch = lex_getch(lex);
263                                         if (ch == '/') {
264                                                 ch = ' ';
265                                                 continue;
266                                         }
267                                 }
268
269                                 while (ch != EOF && ch != '\n') {
270                                         ch = lex_getch(lex);
271                                 }
272                                 continue;
273                         }
274                         if (ch == '*')
275                         {
276                                 /* multiline comment */
277                                 while (ch != EOF)
278                                 {
279                                         ch = lex_getch(lex);
280                                         if (ch == '*') {
281                                                 ch = lex_getch(lex);
282                                                 if (ch == '/') {
283                                                         ch = lex_getch(lex);
284                                                         break;
285                                                 }
286                                         }
287                                 }
288                                 if (ch == '/') /* allow *//* direct following comment */
289                                 {
290                                         lex_ungetch(lex, ch);
291                                         ch = ' '; /* cause TRUE in the isspace check */
292                                 }
293                                 continue;
294                         }
295                         /* Otherwise roll back to the slash and break out of the loop */
296                         lex_ungetch(lex, ch);
297                         ch = '/';
298                         break;
299                 }
300         } while (ch != EOF && isspace(ch));
301
302         return ch;
303 }
304
305 /* Append a character to the token buffer */
306 static bool GMQCC_WARN lex_tokench(lex_file *lex, int ch)
307 {
308         if (!token_value_add(lex->tok, ch)) {
309                 lexerror(lex, "out of memory");
310                 return false;
311         }
312         return true;
313 }
314
315 /* Append a trailing null-byte */
316 static bool GMQCC_WARN lex_endtoken(lex_file *lex)
317 {
318         if (!token_value_add(lex->tok, 0)) {
319                 lexerror(lex, "out of memory");
320                 return false;
321         }
322         lex->tok->value_count--;
323         return true;
324 }
325
326 /* Get a token */
327 static bool GMQCC_WARN lex_finish_ident(lex_file *lex)
328 {
329         int ch;
330
331         ch = lex_getch(lex);
332         while (ch != EOF && isident(ch))
333         {
334                 if (!lex_tokench(lex, ch))
335                         return (lex->tok->ttype = TOKEN_FATAL);
336                 ch = lex_getch(lex);
337         }
338
339         /* last ch was not an ident ch: */
340         lex_ungetch(lex, ch);
341
342         return true;
343 }
344
345 static int GMQCC_WARN lex_finish_string(lex_file *lex, int quote)
346 {
347         int ch = 0;
348
349         while (ch != EOF)
350         {
351                 ch = lex_getch(lex);
352                 if (ch == quote)
353                         return TOKEN_STRINGCONST;
354
355                 if (ch == '\\') {
356                         ch = lex_getch(lex);
357                         if (ch == EOF) {
358                                 lexerror(lex, "unexpected end of file");
359                                 lex_ungetch(lex, EOF); /* next token to be TOKEN_EOF */
360                                 return (lex->tok->ttype = TOKEN_ERROR);
361                         }
362
363             switch (ch) {
364             case '\\': break;
365             case 'a':  ch = '\a'; break;
366             case 'b':  ch = '\b'; break;
367             case 'r':  ch = '\r'; break;
368             case 'n':  ch = '\n'; break;
369             case 't':  ch = '\t'; break;
370             case 'f':  ch = '\f'; break;
371             case 'v':  ch = '\v'; break;
372             default:
373                 lexwarn(lex, WARN_UNKNOWN_CONTROL_SEQUENCE, "unrecognized control sequence: \\%c", ch);
374                             /* so we just add the character plus backslash no matter what it actually is */
375                             if (!lex_tokench(lex, '\\'))
376                                     return (lex->tok->ttype = TOKEN_FATAL);
377             }
378             /* add the character finally */
379                         if (!lex_tokench(lex, ch))
380                                 return (lex->tok->ttype = TOKEN_FATAL);
381                 }
382                 else if (!lex_tokench(lex, ch))
383                         return (lex->tok->ttype = TOKEN_FATAL);
384         }
385         lexerror(lex, "unexpected end of file within string constant");
386         lex_ungetch(lex, EOF); /* next token to be TOKEN_EOF */
387         return (lex->tok->ttype = TOKEN_ERROR);
388 }
389
390 static int GMQCC_WARN lex_finish_digit(lex_file *lex, int lastch)
391 {
392         bool ishex = false;
393
394         int  ch = lastch;
395
396         /* parse a number... */
397         lex->tok->ttype = TOKEN_INTCONST;
398
399         if (!lex_tokench(lex, ch))
400                 return (lex->tok->ttype = TOKEN_FATAL);
401
402         ch = lex_getch(lex);
403         if (ch != '.' && !isdigit(ch))
404         {
405                 if (lastch != '0' || ch != 'x')
406                 {
407                         /* end of the number or EOF */
408                         lex_ungetch(lex, ch);
409                         if (!lex_endtoken(lex))
410                                 return (lex->tok->ttype = TOKEN_FATAL);
411
412                         lex->tok->constval.i = lastch - '0';
413                         return lex->tok->ttype;
414                 }
415
416                 ishex = true;
417         }
418
419         /* EOF would have been caught above */
420
421         if (ch != '.')
422         {
423                 if (!lex_tokench(lex, ch))
424                         return (lex->tok->ttype = TOKEN_FATAL);
425                 ch = lex_getch(lex);
426                 while (isdigit(ch) || (ishex && isxdigit_only(ch)))
427                 {
428                         if (!lex_tokench(lex, ch))
429                                 return (lex->tok->ttype = TOKEN_FATAL);
430                         ch = lex_getch(lex);
431                 }
432         }
433         /* NOT else, '.' can come from above as well */
434         if (ch == '.' && !ishex)
435         {
436                 /* Allow floating comma in non-hex mode */
437                 lex->tok->ttype = TOKEN_FLOATCONST;
438                 if (!lex_tokench(lex, ch))
439                         return (lex->tok->ttype = TOKEN_FATAL);
440
441                 /* continue digits-only */
442                 ch = lex_getch(lex);
443                 while (isdigit(ch))
444                 {
445                         if (!lex_tokench(lex, ch))
446                                 return (lex->tok->ttype = TOKEN_FATAL);
447                         ch = lex_getch(lex);
448                 }
449         }
450         /* put back the last character */
451         /* but do not put back the trailing 'f' or a float */
452         if (lex->tok->ttype == TOKEN_FLOATCONST && ch == 'f')
453                 ch = lex_getch(lex);
454
455         /* generally we don't want words to follow numbers: */
456         if (isident(ch)) {
457                 lexerror(lex, "unexpected trailing characters after number");
458                 return (lex->tok->ttype = TOKEN_ERROR);
459         }
460         lex_ungetch(lex, ch);
461
462         if (!lex_endtoken(lex))
463                 return (lex->tok->ttype = TOKEN_FATAL);
464         if (lex->tok->ttype == TOKEN_FLOATCONST)
465                 lex->tok->constval.f = strtod(lex->tok->value, NULL);
466         else
467                 lex->tok->constval.i = strtol(lex->tok->value, NULL, 0);
468         return lex->tok->ttype;
469 }
470
471 int lex_do(lex_file *lex)
472 {
473         int ch, nextch;
474
475         if (lex->tok)
476                 token_delete(lex->tok);
477         lex->tok = token_new();
478         if (!lex->tok)
479                 return TOKEN_FATAL;
480
481         ch = lex_skipwhite(lex);
482         lex->sline = lex->line;
483         lex->tok->ctx.line = lex->sline;
484         lex->tok->ctx.file = lex->name;
485
486         if (ch == EOF)
487                 return (lex->tok->ttype = TOKEN_EOF);
488
489         /* single-character tokens */
490         switch (ch)
491         {
492                 case ';':
493                 case '(':
494                 case ')':
495                 case '{':
496                 case '}':
497                 case '[':
498                 case ']':
499
500                 case '#':
501                 if (!lex_tokench(lex, ch) ||
502                     !lex_endtoken(lex))
503                 {
504                     return (lex->tok->ttype = TOKEN_FATAL);
505                 }
506                         return (lex->tok->ttype = ch);
507                 default:
508                         break;
509         }
510
511         if (lex->flags.noops)
512         {
513                 /* Detect characters early which are normally
514                  * operators OR PART of an operator.
515                  */
516                 switch (ch)
517                 {
518                         case '+':
519                         case '-':
520                         case '*':
521                         case '/':
522                         case '<':
523                         case '>':
524                         case '=':
525                         case '&':
526                         case '|':
527                         case '^':
528                         case '~':
529                         case ',':
530                     case '.':
531                     if (!lex_tokench(lex, ch) ||
532                         !lex_endtoken(lex))
533                     {
534                         return (lex->tok->ttype = TOKEN_FATAL);
535                     }
536                                 return (lex->tok->ttype = ch);
537                         default:
538                                 break;
539                 }
540         }
541
542         if (ch == ',' || ch == '.') {
543             if (!lex_tokench(lex, ch) ||
544                 !lex_endtoken(lex))
545             {
546                 return (lex->tok->ttype = TOKEN_FATAL);
547             }
548             return (lex->tok->ttype = TOKEN_OPERATOR);
549         }
550
551         if (ch == '+' || ch == '-' || /* ++, --, +=, -=  and -> as well! */
552             ch == '>' || ch == '<' || /* <<, >>, <=, >= */
553             ch == '=' ||              /* == */
554             ch == '&' || ch == '|')   /* &&, ||, &=, |= */
555         {
556                 if (!lex_tokench(lex, ch))
557                         return (lex->tok->ttype = TOKEN_FATAL);
558
559                 nextch = lex_getch(lex);
560                 if (nextch == ch || nextch == '=') {
561                         if (!lex_tokench(lex, nextch))
562                                 return (lex->tok->ttype = TOKEN_FATAL);
563                 } else if (ch == '-' && nextch == '>') {
564                         if (!lex_tokench(lex, nextch))
565                                 return (lex->tok->ttype = TOKEN_FATAL);
566                 } else
567                         lex_ungetch(lex, nextch);
568
569                 if (!lex_endtoken(lex))
570                         return (lex->tok->ttype = TOKEN_FATAL);
571                 return (lex->tok->ttype = TOKEN_OPERATOR);
572         }
573
574         if (ch == '^' || ch == '~' || ch == '!')
575         {
576                 if (!lex_tokench(lex, ch) ||
577                         !lex_endtoken(lex))
578                 {
579                         return (lex->tok->ttype = TOKEN_FATAL);
580                 }
581                 return (lex->tok->ttype = TOKEN_OPERATOR);
582         }
583
584         if (ch == '*' || ch == '/') /* *=, /= */
585         {
586                 if (!lex_tokench(lex, ch))
587                         return (lex->tok->ttype = TOKEN_FATAL);
588
589                 nextch = lex_getch(lex);
590                 if (nextch == '=') {
591                         if (!lex_tokench(lex, nextch))
592                                 return (lex->tok->ttype = TOKEN_FATAL);
593                 } else
594                         lex_ungetch(lex, nextch);
595
596                 if (!lex_endtoken(lex))
597                         return (lex->tok->ttype = TOKEN_FATAL);
598                 return (lex->tok->ttype = TOKEN_OPERATOR);
599         }
600
601         if (isident_start(ch))
602         {
603                 const char *v;
604                 if (!lex_tokench(lex, ch))
605                         return (lex->tok->ttype = TOKEN_FATAL);
606                 if (!lex_finish_ident(lex)) {
607                         /* error? */
608                         return (lex->tok->ttype = TOKEN_ERROR);
609                 }
610                 if (!lex_endtoken(lex))
611                         return (lex->tok->ttype = TOKEN_FATAL);
612                 lex->tok->ttype = TOKEN_IDENT;
613
614                 v = lex->tok->value;
615                 if (!strcmp(v, "void")) {
616                         lex->tok->ttype = TOKEN_TYPENAME;
617                     lex->tok->constval.t = TYPE_VOID;
618                 } else if (!strcmp(v, "int")) {
619                         lex->tok->ttype = TOKEN_TYPENAME;
620                     lex->tok->constval.t = TYPE_INTEGER;
621                 } else if (!strcmp(v, "float")) {
622                         lex->tok->ttype = TOKEN_TYPENAME;
623                     lex->tok->constval.t = TYPE_FLOAT;
624                 } else if (!strcmp(v, "string")) {
625                         lex->tok->ttype = TOKEN_TYPENAME;
626                     lex->tok->constval.t = TYPE_STRING;
627                 } else if (!strcmp(v, "entity")) {
628                         lex->tok->ttype = TOKEN_TYPENAME;
629                     lex->tok->constval.t = TYPE_ENTITY;
630                 } else if (!strcmp(v, "vector")) {
631                         lex->tok->ttype = TOKEN_TYPENAME;
632                     lex->tok->constval.t = TYPE_VECTOR;
633                 } else if (!strcmp(v, "for")  ||
634                          !strcmp(v, "while")  ||
635                          !strcmp(v, "do")     ||
636                          !strcmp(v, "if")     ||
637                          !strcmp(v, "else")   ||
638                          !strcmp(v, "var")    ||
639                          !strcmp(v, "local")  ||
640                          !strcmp(v, "return") ||
641                          !strcmp(v, "const"))
642                         lex->tok->ttype = TOKEN_KEYWORD;
643
644                 return lex->tok->ttype;
645         }
646
647         if (ch == '"')
648         {
649                 lex->tok->ttype = lex_finish_string(lex, '"');
650                 while (lex->tok->ttype == TOKEN_STRINGCONST)
651                 {
652                         /* Allow c style "string" "continuation" */
653                         ch = lex_skipwhite(lex);
654                         if (ch != '"') {
655                                 lex_ungetch(lex, ch);
656                                 break;
657                         }
658
659                         lex->tok->ttype = lex_finish_string(lex, '"');
660                 }
661                 if (!lex_endtoken(lex))
662                         return (lex->tok->ttype = TOKEN_FATAL);
663                 return lex->tok->ttype;
664         }
665
666         if (ch == '\'')
667         {
668                 /* we parse character constants like string,
669                  * but return TOKEN_CHARCONST, or a vector type if it fits...
670                  * Likewise actual unescaping has to be done by the parser.
671                  * The difference is we don't allow 'char' 'continuation'.
672                  */
673                  lex->tok->ttype = lex_finish_string(lex, '\'');
674                  if (!lex_endtoken(lex))
675                          return (lex->tok->ttype = TOKEN_FATAL);
676
677                  /* It's a vector if we can successfully scan 3 floats */
678 #ifdef WIN32
679                  if (sscanf_s(lex->tok->value, " %f %f %f ",
680                             &lex->tok->constval.v.x, &lex->tok->constval.v.y, &lex->tok->constval.v.z) == 3)
681 #else
682                  if (sscanf(lex->tok->value, " %f %f %f ",
683                             &lex->tok->constval.v.x, &lex->tok->constval.v.y, &lex->tok->constval.v.z) == 3)
684 #endif
685                  {
686                          lex->tok->ttype = TOKEN_VECTORCONST;
687                  }
688
689                  return lex->tok->ttype;
690         }
691
692         if (isdigit(ch))
693         {
694                 lex->tok->ttype = lex_finish_digit(lex, ch);
695                 if (!lex_endtoken(lex))
696                         return (lex->tok->ttype = TOKEN_FATAL);
697                 return lex->tok->ttype;
698         }
699
700         lexerror(lex, "unknown token");
701         return (lex->tok->ttype = TOKEN_ERROR);
702 }