/* Invokes the project macro MEM_VEC_FUNCTIONS for the token type, with
 * element type char and member name value.
 * NOTE(review): presumably this generates the token_value_* helpers
 * (token_value_add is called further down in this file) - confirm
 * against the macro definition.
 */
9 MEM_VEC_FUNCTIONS(token, char, value)
/* Report a lexer error; fmt is a printf-style format string followed by
 * its arguments.
 * The visible line prints an error prefix made of the file name
 * (lex->name) and lex->sline, which lex_do sets to the line on which the
 * current token starts.
 * NOTE(review): lex_open calls this function with lex == NULL; the guard
 * around the lex dereference is not visible in this excerpt - confirm
 * that it exists.
 */
11 void lexerror(lex_file *lex, const char *fmt, ...)
16 printf("error %s:%lu: ", lex->name, (unsigned long)lex->sline);
/* Report a lexer warning; fmt is a printf-style format string followed
 * by its arguments.
 * warn identifies the kind of warning (the string lexer passes
 * WARN_UNKNOWN_CONTROL_SEQUENCE); how it is evaluated is not visible in
 * this excerpt.
 * The visible line prints a warning prefix made of lex->name and
 * lex->sline.
 */
27 void lexwarn(lex_file *lex, int warn, const char *fmt, ...)
35 printf("warning %s:%lu: ", lex->name, (unsigned long)lex->sline);
/* Fragment of the token allocation routine (presumably token_new, whose
 * signature line is not visible in this excerpt).
 * A token-sized chunk is obtained from mem_a and then zero-filled.
 */
48 token *tok = (token*)mem_a(sizeof(token));
/* start from an all-zero token */
51 memset(tok, 0, sizeof(*tok));
/* Dispose of a single token.
 * The visible code first unlinks self from its neighbours in the doubly
 * linked token list and then clears the value vector.
 */
55 void token_delete(token *self)
/* only patch a neighbour that actually points back at self */
57 if (self->next && self->next->prev == self)
58 self->next->prev = self->prev;
59 if (self->prev && self->prev->next == self)
60 self->prev->next = self->next;
/* presumably releases the character buffer held in value - confirm */
61 MEM_VECTOR_CLEAR(self, value);
/* Build a new token that duplicates cp: its text (value), its type
 * (ttype) and its constant value (constval).
 * Presumably returns the new token; the return statement and any failure
 * handling are not visible in this excerpt.
 */
65 token* token_copy(const token *cp)
67 token* self = token_new();
/* reserve one byte more than the text length for the terminating null */
71 self->value_alloc = cp->value_count + 1;
72 self->value_count = cp->value_count;
73 self->value = (char*)mem_a(self->value_alloc);
78 memcpy(self->value, cp->value, cp->value_count);
/* the last allocated byte becomes the terminator */
79 self->value[self->value_alloc-1] = 0;
83 self->ttype = cp->ttype;
84 memcpy(&self->constval, &cp->constval, sizeof(self->constval));
/* NOTE(review): the body is not visible in this excerpt. The name and the
 * call from token_copy_all (which hands over the head of a partially
 * built list) suggest that every token reachable from t is deleted -
 * confirm.
 */
88 void token_delete_all(token *t)
/* Copy a list of tokens starting at cp.
 * The first token is copied with token_copy; every further copy is hooked
 * in through next and linked back through prev.
 */
99 token* token_copy_all(const token *cp)
/* out keeps the first copy, cur is the copy that gets extended */
104 out = cur = token_copy(cp);
110 cur->next = token_copy(cp);
/* drop what was built so far; presumably reached when a copy failed - confirm */
112 token_delete_all(out);
/* back-link the freshly copied token */
115 cur->next->prev = cur;
/* Open the source file named by file and set up a lex_file for it.
 * Visible steps: open the file in binary read mode, allocate and zero the
 * lex_file, duplicate the file name and start the line counter at 1.
 * Both visible failure paths report through lexerror with a NULL lex.
 */
122 lex_file* lex_open(const char *file)
125 FILE *in = fopen(file, "rb");
128 lexerror(NULL, "open failed: '%s'\n", file);
132 lex = (lex_file*)mem_a(sizeof(*lex));
135 lexerror(NULL, "out of memory\n");
139 memset(lex, 0, sizeof(*lex));
/* the lex_file keeps its own copy of the name */
142 lex->name = util_strdup(file);
143 lex->line = 1; /* we start counting at 1 */
/* Tear down a lex_file.
 * The visible line deletes the current token; any further cleanup is not
 * visible in this excerpt.
 */
150 void lex_close(lex_file *lex)
155 token_delete(lex->tok);
160 /* Get or put-back data
161 * The following two functions do NOT understand what kind of data they
163 * They are merely wrapping get/put in order to count line numbers.
/* Fetch the next input character.
 * Two sources are visible: the put-back buffer lex->peek, indexed by
 * lex->peekpos, and fgetc on lex->file. The put-back path comes first in
 * the code; the condition that selects it is not visible in this excerpt.
 */
165 static int lex_getch(lex_file *lex)
/* a newline leaving the put-back buffer; presumably adjusts the line counter - confirm */
171 if (lex->peek[lex->peekpos] == '\n')
173 return lex->peek[lex->peekpos];
176 ch = fgetc(lex->file);
/* Put a character back: ch is stored at lex->peek[lex->peekpos] and
 * peekpos is advanced by one.
 * NOTE(review): no bounds check on peekpos is visible here - confirm that
 * the peek buffer is large enough for the deepest put-back sequence.
 */
182 static void lex_ungetch(lex_file *lex, int ch)
184 lex->peek[lex->peekpos++] = ch;
189 /* classify characters
190 * some additions to the is*() functions of ctype.h
193 /* Idents are alphanumeric, but they start with alpha or _ */
/* True when ch may begin an identifier: a letter according to isalpha,
 * or an underscore.
 * ch goes to isalpha unchanged, so it has to be EOF or a value that fits
 * an unsigned char, as required by ctype.h.
 */
194 static bool isident_start(int ch)
196 return isalpha(ch) || ch == '_';
/* True when ch may appear inside an identifier: anything accepted by
 * isident_start, or a decimal digit according to isdigit.
 */
199 static bool isident(int ch)
201 return isident_start(ch) || isdigit(ch);
204 /* isxdigit_only is used when we already know it's not a digit
205 * and want to see if it's a hex digit anyway.
/* True for the letters a to f in either case, that is the hexadecimal
 * digits which are not decimal digits.
 */
207 static bool isxdigit_only(int ch)
209 return (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
212 /* Skip whitespace and comments and return the first
213 * non-white character.
214 * As this makes use of the above getch() ungetch() functions,
215 * we don't need to care at all about line numbering anymore.
217 * In theory, this function should only be used at the beginning
218 * of lexing, or when we *know* the next character is part of the token.
219 * Otherwise, if the parser throws an error, the linenumber may not be
220 * the line of the error, but the line of the next token AFTER the error.
222 * This is currently only problematic when using c-like string-continuation,
223 * since comments and whitespaces are allowed between 2 such strings.
227 "A continuation of the previous string"
228 // This line is skipped
231 * In this case, if the parser decides it didn't actually want a string,
232 * and uses lex->line to print an error, it will show the ', foo);' line's
235 * On the other hand, the parser is supposed to remember the line of the next
236 * token's beginning. In this case we would want skipwhite() to be called
237 * AFTER reading a token, so that the parser, before reading the NEXT token,
238 * doesn't store the *comment's* linenumber, but the actual token's linenumber.
241 * here is to store the line of the first character after skipping
242 * the initial whitespace in lex->sline, this happens in lex_do.
/* Skip whitespace and comments, as described in the long comment
 * preceding this function.
 * All reading and putting back goes through lex_getch and lex_ungetch.
 * The outer do-while loop repeats for as long as the current character is
 * whitespace and not EOF; the comment branches feed it a blank to force
 * another round.
 */
244 static int lex_skipwhite(lex_file *lex)
/* eat a run of plain whitespace */
251 while (ch != EOF && isspace(ch)) ch = lex_getch(lex);
257 /* one line comment */
260 /* check for special: '/', '/', '*', '/' */
/* consume the rest of the line */
269 while (ch != EOF && ch != '\n') {
276 /* multiline comment */
288 if (ch == '/') /* allow *//* direct following comment */
290 lex_ungetch(lex, ch);
291 ch = ' '; /* cause TRUE in the isspace check */
295 /* Otherwise roll back to the slash and break out of the loop */
296 lex_ungetch(lex, ch);
300 } while (ch != EOF && isspace(ch));
305 /* Append a character to the token buffer */
/* The character is added to the current token (lex->tok) through
 * token_value_add. When that call fails, an out of memory error is
 * reported through lexerror.
 * Return statements are not visible in this excerpt; callers treat a
 * false result as fatal.
 */
306 static bool GMQCC_WARN lex_tokench(lex_file *lex, int ch)
308 if (!token_value_add(lex->tok, ch)) {
309 lexerror(lex, "out of memory");
315 /* Append a trailing null-byte */
/* A 0 byte is added to the current token through token_value_add; a
 * failure is reported through lexerror as out of memory.
 * Return statements are not visible in this excerpt; callers treat a
 * false result as fatal.
 */
316 static bool GMQCC_WARN lex_endtoken(lex_file *lex)
318 if (!token_value_add(lex->tok, 0)) {
319 lexerror(lex, "out of memory");
/* the terminator is stored but not counted as part of the token text */
322 lex->tok->value_count--;
/* Append the remaining characters of an identifier to the current token.
 * Characters are taken for as long as isident accepts them; the first
 * character that does not belong to the identifier is put back.
 * NOTE(review): the function is declared bool, yet the failure path
 * returns the value of the assignment of TOKEN_FATAL. If TOKEN_FATAL is
 * nonzero this converts to true, so the negated check in lex_do would
 * treat a failed append as success - confirm and fix.
 */
327 static bool GMQCC_WARN lex_finish_ident(lex_file *lex)
332 while (ch != EOF && isident(ch))
334 if (!lex_tokench(lex, ch))
335 return (lex->tok->ttype = TOKEN_FATAL);
339 /* last ch was not an ident ch: */
340 lex_ungetch(lex, ch);
/* Read the body of a string or character constant into the current
 * token; quote is the delimiter passed by lex_do (double quote for
 * strings, single quote for character constants).
 *
 * Escape handling visible here: after a backslash the letters a, b, r, n,
 * t, f and v are replaced by the matching control character. Any other
 * character triggers a WARN_UNKNOWN_CONTROL_SEQUENCE warning and is
 * stored together with the backslash.
 *
 * Results visible here:
 *   TOKEN_STRINGCONST - presumably when the closing quote is found
 *   TOKEN_ERROR       - end of file inside the constant; EOF is put back
 *                       so that the following token is TOKEN_EOF
 *   TOKEN_FATAL       - appending to the token failed
 */
345 static int GMQCC_WARN lex_finish_string(lex_file *lex, int quote)
353 return TOKEN_STRINGCONST;
358 lexerror(lex, "unexpected end of file");
359 lex_ungetch(lex, EOF); /* next token to be TOKEN_EOF */
360 return (lex->tok->ttype = TOKEN_ERROR);
365 case 'a': ch = '\a'; break;
366 case 'b': ch = '\b'; break;
367 case 'r': ch = '\r'; break;
368 case 'n': ch = '\n'; break;
369 case 't': ch = '\t'; break;
370 case 'f': ch = '\f'; break;
371 case 'v': ch = '\v'; break;
373 lexwarn(lex, WARN_UNKNOWN_CONTROL_SEQUENCE, "unrecognized control sequence: \\%c", ch);
374 /* so we just add the character plus backslash no matter what it actually is */
375 if (!lex_tokench(lex, '\\'))
376 return (lex->tok->ttype = TOKEN_FATAL);
378 /* add the character finally */
379 if (!lex_tokench(lex, ch))
380 return (lex->tok->ttype = TOKEN_FATAL);
382 else if (!lex_tokench(lex, ch))
383 return (lex->tok->ttype = TOKEN_FATAL);
385 lexerror(lex, "unexpected end of file within string constant");
386 lex_ungetch(lex, EOF); /* next token to be TOKEN_EOF */
387 return (lex->tok->ttype = TOKEN_ERROR);
/* Lex a numeric constant; lastch is its first character, which the
 * caller has already read.
 *
 * The token starts out as TOKEN_INTCONST and becomes TOKEN_FLOATCONST
 * once a dot is seen outside hex mode. Hex mode (ishex) is tied to a
 * first character 0 followed by x; inside it the letters accepted by
 * isxdigit_only count as digits.
 *
 * Value conversion visible here:
 *   - a number that ends right after its first character takes
 *     lastch - 0 as its integer value
 *   - floats are converted with strtod
 *   - other integers are converted with strtol using base 0
 *
 * Results visible here: the token type on success, TOKEN_ERROR when
 * unexpected characters follow the number, TOKEN_FATAL when appending to
 * the token fails.
 */
390 static int GMQCC_WARN lex_finish_digit(lex_file *lex, int lastch)
396 /* parse a number... */
397 lex->tok->ttype = TOKEN_INTCONST;
399 if (!lex_tokench(lex, ch))
400 return (lex->tok->ttype = TOKEN_FATAL);
403 if (ch != '.' && !isdigit(ch))
405 if (lastch != '0' || ch != 'x')
407 /* end of the number or EOF */
408 lex_ungetch(lex, ch);
409 if (!lex_endtoken(lex))
410 return (lex->tok->ttype = TOKEN_FATAL);
412 lex->tok->constval.i = lastch - '0';
413 return lex->tok->ttype;
419 /* EOF would have been caught above */
423 if (!lex_tokench(lex, ch))
424 return (lex->tok->ttype = TOKEN_FATAL);
426 while (isdigit(ch) || (ishex && isxdigit_only(ch)))
428 if (!lex_tokench(lex, ch))
429 return (lex->tok->ttype = TOKEN_FATAL);
433 /* NOT else, '.' can come from above as well */
434 if (ch == '.' && !ishex)
436 /* Allow a decimal point in non-hex mode */
437 lex->tok->ttype = TOKEN_FLOATCONST;
438 if (!lex_tokench(lex, ch))
439 return (lex->tok->ttype = TOKEN_FATAL);
441 /* continue digits-only */
445 if (!lex_tokench(lex, ch))
446 return (lex->tok->ttype = TOKEN_FATAL);
450 /* put back the last character */
451 /* but do not put back the trailing 'f' of a float */
452 if (lex->tok->ttype == TOKEN_FLOATCONST && ch == 'f')
455 /* generally we don't want words to follow numbers: */
457 lexerror(lex, "unexpected trailing characters after number");
458 return (lex->tok->ttype = TOKEN_ERROR);
460 lex_ungetch(lex, ch);
462 if (!lex_endtoken(lex))
463 return (lex->tok->ttype = TOKEN_FATAL);
464 if (lex->tok->ttype == TOKEN_FLOATCONST)
465 lex->tok->constval.f = strtod(lex->tok->value, NULL);
467 lex->tok->constval.i = strtol(lex->tok->value, NULL, 0);
468 return lex->tok->ttype;
/* Lex the next token into lex->tok and return its type.
 *
 * Visible steps:
 *   1. the previous token is deleted and a fresh one is allocated
 *   2. whitespace and comments are skipped with lex_skipwhite
 *   3. the current line is stored in lex->sline and, together with the
 *      file name, in the context (ctx) of the token
 *   4. the first character decides the kind of token:
 *        - EOF, reported as TOKEN_EOF
 *        - single-character tokens, reported as the character itself
 *        - operators, including the two-character forms listed in the
 *          inline comments, reported as TOKEN_OPERATOR
 *        - identifiers, reported as TOKEN_IDENT, TOKEN_TYPENAME (with
 *          the type stored in constval.t) or TOKEN_KEYWORD
 *        - string constants, with C style continuation
 *        - character constants, turned into TOKEN_VECTORCONST when the
 *          text scans as three floats
 *        - numbers, handled by lex_finish_digit
 *      anything else is reported as an unknown token with TOKEN_ERROR
 *
 * TOKEN_FATAL is returned whenever appending to the token fails.
 */
471 int lex_do(lex_file *lex)
476 token_delete(lex->tok);
477 lex->tok = token_new();
481 ch = lex_skipwhite(lex);
/* sline is the line printed by lexerror and lexwarn */
482 lex->sline = lex->line;
483 lex->tok->ctx.line = lex->sline;
484 lex->tok->ctx.file = lex->name;
487 return (lex->tok->ttype = TOKEN_EOF);
489 /* single-character tokens */
502 return (lex->tok->ttype = ch);
507 if (lex->flags.noops)
509 /* Detect characters early which are normally
510 * operators OR PART of an operator.
533 if (!lex_tokench(lex, ch) ||
536 return (lex->tok->ttype = TOKEN_FATAL);
538 return (lex->tok->ttype = TOKEN_OPERATOR);
541 if (ch == '+' || ch == '-' || /* ++, --, +=, -= and -> as well! */
542 ch == '>' || ch == '<' || /* <<, >>, <=, >= */
543 ch == '=' || /* == */
544 ch == '&' || ch == '|') /* &&, ||, &=, |= */
546 if (!lex_tokench(lex, ch))
547 return (lex->tok->ttype = TOKEN_FATAL);
549 nextch = lex_getch(lex);
550 if (nextch == ch || nextch == '=') {
551 if (!lex_tokench(lex, nextch))
552 return (lex->tok->ttype = TOKEN_FATAL);
553 } else if (ch == '-' && nextch == '>') {
554 if (!lex_tokench(lex, nextch))
555 return (lex->tok->ttype = TOKEN_FATAL);
557 lex_ungetch(lex, nextch);
559 if (!lex_endtoken(lex))
560 return (lex->tok->ttype = TOKEN_FATAL);
561 return (lex->tok->ttype = TOKEN_OPERATOR);
564 if (ch == '^' || ch == '~' || ch == '!')
566 if (!lex_tokench(lex, ch) ||
569 return (lex->tok->ttype = TOKEN_FATAL);
571 return (lex->tok->ttype = TOKEN_OPERATOR);
574 if (ch == '*' || ch == '/') /* *=, /= */
576 if (!lex_tokench(lex, ch))
577 return (lex->tok->ttype = TOKEN_FATAL);
579 nextch = lex_getch(lex);
581 if (!lex_tokench(lex, nextch))
582 return (lex->tok->ttype = TOKEN_FATAL);
584 lex_ungetch(lex, nextch);
586 if (!lex_endtoken(lex))
587 return (lex->tok->ttype = TOKEN_FATAL);
588 return (lex->tok->ttype = TOKEN_OPERATOR);
/* identifiers, type names and keywords */
591 if (isident_start(ch))
594 if (!lex_tokench(lex, ch))
595 return (lex->tok->ttype = TOKEN_FATAL);
/* NOTE(review): lex_finish_ident is declared bool but returns the assigned
 * TOKEN_FATAL on failure; if that constant is nonzero this check never
 * fires - confirm.
 */
596 if (!lex_finish_ident(lex)) {
598 return (lex->tok->ttype = TOKEN_ERROR);
600 if (!lex_endtoken(lex))
601 return (lex->tok->ttype = TOKEN_FATAL);
602 lex->tok->ttype = TOKEN_IDENT;
/* built-in type names carry their type in constval.t */
605 if (!strcmp(v, "void")) {
606 lex->tok->ttype = TOKEN_TYPENAME;
607 lex->tok->constval.t = TYPE_VOID;
608 } else if (!strcmp(v, "int")) {
609 lex->tok->ttype = TOKEN_TYPENAME;
610 lex->tok->constval.t = TYPE_INTEGER;
611 } else if (!strcmp(v, "float")) {
612 lex->tok->ttype = TOKEN_TYPENAME;
613 lex->tok->constval.t = TYPE_FLOAT;
614 } else if (!strcmp(v, "string")) {
615 lex->tok->ttype = TOKEN_TYPENAME;
616 lex->tok->constval.t = TYPE_STRING;
617 } else if (!strcmp(v, "entity")) {
618 lex->tok->ttype = TOKEN_TYPENAME;
619 lex->tok->constval.t = TYPE_ENTITY;
620 } else if (!strcmp(v, "vector")) {
621 lex->tok->ttype = TOKEN_TYPENAME;
622 lex->tok->constval.t = TYPE_VECTOR;
623 } else if (!strcmp(v, "for") ||
624 !strcmp(v, "while") ||
627 !strcmp(v, "else") ||
629 !strcmp(v, "return") ||
631 lex->tok->ttype = TOKEN_KEYWORD;
633 return lex->tok->ttype;
638 lex->tok->ttype = lex_finish_string(lex, '"');
639 while (lex->tok->ttype == TOKEN_STRINGCONST)
641 /* Allow c style "string" "continuation" */
642 ch = lex_skipwhite(lex);
644 lex_ungetch(lex, ch);
648 lex->tok->ttype = lex_finish_string(lex, '"');
650 if (!lex_endtoken(lex))
651 return (lex->tok->ttype = TOKEN_FATAL);
652 return lex->tok->ttype;
657 /* we parse character constants like string,
658 * but return TOKEN_CHARCONST, or a vector type if it fits...
659 * Likewise actual unescaping has to be done by the parser.
660 * The difference is we don't allow 'char' 'continuation'.
662 lex->tok->ttype = lex_finish_string(lex, '\'');
663 if (!lex_endtoken(lex))
664 return (lex->tok->ttype = TOKEN_FATAL);
666 /* It's a vector if we can successfully scan 3 floats */
667 if (sscanf(lex->tok->value, " %f %f %f ", &lex->tok->constval.v.x, &lex->tok->constval.v.y, &lex->tok->constval.v.z) == 3)
669 lex->tok->ttype = TOKEN_VECTORCONST;
672 return lex->tok->ttype;
/* NOTE(review): the visible success paths of lex_finish_digit already call
 * lex_endtoken, so the call below terminates the token a second time on
 * those paths - confirm whether that is intended.
 */
677 lex->tok->ttype = lex_finish_digit(lex, ch);
678 if (!lex_endtoken(lex))
679 return (lex->tok->ttype = TOKEN_FATAL);
680 return lex->tok->ttype;
683 lexerror(lex, "unknown token");
684 return (lex->tok->ttype = TOKEN_ERROR);