/* Vector helper instantiation for token's `value` member (element type char).
 * NOTE(review): the macro is defined outside this excerpt; presumably it
 * provides token_value_add(), which the lexer below uses to grow the token
 * text - confirm against the macro definition.
 */
9 MEM_VEC_FUNCTIONS(token, char, value)
/* Report a lexer error.
 *
 * The visible statement prints the prefix "error <name>:<line>: " to stdout,
 * taking the file name from lex->name and the line from lex->sline (the line
 * recorded by lex_do() for the start of the current token).
 * `fmt` and the variadic arguments look printf-style judging by the call
 * sites in this file; the code that formats them is not visible here.
 *
 * NOTE(review): lex_open() calls this function with lex == NULL, so the
 * dereferences in the printf below must sit behind a NULL check - confirm
 * that the guard exists in the lines not shown.
 */
11 void lexerror(lex_file *lex, const char *fmt, ...)
16 printf("error %s:%lu: ", lex->name, (unsigned long)lex->sline);
/* Allocate storage for one token with mem_a() and zero-fill it, so all
 * members (links, value vector, type, constant value) start out as 0/NULL.
 * NOTE(review): the enclosing function header and any allocation-failure
 * check between these two statements are not visible in this excerpt -
 * confirm tok is tested before the memset.
 */
29 token *tok = (token*)mem_a(sizeof(token));
32 memset(tok, 0, sizeof(*tok));
/* Unlink `self` from its neighbours and clear its value buffer.
 *
 * A neighbour's link is rewritten only when it currently points back at
 * `self`; a neighbour that does not reference `self` is left untouched.
 * `self` is dereferenced unconditionally in the visible code, so it is
 * expected to be non-NULL.
 * NOTE(review): releasing the storage of `self` itself is not visible in
 * this excerpt.
 */
36 void token_delete(token *self)
/* fix up the successor's back-link */
38 if (self->next && self->next->prev == self)
39 self->next->prev = self->prev;
/* fix up the predecessor's forward-link */
40 if (self->prev && self->prev->next == self)
41 self->prev->next = self->next;
42 MEM_VECTOR_CLEAR(self, value);
/* Create a copy of the token `cp`.
 *
 * A fresh token is obtained from token_new() and given its own value buffer
 * of cp->value_count + 1 bytes; the extra byte holds a terminating 0.
 * The token type and the constant value are copied as well. The visible code
 * does not copy the prev/next links.
 * NOTE(review): handling of a failed token_new()/mem_a() is not visible in
 * this excerpt.
 */
46 token* token_copy(const token *cp)
48 token* self = token_new();
/* one byte more than the text so it can be 0-terminated */
52 self->value_alloc = cp->value_count + 1;
53 self->value_count = cp->value_count;
54 self->value = (char*)mem_a(self->value_alloc);
59 memcpy(self->value, cp->value, cp->value_count);
/* terminator goes into the last allocated byte */
60 self->value[self->value_alloc-1] = 0;
64 self->ttype = cp->ttype;
65 memcpy(&self->constval, &cp->constval, sizeof(self->constval));
/* NOTE(review): only the signature is visible in this excerpt. Judging by
 * its use in token_copy_all() it presumably deletes the whole token chain
 * starting at `t` - confirm against the body.
 */
69 void token_delete_all(token *t)
/* Copy a chain of tokens beginning at `cp`.
 *
 * Visible behaviour: the first token is copied with token_copy() and becomes
 * both the head (`out`) and the current tail (`cur`). Each further copy is
 * attached as cur->next and linked back through its prev pointer.
 * token_delete_all(out) discards the partial result - presumably when a
 * token_copy() call fails; the condition is not visible here.
 * NOTE(review): loop control and the return value are not visible in this
 * excerpt.
 */
80 token* token_copy_all(const token *cp)
85 out = cur = token_copy(cp);
91 cur->next = token_copy(cp);
93 token_delete_all(out);
96 cur->next->prev = cur;
/* Create a lexer for the file named `file`.
 *
 * The file is opened with fopen() in binary read mode ("rb"). A lex_file is
 * allocated with mem_a() and zero-filled; it receives a duplicate of the
 * file name (util_strdup) and its line counter starts at 1.
 * Two failures are reported through lexerror() with a NULL lexer:
 * "open failed" and "out of memory".
 * NOTE(review): return values, cleanup on the failure paths (e.g. closing
 * `in` when the allocation fails) and a check of util_strdup()'s result are
 * not visible in this excerpt.
 */
103 lex_file* lex_open(const char *file)
106 FILE *in = fopen(file, "rb");
109 lexerror(NULL, "open failed: '%s'\n", file);
113 lex = (lex_file*)mem_a(sizeof(*lex));
116 lexerror(NULL, "out of memory\n");
120 memset(lex, 0, sizeof(*lex));
123 lex->name = util_strdup(file);
124 lex->line = 1; /* we start counting at 1 */
/* Tear down a lexer.
 * The visible statement deletes the current token via token_delete().
 * NOTE(review): closing lex->file and freeing lex->name and `lex` itself are
 * not visible in this excerpt; token_delete() dereferences its argument, so
 * lex->tok must be non-NULL here or be guarded in the lines not shown.
 */
131 void lex_close(lex_file *lex)
136 token_delete(lex->tok);
141 /* Get or put-back data
142 * The following two functions do NOT understand what kind of data they
144 * They are merely wrapping get/put in order to count line numbers.
/* Fetch the next input character.
 *
 * The visible code has two sources: a character taken from the push-back
 * buffer lex->peek (indexed by lex->peekpos) and a character read from
 * lex->file with fgetc(). fgetc() yields EOF at end of input.
 * The '\n' test on the buffered character belongs to the line counting;
 * the statement it guards is not visible in this excerpt.
 */
146 static int lex_getch(lex_file *lex)
152 if (lex->peek[lex->peekpos] == '\n')
154 return lex->peek[lex->peekpos];
157 ch = fgetc(lex->file);
/* Push `ch` back so that a later lex_getch() can return it again.
 * The character is stored at lex->peek[lex->peekpos] and peekpos is advanced.
 * NOTE(review): no bounds check on lex->peek is visible here; the capacity
 * of the buffer is declared elsewhere - confirm that callers cannot push
 * back more characters than it holds. Callers in this file also pass EOF;
 * whether the element type of peek can represent EOF is not visible here.
 */
163 static void lex_ungetch(lex_file *lex, int ch)
165 lex->peek[lex->peekpos++] = ch;
170 /* classify characters
171 * some additions to the is*() functions of ctype.h
174 /* Idents are alphanumeric, but they start with alpha or _ */
/* Returns true when `ch` may begin an identifier: a letter according to
 * isalpha() or an underscore.
 * `ch` is passed straight to isalpha(), so it has to be EOF or a value
 * representable as unsigned char.
 */
175 static bool isident_start(int ch)
177 return isalpha(ch) || ch == '_';
/* Returns true when `ch` may appear inside an identifier: anything accepted
 * by isident_start() plus the decimal digits.
 */
180 static bool isident(int ch)
182 return isident_start(ch) || isdigit(ch);
185 /* isxdigit_only is used when we already know it's not a digit
186 * and want to see if it's a hex digit anyway.
/* Returns true for the letters 'a'..'f' and 'A'..'F' only; the decimal
 * digits '0'..'9' are deliberately not matched by this test.
 */
188 static bool isxdigit_only(int ch)
190 return (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
193 /* Skip whitespace and comments and return the first
194 * non-white character.
195 * As this makes use of the above getch() ungetch() functions,
196 * we don't need to care at all about line numbering anymore.
198 * In theory, this function should only be used at the beginning
199 * of lexing, or when we *know* the next character is part of the token.
200 * Otherwise, if the parser throws an error, the linenumber may not be
201 * the line of the error, but the line of the next token AFTER the error.
203 * This is currently only problematic when using c-like string-continuation,
204 * since comments and whitespaces are allowed between 2 such strings.
208 "A continuation of the previous string"
209 // This line is skipped
212 * In this case, if the parser decides it didn't actually want a string,
213 * and uses lex->line to print an error, it will show the ', foo);' line's
216 * On the other hand, the parser is supposed to remember the line of the next
217 * token's beginning. In this case we would want skipwhite() to be called
218 * AFTER reading a token, so that the parser, before reading the NEXT token,
219 * doesn't store the *comment's* linenumber, but the actual token's linenumber.
222 * here is to store the line of the first character after skipping
223 * the initial whitespace in lex->sline, this happens in lex_do.
/* Skip whitespace and comments; the result is the first character that
 * belongs to neither (it may be EOF).
 *
 * Structure visible in this excerpt: an outer do/while loop that repeats as
 * long as the current character is whitespace; inside it a run of isspace()
 * characters is consumed, then a one-line comment is skipped up to the next
 * '\n' or EOF, or a multi-line comment is skipped. After a multi-line
 * comment `ch` is set to ' ' so that the outer loop condition is true and
 * scanning continues.
 * When the character after a '/' does not start a comment, that character
 * is pushed back with lex_ungetch() and the loop is left.
 * NOTE(review): several statements of the body are not visible here.
 */
225 static int lex_skipwhite(lex_file *lex)
232 while (ch != EOF && isspace(ch)) ch = lex_getch(lex);
238 /* one line comment */
241 /* check for special: '/', '/', '*', '/' */
250 while (ch != EOF && ch != '\n') {
257 /* multiline comment */
269 if (ch == '/') /* allow *//* direct following comment */
271 lex_ungetch(lex, ch);
272 ch = ' '; /* cause TRUE in the isspace check */
276 /* Otherwise roll back to the slash and break out of the loop */
277 lex_ungetch(lex, ch);
281 } while (ch != EOF && isspace(ch));
286 /* Append a character to the token buffer */
/* `ch` is added to the value vector of the current token (lex->tok) with
 * token_value_add(). If that call fails, "out of memory" is reported through
 * lexerror().
 * The return statements are not visible in this excerpt; every caller in
 * this file treats a false result as failure.
 */
287 static bool GMQCC_WARN lex_tokench(lex_file *lex, int ch)
289 if (!token_value_add(lex->tok, ch)) {
290 lexerror(lex, "out of memory");
296 /* Append a trailing null-byte */
/* A 0 byte is added to the current token's value with token_value_add();
 * on failure "out of memory" is reported through lexerror().
 * Afterwards value_count is decremented again, so the terminator is stored
 * in the buffer but not counted as part of the token text.
 * The return statements are not visible in this excerpt; every caller in
 * this file treats a false result as failure.
 */
297 static bool GMQCC_WARN lex_endtoken(lex_file *lex)
299 if (!token_value_add(lex->tok, 0)) {
300 lexerror(lex, "out of memory");
303 lex->tok->value_count--;
/* Append the remaining characters of an identifier to the current token.
 *
 * Characters are appended with lex_tokench() for as long as isident()
 * accepts them; the first character that is EOF or not an identifier
 * character is pushed back with lex_ungetch().
 *
 * NOTE(review): the function is declared to return bool, yet on an append
 * failure it returns the value of (lex->tok->ttype = TOKEN_FATAL). If
 * TOKEN_FATAL is non-zero (its definition is not visible here) this converts
 * to true, while lex_do() tests `!lex_finish_ident(lex)` to detect failure -
 * so the failure would go unnoticed there. Confirm and consider returning
 * false after setting ttype.
 */
308 static bool GMQCC_WARN lex_finish_ident(lex_file *lex)
313 while (ch != EOF && isident(ch))
315 if (!lex_tokench(lex, ch))
316 return (lex->tok->ttype = TOKEN_FATAL);
320 /* last ch was not an ident ch: */
321 lex_ungetch(lex, ch);
/* Read the body of a quoted constant whose opening `quote` character has
 * already been consumed by the caller.
 *
 * Result, as far as visible in this excerpt:
 *   TOKEN_STRINGCONST - the constant was terminated (the condition is not
 *                       visible here; presumably the closing `quote`).
 *                       lex_do() also uses this path for '...' constants.
 *   TOKEN_ERROR       - end of file was reached, either right after a
 *                       backslash or inside the constant. EOF is pushed back
 *                       so the following token becomes TOKEN_EOF.
 *   TOKEN_FATAL       - appending to the token buffer failed.
 * For the last two the token's ttype is set as well.
 *
 * Escape sequences are not decoded here: the character following a
 * backslash is appended as-is so that an escaped quote does not end the
 * constant.
 */
326 static int GMQCC_WARN lex_finish_string(lex_file *lex, int quote)
334 return TOKEN_STRINGCONST;
336 if (!lex_tokench(lex, ch))
337 return (lex->tok->ttype = TOKEN_FATAL);
339 /* as lexer we only care about \" to not terminate the string prematurely */
343 lexerror(lex, "unexpected end of file");
344 lex_ungetch(lex, EOF); /* next token to be TOKEN_EOF */
345 return (lex->tok->ttype = TOKEN_ERROR);
347 /* so we just add the next character no matter what it actually is */
348 if (!lex_tokench(lex, ch))
349 return (lex->tok->ttype = TOKEN_FATAL);
352 lexerror(lex, "unexpected end of file within string constant");
353 lex_ungetch(lex, EOF); /* next token to be TOKEN_EOF */
354 return (lex->tok->ttype = TOKEN_ERROR);
/* Lex a numeric constant; `lastch` is the digit the caller has already read.
 *
 * Visible behaviour:
 *  - The token starts out as TOKEN_INTCONST.
 *  - If the next character is neither '.' nor a digit, and the pair is not
 *    '0' followed by 'x', the number consists of the single digit `lastch`:
 *    the character is pushed back, the token is terminated and
 *    constval.i = lastch - '0'.
 *  - In hex mode (`ishex`) the letters a-f/A-F are accepted in addition to
 *    decimal digits.
 *  - A '.' outside hex mode turns the token into TOKEN_FLOATCONST and is
 *    followed by further decimal digits.
 *  - A trailing 'f' on a float constant is consumed instead of being pushed
 *    back. Other trailing characters are rejected with "unexpected trailing
 *    characters after number" (TOKEN_ERROR); the exact condition is not
 *    visible in this excerpt.
 *  - The value is converted with strtod() for floats and with strtol() using
 *    base 0 for integers.
 * Returns the token type; TOKEN_FATAL when the token buffer cannot grow.
 *
 * NOTE(review): strtol() with base 0 interprets a leading "0" as octal, so a
 * token such as "010" yields 8 - confirm this is intended. Neither
 * conversion checks for range errors.
 * NOTE(review): this function already terminates the token with
 * lex_endtoken(); lex_do() calls lex_endtoken() once more on the result.
 */
357 static int GMQCC_WARN lex_finish_digit(lex_file *lex, int lastch)
363 /* parse a number... */
364 lex->tok->ttype = TOKEN_INTCONST;
366 if (!lex_tokench(lex, ch))
367 return (lex->tok->ttype = TOKEN_FATAL);
370 if (ch != '.' && !isdigit(ch))
372 if (lastch != '0' || ch != 'x')
374 /* end of the number or EOF */
375 lex_ungetch(lex, ch);
376 if (!lex_endtoken(lex))
377 return (lex->tok->ttype = TOKEN_FATAL);
379 lex->tok->constval.i = lastch - '0';
380 return lex->tok->ttype;
386 /* EOF would have been caught above */
390 if (!lex_tokench(lex, ch))
391 return (lex->tok->ttype = TOKEN_FATAL);
393 while (isdigit(ch) || (ishex && isxdigit_only(ch)))
395 if (!lex_tokench(lex, ch))
396 return (lex->tok->ttype = TOKEN_FATAL);
400 /* NOT else, '.' can come from above as well */
401 if (ch == '.' && !ishex)
403 /* Allow a decimal point (floating-point constant) in non-hex mode */
404 lex->tok->ttype = TOKEN_FLOATCONST;
405 if (!lex_tokench(lex, ch))
406 return (lex->tok->ttype = TOKEN_FATAL);
408 /* continue digits-only */
412 if (!lex_tokench(lex, ch))
413 return (lex->tok->ttype = TOKEN_FATAL);
417 /* put back the last character */
418 /* but do not put back the trailing 'f' of a float */
419 if (lex->tok->ttype == TOKEN_FLOATCONST && ch == 'f')
422 /* generally we don't want words to follow numbers: */
424 lexerror(lex, "unexpected trailing characters after number");
425 return (lex->tok->ttype = TOKEN_ERROR);
427 lex_ungetch(lex, ch);
429 if (!lex_endtoken(lex))
430 return (lex->tok->ttype = TOKEN_FATAL);
431 if (lex->tok->ttype == TOKEN_FLOATCONST)
432 lex->tok->constval.f = strtod(lex->tok->value, NULL);
434 lex->tok->constval.i = strtol(lex->tok->value, NULL, 0);
435 return lex->tok->ttype;
/* Read the next token from `lex` into lex->tok and return its type.
 *
 * Steps visible in this excerpt:
 *  1. The previous token is deleted and a fresh one is created.
 *  2. Leading whitespace and comments are skipped. The line of the first
 *     remaining character is stored in lex->sline and in the token's
 *     context, together with the file name.
 *  3. The first character selects the token kind:
 *     - TOKEN_EOF (presumably when that character is EOF).
 *     - single-character tokens, whose type is the character itself.
 *     - operators (TOKEN_OPERATOR):
 *         '+', '-', '>', '<', '=', '&', '|' optionally followed by the same
 *         character or by '=', plus the special case "->";
 *         '^', '~', '!' on their own;
 *         '*' and '/' optionally followed by one more character (the
 *         comment names "*=" and "/="; the condition is not visible).
 *       lex->flags.noops changes how operator characters are handled; the
 *       details are not visible here.
 *     - identifiers: "void", "int", "float", "string", "entity" and
 *       "vector" become TOKEN_TYPENAME with constval.t set to the matching
 *       TYPE_* value; "for", "while", "return" and further words not
 *       visible here become TOKEN_KEYWORD; everything else is TOKEN_IDENT.
 *     - string constants, including C-style continuation: adjacent string
 *       literals separated only by whitespace or comments are joined into
 *       one token.
 *     - character constants in single quotes, lexed like strings but
 *       without continuation; if the text scans as three floats the token
 *       becomes TOKEN_VECTORCONST with constval.v filled in.
 *     - numeric constants via lex_finish_digit().
 *     - anything else: "unknown token" is reported and TOKEN_ERROR returned.
 *  TOKEN_FATAL is returned whenever the token buffer cannot be extended.
 *
 * NOTE(review): `!lex_finish_ident(lex)` can only detect a failure if that
 * function returns false on error; see its return statement.
 * NOTE(review): in the character-constant branch the sscanf() check runs
 * regardless of what lex_finish_string() returned, so an error type could be
 * replaced by TOKEN_VECTORCONST - confirm. The "%f" conversions require the
 * members of constval.v to be of type float; their declaration is not
 * visible here.
 * NOTE(review): lex_endtoken() is called again after lex_finish_digit(),
 * which has already terminated the token.
 */
438 int lex_do(lex_file *lex)
443 token_delete(lex->tok);
444 lex->tok = token_new();
448 ch = lex_skipwhite(lex);
449 lex->sline = lex->line;
450 lex->tok->ctx.line = lex->sline;
451 lex->tok->ctx.file = lex->name;
454 return (lex->tok->ttype = TOKEN_EOF);
456 /* single-character tokens */
469 return (lex->tok->ttype = ch);
474 if (lex->flags.noops)
476 /* Detect characters early which are normally
477 * operators OR PART of an operator.
500 if (!lex_tokench(lex, ch) ||
503 return (lex->tok->ttype = TOKEN_FATAL);
505 return (lex->tok->ttype = TOKEN_OPERATOR);
508 if (ch == '+' || ch == '-' || /* ++, --, +=, -= and -> as well! */
509 ch == '>' || ch == '<' || /* <<, >>, <=, >= */
510 ch == '=' || /* == */
511 ch == '&' || ch == '|') /* &&, ||, &=, |= */
513 if (!lex_tokench(lex, ch))
514 return (lex->tok->ttype = TOKEN_FATAL);
516 nextch = lex_getch(lex);
517 if (nextch == ch || nextch == '=') {
518 if (!lex_tokench(lex, nextch))
519 return (lex->tok->ttype = TOKEN_FATAL);
520 } else if (ch == '-' && nextch == '>') {
521 if (!lex_tokench(lex, nextch))
522 return (lex->tok->ttype = TOKEN_FATAL);
524 lex_ungetch(lex, nextch);
526 if (!lex_endtoken(lex))
527 return (lex->tok->ttype = TOKEN_FATAL);
528 return (lex->tok->ttype = TOKEN_OPERATOR);
531 if (ch == '^' || ch == '~' || ch == '!')
533 if (!lex_tokench(lex, ch) ||
536 return (lex->tok->ttype = TOKEN_FATAL);
538 return (lex->tok->ttype = TOKEN_OPERATOR);
541 if (ch == '*' || ch == '/') /* *=, /= */
543 if (!lex_tokench(lex, ch))
544 return (lex->tok->ttype = TOKEN_FATAL);
546 nextch = lex_getch(lex);
548 if (!lex_tokench(lex, nextch))
549 return (lex->tok->ttype = TOKEN_FATAL);
551 lex_ungetch(lex, nextch);
553 if (!lex_endtoken(lex))
554 return (lex->tok->ttype = TOKEN_FATAL);
555 return (lex->tok->ttype = TOKEN_OPERATOR);
558 if (isident_start(ch))
561 if (!lex_tokench(lex, ch))
562 return (lex->tok->ttype = TOKEN_FATAL);
563 if (!lex_finish_ident(lex)) {
565 return (lex->tok->ttype = TOKEN_ERROR);
567 if (!lex_endtoken(lex))
568 return (lex->tok->ttype = TOKEN_FATAL);
569 lex->tok->ttype = TOKEN_IDENT;
572 if (!strcmp(v, "void")) {
573 lex->tok->ttype = TOKEN_TYPENAME;
574 lex->tok->constval.t = TYPE_VOID;
575 } else if (!strcmp(v, "int")) {
576 lex->tok->ttype = TOKEN_TYPENAME;
577 lex->tok->constval.t = TYPE_INTEGER;
578 } else if (!strcmp(v, "float")) {
579 lex->tok->ttype = TOKEN_TYPENAME;
580 lex->tok->constval.t = TYPE_FLOAT;
581 } else if (!strcmp(v, "string")) {
582 lex->tok->ttype = TOKEN_TYPENAME;
583 lex->tok->constval.t = TYPE_STRING;
584 } else if (!strcmp(v, "entity")) {
585 lex->tok->ttype = TOKEN_TYPENAME;
586 lex->tok->constval.t = TYPE_ENTITY;
587 } else if (!strcmp(v, "vector")) {
588 lex->tok->ttype = TOKEN_TYPENAME;
589 lex->tok->constval.t = TYPE_VECTOR;
590 } else if (!strcmp(v, "for") ||
591 !strcmp(v, "while") ||
594 !strcmp(v, "return") ||
596 lex->tok->ttype = TOKEN_KEYWORD;
598 return lex->tok->ttype;
603 lex->tok->ttype = lex_finish_string(lex, '"');
604 while (lex->tok->ttype == TOKEN_STRINGCONST)
606 /* Allow c style "string" "continuation" */
607 ch = lex_skipwhite(lex);
609 lex_ungetch(lex, ch);
613 lex->tok->ttype = lex_finish_string(lex, '"');
615 if (!lex_endtoken(lex))
616 return (lex->tok->ttype = TOKEN_FATAL);
617 return lex->tok->ttype;
622 /* we parse character constants like string,
623 * but return TOKEN_CHARCONST, or a vector type if it fits...
624 * Likewise actual unescaping has to be done by the parser.
625 * The difference is we don't allow 'char' 'continuation'.
627 lex->tok->ttype = lex_finish_string(lex, '\'');
628 if (!lex_endtoken(lex))
629 return (lex->tok->ttype = TOKEN_FATAL);
631 /* It's a vector if we can successfully scan 3 floats */
632 if (sscanf(lex->tok->value, " %f %f %f ", &lex->tok->constval.v.x, &lex->tok->constval.v.y, &lex->tok->constval.v.z) == 3)
634 lex->tok->ttype = TOKEN_VECTORCONST;
637 return lex->tok->ttype;
642 lex->tok->ttype = lex_finish_digit(lex, ch);
643 if (!lex_endtoken(lex))
644 return (lex->tok->ttype = TOKEN_FATAL);
645 return lex->tok->ttype;
648 lexerror(lex, "unknown token");
649 return (lex->tok->ttype = TOKEN_ERROR);