lex.c

   1 /*
   2  * Copyright (C) 2012
   3  *      Dale Weiler
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a copy of
   6  * this software and associated documentation files (the "Software"), to deal in
   7  * the Software without restriction, including without limitation the rights to
   8  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
   9  * of the Software, and to permit persons to whom the Software is furnished to do
  10  * so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be included in all
  13  * copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23 #include <stdio.h>
  24 #include <limits.h>
  25 #include <stdlib.h>
  26 #include <ctype.h>
  27 #include <string.h>
  28 #include "gmqcc.h"
  29
  30 /*
  31  * Keywords are multichar, punctuation lexing is a bit more complicated
  32  * than keyword lexing.
  33  */
  34 static const char *const lex_keywords[] = {
  35         "do",    "else",     "if",     "while",
  36         "break", "continue", "return", "goto",
  37         "for",   "typedef"
  38 };
  39
  40 struct lex_file *lex_open(FILE *fp) {
  41         struct lex_file *lex = mem_a(sizeof(struct lex_file));
  42         if (!lex || !fp)
  43                 return NULL;
  44
  45         lex->file = fp;
  46         fseek(lex->file, 0, SEEK_END);
  47         lex->length = ftell(lex->file);
  48         lex->size   = lex->length; /* copy, this is never changed */
  49         fseek(lex->file, 0, SEEK_SET);
  50         lex->last = 0;
  51         lex->line = 0;
  52
  53         memset(lex->peek, 0, sizeof(lex->peek));
  54         return lex;
  55 }
  56
  57 void lex_close(struct lex_file *file) {
  58         if (!file) return;
  59
  60         fclose(file->file); /* may already be closed */
  61         mem_d (file);
  62 }
  63
  64 static void lex_addch(int ch, struct lex_file *file) {
  65         if (file->current <  sizeof(file->lastok)-1)
  66                 file->lastok[file->current++] = (char)ch;
  67         if (file->current == sizeof(file->lastok)-1)
  68                 file->lastok[file->current]   = (char)'\0';
  69 }
  70 static inline void lex_clear(struct lex_file *file) {
  71         file->current = 0;
  72 }
  73
/*
 * Read (inget) or push back (unget) a character on a lexer stream.
 * This doesn't play with the file stream's own pushback; the lexer
 * has its own internal state for this.
 */
  79 static int lex_inget(struct lex_file *file) {
  80         file->length --;
  81         if (file->last > 0)
  82                 return file->peek[--file->last];
  83         return fgetc(file->file);
  84 }
  85 static void lex_unget(int ch, struct lex_file *file) {
  86         if (file->last < sizeof(file->peek))
  87                 file->peek[file->last++] = ch;
  88         file->length ++;
  89 }
  90
/*
 * This is trigraph and digraph support, a feature no qc compiler
 * supports.  Moving up in this world!
 */
  95 static int lex_trigraph(struct lex_file *file) {
  96         int  ch;
  97         if ((ch = lex_inget(file)) != '?') {
  98                 lex_unget(ch, file);
  99                 return '?';
 100         }
 101
 102         ch = lex_inget(file);
 103         switch (ch) {
 104                 case '(' : return '[' ;
 105                 case ')' : return ']' ;
 106                 case '/' : return '\\';
 107                 case '\'': return '^' ;
 108                 case '<' : return '{' ;
 109                 case '>' : return '}' ;
 110                 case '!' : return '|' ;
 111                 case '-' : return '~' ;
 112                 case '=' : return '#' ;
 113                 default:
 114                         lex_unget('?', file);
 115                         lex_unget(ch , file);
 116                         return '?';
 117         }
 118         return '?';
 119 }
 120 static int lex_digraph(struct lex_file *file, int first) {
 121         int ch = lex_inget(file);
 122         switch (first) {
 123                 case '<':
 124                         if (ch == '%') return '{';
 125                         if (ch == ':') return '[';
 126                         break;
 127                 case '%':
 128                         if (ch == '>') return '}';
 129                         if (ch == ':') return '#';
 130                         break;
 131                 case ':':
 132                         if (ch == '>') return ']';
 133                         break;
 134         }
 135
 136         lex_unget(ch, file);
 137         return first;
 138 }
 139
 140 static int lex_getch(struct lex_file *file) {
 141         int ch = lex_inget(file);
 142
 143         static int str = 0;
 144         switch (ch) {
 145                 case '?' :
 146                         return lex_trigraph(file);
 147                 case '<' :
 148                 case ':' :
 149                 case '%' :
 150                 case '"' : str = !str; if (str) { file->line ++; }
 151                         return lex_digraph(file, ch);
 152
 153                 case '\n':
 154                         if (!str)
 155                                 file->line++;
 156         }
 157
 158         return ch;
 159 }
 160
 161 static int lex_get(struct lex_file *file) {
 162         int ch;
 163         if (!isspace(ch = lex_getch(file)))
 164                 return ch;
 165
 166         /* skip over all spaces */
 167         while (isspace(ch) && ch != '\n')
 168                 ch = lex_getch(file);
 169
 170         if (ch == '\n')
 171                 return ch;
 172         lex_unget(ch, file);
 173         return ' ';
 174 }
 175
 176 static int lex_skipchr(struct lex_file *file) {
 177         int ch;
 178         int it;
 179
 180         lex_clear(file);
 181         lex_addch('\'', file);
 182
 183         for (it = 0; it < 2 && ((ch = lex_inget(file)) != '\''); it++) {
 184                 lex_addch(ch, file);
 185
 186                 if (ch == '\n')
 187                         return ERROR_LEX;
 188                 if (ch == '\\')
 189                         lex_addch(lex_getch(file), file);
 190         }
 191         lex_addch('\'', file);
 192         lex_addch('\0', file);
 193
 194         if (it > 2)
 195                 return ERROR_LEX;
 196
 197         return LEX_CHRLIT;
 198 }
 199
 200 static int lex_skipstr(struct lex_file *file) {
 201         int ch;
 202         lex_clear(file);
 203         lex_addch('"', file);
 204
 205         while ((ch = lex_getch(file)) != '"') {
 206                 if (ch == '\n' || ch == EOF)
 207                         return ERROR_LEX;
 208
 209                 lex_addch(ch, file);
 210                 if (ch == '\\')
 211                         lex_addch(lex_inget(file), file);
 212         }
 213
 214         lex_addch('"', file);
 215         lex_addch('\0', file);
 216
 217         return LEX_STRLIT;
 218 }
 219 static int lex_skipcmt(struct lex_file *file) {
 220         int ch;
 221         lex_clear(file);
 222         ch = lex_getch(file);
 223
 224         if (ch == '/') {
 225                 lex_addch('/', file);
 226                 lex_addch('/', file);
 227
 228                 while ((ch = lex_getch(file)) != '\n') {
 229                         if (ch == '\\') {
 230                                 lex_addch(ch, file);
 231                                 lex_addch(lex_getch(file), file);
 232                         } else {
 233                                 lex_addch(ch, file);
 234                         }
 235                 }
 236                 lex_addch('\0', file);
 237                 return LEX_COMMENT;
 238         }
 239
 240         if (ch != '*') {
 241                 lex_unget(ch, file);
 242                 return '/';
 243         }
 244
 245         lex_addch('/', file);
 246
 247         /* hate this */
 248         do {
 249                 lex_addch(ch, file);
 250                 while ((ch = lex_getch(file)) != '*') {
 251                         if (ch == EOF)
 252                                 return error(file, ERROR_LEX, "malformatted comment");
 253                         else
 254                                 lex_addch(ch, file);
 255                 }
 256                 lex_addch(ch, file);
 257         } while ((ch = lex_getch(file)) != '/');
 258
 259         lex_addch('/',  file);
 260         lex_addch('\0', file);
 261
 262         return LEX_COMMENT;
 263 }
 264
 265 static int lex_getsource(struct lex_file *file) {
 266         int ch = lex_get(file);
 267
 268         /* skip char/string/comment */
 269         switch (ch) {
 270                 case '\'': return lex_skipchr(file);
 271                 case '"':  return lex_skipstr(file);
 272                 case '/':  return lex_skipcmt(file);
 273                 default:
 274                         return ch;
 275         }
 276 }
 277
 278 int lex_token(struct lex_file *file) {
 279         int ch = lex_getsource(file);
 280         int it;
 281
 282         /* valid identifier */
 283         if (ch > 0 && (ch == '_' || isalpha(ch))) {
 284                 lex_clear(file);
 285
 286                 /*
 287                  * Yes this is dirty, but there is no other _sane_ easy
 288                  * way to do it, this is what I call defensive programming
 289                  * if something breaks, add more defense :-)
 290                  */
 291                 while (ch >   0   && ch != ' ' && ch != '(' &&
 292                        ch != '\n' && ch != ';' && ch != ')') {
 293                         lex_addch(ch, file);
 294                         ch = lex_getsource(file);
 295                 }
 296                 lex_unget(ch,   file);
 297                 lex_addch('\0', file);
 298
 299                 /* look inside the table for a keyword .. */
 300                 for (it = 0; it < sizeof(lex_keywords)/sizeof(*lex_keywords); it++)
 301                         if (!strncmp(file->lastok, lex_keywords[it], sizeof(lex_keywords[it])))
 302                                 return it;
 303
 304                 /* try a type? */
 305                 #define TEST_TYPE(X)                                 \
 306                     do {                                             \
 307                         if (!strncmp(X, "float",  sizeof("float")))  \
 308                             return TOKEN_FLOAT;                      \
 309                         if (!strncmp(X, "vector", sizeof("vector"))) \
 310                             return TOKEN_VECTOR;                     \
 311                         if (!strncmp(X, "string", sizeof("string"))) \
 312                             return TOKEN_STRING;                     \
 313                         if (!strncmp(X, "entity", sizeof("entity"))) \
 314                             return TOKEN_ENTITY;                     \
 315                         if (!strncmp(X, "void"  , sizeof("void")))   \
 316                             return TOKEN_VOID;                       \
 317                     } while(0)
 318
 319                 TEST_TYPE(file->lastok);
 320
 321                 /* try the hashtable for typedefs? */
 322                 if (typedef_find(file->lastok))
 323                         TEST_TYPE(typedef_find(file->lastok)->name);
 324
 325                 #undef TEST_TYPE
 326                 return LEX_IDENT;
 327         }
 328         return ch;
 329 }
 330
 331 void lex_reset(struct lex_file *file) {
 332         file->current = 0;
 333         file->last    = 0;
 334         file->length  = file->size;
 335         fseek(file->file, 0, SEEK_SET);
 336
 337         memset(file->peek,   0, sizeof(file->peek  ));
 338         memset(file->lastok, 0, sizeof(file->lastok));
 339 }
 340
/*
 * Include a file into the lexer / parsing process:  This really
 * should check if names are the same to prevent endless include
 * recursion.
 */
 346 struct lex_file *lex_include(struct lex_file *lex, char *file) {
 347         util_strrq(file);
 348         if (strncmp(lex->name, file, strlen(lex->name)) == 0) {
 349                 error(lex, ERROR_LEX, "Source file cannot include itself\n");
 350                 exit (-1);
 351         }
 352
 353         FILE *fp = fopen(file, "r");
 354         if  (!fp) {
 355                 error(lex, ERROR_LEX, "Include file `%s` doesn't exist\n", file);
 356                 exit (-1);
 357         }
 358
 359         return lex_open(fp);
 360 }