lex.c

   1 /*
   2  * Copyright (C) 2012
   3  *      Dale Weiler
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a copy of
   6  * this software and associated documentation files (the "Software"), to deal in
   7  * the Software without restriction, including without limitation the rights to
   8  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
   9  * of the Software, and to permit persons to whom the Software is furnished to do
  10  * so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be included in all
  13  * copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21  * SOFTWARE.
  22  */
  23 #include "gmqcc.h"
  24
  25 /*
  26  * Keywords are multichar, punctuation lexing is a bit more complicated
  27  * than keyword lexing.
  28  */
  29 static const char *const lex_keywords[] = {
  30         "do",    "else",     "if",     "while",
  31         "break", "continue", "return", "goto",
  32         "for",   "typedef"
  33 };
  34
  35 struct lex_file *lex_open(FILE *fp) {
  36         struct lex_file *lex = mem_a(sizeof(struct lex_file));
  37         if (!lex || !fp)
  38                 return NULL;
  39
  40         lex->file = fp;
  41         fseek(lex->file, 0, SEEK_END);
  42         lex->length = ftell(lex->file);
  43         lex->size   = lex->length; /* copy, this is never changed */
  44         fseek(lex->file, 0, SEEK_SET);
  45         lex->last = 0;
  46         lex->line = 0;
  47
  48         memset(lex->peek, 0, sizeof(lex->peek));
  49         return lex;
  50 }
  51
  52 void lex_close(struct lex_file *file) {
  53         if (!file) return;
  54
  55         fclose(file->file); /* may already be closed */
  56         mem_d (file);
  57 }
  58
  59 static void lex_addch(int ch, struct lex_file *file) {
  60         if (file->current <  sizeof(file->lastok)-1)
  61                 file->lastok[file->current++] = (char)ch;
  62         if (file->current == sizeof(file->lastok)-1)
  63                 file->lastok[file->current]   = (char)'\0';
  64 }
  65 static inline void lex_clear(struct lex_file *file) {
  66         file->current = 0;
  67 }
  68
  69 /*
  70  * read in inget/unget character from a lexer stream.
  71  * This doesn't play with file streams, the lexer has
  72  * it's own internal state for this.
  73  */
  74 static int lex_inget(struct lex_file *file) {
  75         file->length --;
  76         if (file->last > 0)
  77                 return file->peek[--file->last];
  78         return fgetc(file->file);
  79 }
  80 static void lex_unget(int ch, struct lex_file *file) {
  81         if (file->last < sizeof(file->peek))
  82                 file->peek[file->last++] = ch;
  83         file->length ++;
  84 }
  85
  86 /*
  87  * This is trigraph and digraph support, a feature not qc compiler
  88  * supports.  Moving up in this world!
  89  */
  90 static int lex_trigraph(struct lex_file *file) {
  91         int  ch;
  92         if ((ch = lex_inget(file)) != '?') {
  93                 lex_unget(ch, file);
  94                 return '?';
  95         }
  96
  97         ch = lex_inget(file);
  98         switch (ch) {
  99                 case '(' : return '[' ;
 100                 case ')' : return ']' ;
 101                 case '/' : return '\\';
 102                 case '\'': return '^' ;
 103                 case '<' : return '{' ;
 104                 case '>' : return '}' ;
 105                 case '!' : return '|' ;
 106                 case '-' : return '~' ;
 107                 case '=' : return '#' ;
 108                 default:
 109                         lex_unget('?', file);
 110                         lex_unget(ch , file);
 111                         return '?';
 112         }
 113         return '?';
 114 }
 115 static int lex_digraph(struct lex_file *file, int first) {
 116         int ch = lex_inget(file);
 117         switch (first) {
 118                 case '<':
 119                         if (ch == '%') return '{';
 120                         if (ch == ':') return '[';
 121                         break;
 122                 case '%':
 123                         if (ch == '>') return '}';
 124                         if (ch == ':') return '#';
 125                         break;
 126                 case ':':
 127                         if (ch == '>') return ']';
 128                         break;
 129         }
 130
 131         lex_unget(ch, file);
 132         return first;
 133 }
 134
 135 static int lex_getch(struct lex_file *file) {
 136         int ch = lex_inget(file);
 137
 138         static int str = 0;
 139         switch (ch) {
 140                 case '?' :
 141                         return lex_trigraph(file);
 142                 case '<' :
 143                 case ':' :
 144                 case '%' :
 145                 case '"' : str = !str; if (str) { file->line ++; }
 146                         return lex_digraph(file, ch);
 147
 148                 case '\n':
 149                         if (!str)
 150                                 file->line++;
 151         }
 152
 153         return ch;
 154 }
 155
 156 static int lex_get(struct lex_file *file) {
 157         int ch;
 158         if (!isspace(ch = lex_getch(file)))
 159                 return ch;
 160
 161         /* skip over all spaces */
 162         while (isspace(ch) && ch != '\n')
 163                 ch = lex_getch(file);
 164
 165         if (ch == '\n')
 166                 return ch;
 167         lex_unget(ch, file);
 168         return ' ';
 169 }
 170
 171 static int lex_skipchr(struct lex_file *file) {
 172         int ch;
 173         int it;
 174
 175         lex_clear(file);
 176         lex_addch('\'', file);
 177
 178         for (it = 0; it < 2 && ((ch = lex_inget(file)) != '\''); it++) {
 179                 lex_addch(ch, file);
 180
 181                 if (ch == '\n')
 182                         return ERROR_LEX;
 183                 if (ch == '\\')
 184                         lex_addch(lex_getch(file), file);
 185         }
 186         lex_addch('\'', file);
 187         lex_addch('\0', file);
 188
 189         if (it > 2)
 190                 return ERROR_LEX;
 191
 192         return LEX_CHRLIT;
 193 }
 194
 195 static int lex_skipstr(struct lex_file *file) {
 196         int ch;
 197         lex_clear(file);
 198         lex_addch('"', file);
 199
 200         while ((ch = lex_getch(file)) != '"') {
 201                 if (ch == '\n' || ch == EOF)
 202                         return ERROR_LEX;
 203
 204                 lex_addch(ch, file);
 205                 if (ch == '\\')
 206                         lex_addch(lex_inget(file), file);
 207         }
 208
 209         lex_addch('"', file);
 210         lex_addch('\0', file);
 211
 212         return LEX_STRLIT;
 213 }
 214 static int lex_skipcmt(struct lex_file *file) {
 215         int ch;
 216         lex_clear(file);
 217         ch = lex_getch(file);
 218
 219         if (ch == '/') {
 220                 lex_addch('/', file);
 221                 lex_addch('/', file);
 222
 223                 while ((ch = lex_getch(file)) != '\n') {
 224                         if (ch == '\\') {
 225                                 lex_addch(ch, file);
 226                                 lex_addch(lex_getch(file), file);
 227                         } else {
 228                                 lex_addch(ch, file);
 229                         }
 230                 }
 231                 lex_addch('\0', file);
 232                 return LEX_COMMENT;
 233         }
 234
 235         if (ch != '*') {
 236                 lex_unget(ch, file);
 237                 return '/';
 238         }
 239
 240         lex_addch('/', file);
 241
 242         /* hate this */
 243         do {
 244                 lex_addch(ch, file);
 245                 while ((ch = lex_getch(file)) != '*') {
 246                         if (ch == EOF)
 247                                 return error(file, ERROR_LEX, "malformatted comment");
 248                         else
 249                                 lex_addch(ch, file);
 250                 }
 251                 lex_addch(ch, file);
 252         } while ((ch = lex_getch(file)) != '/');
 253
 254         lex_addch('/',  file);
 255         lex_addch('\0', file);
 256
 257         return LEX_COMMENT;
 258 }
 259
 260 static int lex_getsource(struct lex_file *file) {
 261         int ch = lex_get(file);
 262
 263         /* skip char/string/comment */
 264         switch (ch) {
 265                 case '\'': return lex_skipchr(file);
 266                 case '"':  return lex_skipstr(file);
 267                 case '/':  return lex_skipcmt(file);
 268                 default:
 269                         return ch;
 270         }
 271 }
 272
 273 int lex_token(struct lex_file *file) {
 274         int ch = lex_getsource(file);
 275         int it;
 276
 277         /* valid identifier */
 278         if (ch > 0 && (ch == '_' || isalpha(ch))) {
 279                 lex_clear(file);
 280
 281                 /*
 282                  * Yes this is dirty, but there is no other _sane_ easy
 283                  * way to do it, this is what I call defensive programming
 284                  * if something breaks, add more defense :-)
 285                  */
 286                 while (ch >   0   && ch != ' ' && ch != '(' &&
 287                        ch != '\n' && ch != ';' && ch != ')') {
 288                         lex_addch(ch, file);
 289                         ch = lex_getsource(file);
 290                 }
 291                 lex_unget(ch,   file);
 292                 lex_addch('\0', file);
 293
 294                 /* look inside the table for a keyword .. */
 295                 for (it = 0; it < sizeof(lex_keywords)/sizeof(*lex_keywords); it++)
 296                         if (!strncmp(file->lastok, lex_keywords[it], sizeof(lex_keywords[it])))
 297                                 return it;
 298
 299                 /* try a type? */
 300                 #define TEST_TYPE(X)                                 \
 301                     do {                                             \
 302                         if (!strncmp(X, "float",  sizeof("float")))  \
 303                             return TOKEN_FLOAT;                      \
 304                         if (!strncmp(X, "vector", sizeof("vector"))) \
 305                             return TOKEN_VECTOR;                     \
 306                         if (!strncmp(X, "string", sizeof("string"))) \
 307                             return TOKEN_STRING;                     \
 308                         if (!strncmp(X, "entity", sizeof("entity"))) \
 309                             return TOKEN_ENTITY;                     \
 310                         if (!strncmp(X, "void"  , sizeof("void")))   \
 311                             return TOKEN_VOID;                       \
 312                     } while(0)
 313
 314                 TEST_TYPE(file->lastok);
 315
 316                 /* try the hashtable for typedefs? */
 317                 if (typedef_find(file->lastok))
 318                         TEST_TYPE(typedef_find(file->lastok)->name);
 319
 320                 #undef TEST_TYPE
 321                 return LEX_IDENT;
 322         }
 323         return ch;
 324 }
 325
 326 void lex_reset(struct lex_file *file) {
 327         file->current = 0;
 328         file->last    = 0;
 329         file->length  = file->size;
 330         fseek(file->file, 0, SEEK_SET);
 331
 332         memset(file->peek,   0, sizeof(file->peek  ));
 333         memset(file->lastok, 0, sizeof(file->lastok));
 334 }
 335
 336 /*
 337  * Include a file into the lexer / parsing process:  This really
 338  * should check if names are the same to prevent endless include
 339  * recrusion.
 340  */
 341 struct lex_file *lex_include(struct lex_file *lex, char *file) {
 342         util_strrq(file);
 343         if (strncmp(lex->name, file, strlen(lex->name)) == 0) {
 344                 error(lex, ERROR_LEX, "Source file cannot include itself\n");
 345                 exit (-1);
 346         }
 347
 348         FILE *fp = fopen(file, "r");
 349         if  (!fp) {
 350                 error(lex, ERROR_LEX, "Include file `%s` doesn't exist\n", file);
 351                 exit (-1);
 352         }
 353
 354         return lex_open(fp);
 355 }