Copying my old lexer

author Wolfgang Bumiller <wolfgang.linux@bumiller.com>

Mon, 16 Jul 2012 11:59:10 +0000 (13:59 +0200)

committer Wolfgang Bumiller <wolfgang.linux@bumiller.com>

Mon, 16 Jul 2012 11:59:10 +0000 (13:59 +0200)
author Wolfgang Bumiller <wolfgang.linux@bumiller.com>
Mon, 16 Jul 2012 11:59:10 +0000 (13:59 +0200)
committer Wolfgang Bumiller <wolfgang.linux@bumiller.com>
Mon, 16 Jul 2012 11:59:10 +0000 (13:59 +0200)
diff --git a/Makefile b/Makefile

index feaf6111fbd4bf84549c99d2e8dc5ac7f2c48e59..9101d2b12a3600ae8e410e3b8ea8d31decdcd8de 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -17,14 +17,13 @@ ifeq ($(CC), clang)
  
  endif
  OBJ     = \
-          error.o     \
            util.o      \
            code.o      \
            ast.o       \
            ir.o
  OBJ_A = test/ast-test.o
  OBJ_I = test/ir-test.o
-OBJ_C = main.o
+OBJ_C = main.o lexer.o parser.o
  
  #default is compiler only
  default: gmqcc
diff --git a/gmqcc.h b/gmqcc.h

index 85cb9ae5461b721b3a42758697af6d3db5885e5f..e5d37af936063b7d5601c93791dc959c35119dd5 100644 (file)
--- a/gmqcc.h
+++ b/gmqcc.h
@@ -179,95 +179,6 @@ typedef char int64_size_is_correct  [sizeof(int64_t)  == 8?1:-1];
  typedef char uintptr_size_is_correct[sizeof(intptr_t) == sizeof(int*)?1:-1];
  typedef char intptr_size_is_correct [sizeof(uintptr_t)== sizeof(int*)?1:-1];
  
-/*===================================================================*/
-/*============================ lex.c ================================*/
-/*===================================================================*/
-typedef struct lex_file_t {
-    FILE *file;        /* file handler */
-    char *name;        /* name of file */
-    char  peek  [5];
-    char  lastok[8192];
-
-    int   last;    /* last token                   */
-    int   current; /* current token                */
-    int   length;  /* bytes left to parse          */
-    int   size;    /* never changes (size of file) */
-    int   line;    /* what line are we on?         */
-} lex_file;
-
-/*
- * It's important that this table never exceed 32 keywords, the ascii
- * table starts at 33 (and we don't want conflicts)
- */
-enum {
-    TOKEN_DO       ,
-    TOKEN_ELSE     ,
-    TOKEN_IF       ,
-    TOKEN_WHILE    ,
-    TOKEN_BREAK    ,
-    TOKEN_CONTINUE ,
-    TOKEN_RETURN   ,
-    TOKEN_GOTO     ,
-    TOKEN_FOR      ,   /* extension */
-    TOKEN_TYPEDEF  ,   /* extension */
-
-    /* ensure the token types are out of the  */
-    /* bounds of anyothers that may conflict. */
-    TOKEN_FLOAT    = 110,
-    TOKEN_VECTOR        ,
-    TOKEN_STRING        ,
-    TOKEN_ENTITY        ,
-    TOKEN_VOID
-};
-
-/*
- * Lexer state constants, these are numbers for where exactly in
- * the lexing the lexer is at. Or where it decided to stop if a lexer
- * error occurs.  These numbers must be > where the ascii-table ends
- * and > the last type token which is TOKEN_VOID
- */
-enum {
-    LEX_COMMENT = 1128,
-    LEX_CHRLIT        ,
-    LEX_STRLIT        ,
-    LEX_IDENT
-};
-
-int       lex_token  (lex_file *);
-void      lex_reset  (lex_file *);
-void      lex_close  (lex_file *);
-void      lex_parse  (lex_file *);
-lex_file *lex_include(lex_file *, const char *);
-void      lex_init   (const char *, lex_file **);
-
-/*===================================================================*/
-/*========================== error.c ================================*/
-/*===================================================================*/
-#define ERROR_LEX      (SHRT_MAX+0)
-#define ERROR_PARSE    (SHRT_MAX+1)
-#define ERROR_INTERNAL (SHRT_MAX+2)
-#define ERROR_COMPILER (SHRT_MAX+3)
-#define ERROR_PREPRO   (SHRT_MAX+4)
-int error(lex_file *, int, const char *, ...);
-
-/*===================================================================*/
-/*========================== parse.c ================================*/
-/*===================================================================*/
-int parse_gen(lex_file *);
-
-/*===================================================================*/
-/*========================== typedef.c ==============================*/
-/*===================================================================*/
-typedef struct typedef_node_t {
-    char      *name;
-} typedef_node;
-
-void          typedef_init();
-void          typedef_clear();
-typedef_node *typedef_find(const char *);
-int           typedef_add (lex_file *file, const char *, const char *);
-
-
  /*===================================================================*/
  /*=========================== util.c ================================*/
  /*===================================================================*/
@@ -364,7 +275,7 @@ enum {
      TYPE_FIELD    ,
      TYPE_FUNCTION ,
      TYPE_POINTER  ,
-    /* TYPE_INTEGER  , */
+    TYPE_INTEGER  ,
      TYPE_QUATERNION  ,
      TYPE_MATRIX  ,
      TYPE_VARIANT  ,
diff --git a/lexer.c b/lexer.c

new file mode 100644 (file)

index 0000000..867fb94
--- /dev/null
+++ b/lexer.c
@@ -0,0 +1,632 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include "gmqcc.h"
+#include "lexer.h"
+
+MEM_VEC_FUNCTIONS(token, char, value)
+
+/* Print a lexer diagnostic to stdout.
+ * With a lexer given, the message is prefixed by the file name and the
+ * line on which the current token started (lex->sline); without one only
+ * a plain "error: " prefix is used. A newline is always appended.
+ */
+void lexerror(lex_file *lex, const char *fmt, ...)
+{
+       va_list args;
+
+       if (!lex)
+               printf("error: ");
+       else
+               printf("error %s:%lu: ", lex->name, (unsigned long)lex->sline);
+
+       va_start(args, fmt);
+       vprintf(fmt, args);
+       va_end(args);
+
+       printf("\n");
+}
+
+/* Allocate a token with every field zeroed; returns NULL when mem_a fails. */
+token* token_new()
+{
+       token *fresh = (token*)mem_a(sizeof(token));
+       if (fresh)
+               memset(fresh, 0, sizeof(*fresh));
+       return fresh;
+}
+
+/* Unlink a token from its neighbours (where they point back at it),
+ * then release its value buffer and the token itself.
+ */
+void token_delete(token *self)
+{
+       token *after  = self->next;
+       token *before = self->prev;
+
+       if (after && after->prev == self)
+               after->prev = before;
+       if (before && before->next == self)
+               before->next = after;
+
+       MEM_VECTOR_CLEAR(self, value);
+       mem_d(self);
+}
+
+/* Create a detached deep copy of a token.
+ * The value buffer is duplicated and NUL-terminated; type, context and
+ * constant value are copied. next/prev of the copy stay NULL, as set up
+ * by token_new. Returns NULL if an allocation fails.
+ */
+token* token_copy(const token *cp)
+{
+       token* self = token_new();
+       if (!self)
+               return NULL;
+
+       /* copy the value, reserving one extra byte for the terminator */
+       self->value_alloc = cp->value_count + 1;
+       self->value_count = cp->value_count;
+       self->value = (char*)mem_a(self->value_alloc);
+       if (!self->value) {
+               mem_d(self);
+               return NULL;
+       }
+       /* A token which never received a character still has value == NULL
+        * (token_new zeroes it). memcpy must not be handed a NULL source,
+        * not even for a length of 0, so skip the call in that case.
+        */
+       if (cp->value_count)
+               memcpy(self->value, cp->value, cp->value_count);
+       self->value[self->value_alloc-1] = 0;
+
+       /* rest */
+       self->ctx = cp->ctx;
+       self->ttype = cp->ttype;
+       memcpy(&self->constval, &cp->constval, sizeof(self->constval));
+       return self;
+}
+
+/* Delete a token and every token following it via ->next.
+ * A NULL list is accepted and ignored.
+ */
+void token_delete_all(token *t)
+{
+       while (t) {
+               /* fetch the successor first, token_delete frees t */
+               token *n = t->next;
+               token_delete(t);
+               t = n;
+       }
+}
+
+/* Deep-copy a whole token list, starting at cp and following ->next.
+ * The copies are linked up in both directions. If any single copy fails,
+ * everything copied so far is deleted and NULL is returned.
+ */
+token* token_copy_all(const token *cp)
+{
+       token *head;
+       token *tail;
+
+       head = token_copy(cp);
+       if (!head)
+               return NULL;
+
+       for (tail = head, cp = cp->next; cp; cp = cp->next) {
+               token *dup = token_copy(cp);
+               tail->next = dup;
+               if (!dup) {
+                       token_delete_all(head);
+                       return NULL;
+               }
+               dup->prev = tail;
+               tail = dup;
+       }
+
+       return head;
+}
+
+/* Open a source file for lexing.
+ * Returns a zero-initialized lexer holding the file handle, a private copy
+ * of the file name and a line counter of 1, or NULL (after reporting the
+ * problem through lexerror) if the file cannot be opened or memory runs out.
+ */
+lex_file* lex_open(const char *file)
+{
+       lex_file *lex;
+       FILE *in = fopen(file, "rb");
+
+       /* lexerror appends the newline itself, so the messages carry none */
+       if (!in) {
+               lexerror(NULL, "open failed: '%s'", file);
+               return NULL;
+       }
+
+       lex = (lex_file*)mem_a(sizeof(*lex));
+       if (!lex) {
+               fclose(in);
+               lexerror(NULL, "out of memory");
+               return NULL;
+       }
+
+       /* zeroing also resets peekpos, tok and the flags */
+       memset(lex, 0, sizeof(*lex));
+
+       lex->file = in;
+       lex->line = 1; /* we start counting at 1 */
+
+       /* The name is printed by lexerror and stored in every token context,
+        * so a failed copy must not go unnoticed.
+        * NOTE(review): assumes util_strdup returns NULL on failure - confirm.
+        */
+       lex->name = util_strdup(file);
+       if (!lex->name) {
+               fclose(in);
+               mem_d(lex);
+               lexerror(NULL, "out of memory");
+               return NULL;
+       }
+
+       return lex;
+}
+
+/* Tear down a lexer: close the file if one is open, delete the current
+ * token if there is one, then free the name and the lexer itself.
+ */
+void lex_close(lex_file *lex)
+{
+       FILE  *in  = lex->file;
+       token *cur = lex->tok;
+
+       if (in)
+               fclose(in);
+       if (cur)
+               token_delete(cur);
+
+       mem_d(lex->name);
+       mem_d(lex);
+}
+
+/* Get or put-back data
+ * The following two functions do NOT understand what kind of data they
+ * are working on.
+ * They are merely wrapping get/put in order to count line numbers.
+ */
+/* Fetch the next character: pushed-back characters are handed out first
+ * (most recently pushed first), otherwise the file is read.
+ * Every '\n' that is handed out bumps lex->line.
+ */
+static int lex_getch(lex_file *lex)
+{
+       int ch;
+
+       if (lex->peekpos)
+               ch = lex->peek[--lex->peekpos];
+       else
+               ch = fgetc(lex->file);
+
+       if (ch == '\n')
+               lex->line++;
+       return ch;
+}
+
+/* Push a character back so that the next lex_getch returns it again;
+ * pushing back a '\n' takes back the line increment it caused.
+ */
+static void lex_ungetch(lex_file *lex, int ch)
+{
+       if (ch == '\n')
+               lex->line--;
+       lex->peek[lex->peekpos] = ch;
+       lex->peekpos++;
+}
+
+/* classify characters
+ * some additions to the is*() functions of ctype.h
+ */
+
+/* Idents are alphanumeric, but their first character has to be
+ * a letter or an underscore.
+ */
+static bool isident_start(int ch)
+{
+       if (ch == '_')
+               return true;
+       return isalpha(ch) ? true : false;
+}
+
+/* Any character which may appear inside an identifier:
+ * a decimal digit or anything an identifier may start with.
+ */
+static bool isident(int ch)
+{
+       if (isident_start(ch))
+               return true;
+       return isdigit(ch) ? true : false;
+}
+
+/* Matches the letters of a hexadecimal number only (a-f and A-F).
+ * Meant for callers which have already ruled out a decimal digit.
+ */
+static bool isxdigit_only(int ch)
+{
+       switch (ch)
+       {
+               case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+               case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+                       return true;
+               default:
+                       return false;
+       }
+}
+
+/* Skip whitespace and comments and return the first
+ * non-white character.
+ * As this makes use of the above getch() ungetch() functions,
+ * we don't need to care at all about line numbering anymore.
+ *
+ * In theory, this function should only be used at the beginning
+ * of lexing, or when we *know* the next character is part of the token.
+ * Otherwise, if the parser throws an error, the linenumber may not be
+ * the line of the error, but the line of the next token AFTER the error.
+ *
+ * This is currently only problematic when using c-like string-continuation,
+ * since comments and whitespaces are allowed between 2 such strings.
+ * Example:
+printf(   "line one\n"
+// A comment
+          "A continuation of the previous string"
+// This line is skipped
+      , foo);
+
+ * In this case, if the parse decides it didn't actually want a string,
+ * and uses lex->line to print an error, it will show the ', foo);' line's
+ * linenumber.
+ *
+ * On the other hand, the parser is supposed to remember the line of the next
+ * token's beginning. In this case we would want skipwhite() to be called
+ * AFTER reading a token, so that the parser, before reading the NEXT token,
+ * doesn't store the *comment's* linenumber, but the actual token's linenumber.
+ *
+ * THIS SOLUTION
+ *    here is to store the line of the first character after skipping
+ *    the initial whitespace in lex->sline, this happens in lex_do.
+ */
+/* Skip whitespace as well as one-line and multiline comments.
+ * Returns the first character which belongs to neither (it has been
+ * consumed, the caller owns it), or EOF. A '/' which does not start a
+ * comment is returned as '/', with the character after it pushed back.
+ */
+static int lex_skipwhite(lex_file *lex)
+{
+       int ch = 0;
+
+       do
+       {
+               ch = lex_getch(lex);
+               while (ch != EOF && isspace(ch)) ch = lex_getch(lex);
+
+               if (ch == '/') {
+                       ch = lex_getch(lex);
+                       if (ch == '/')
+                       {
+                               /* one line comment */
+                               ch = lex_getch(lex);
+
+                               /* check for special: '/', '/', '*', '/' */
+                               if (ch == '*') {
+                                       ch = lex_getch(lex);
+                                       if (ch == '/') {
+                                               ch = ' ';
+                                               continue;
+                                       }
+                               }
+
+                               while (ch != EOF && ch != '\n') {
+                                       ch = lex_getch(lex);
+                               }
+                               continue;
+                       }
+                       if (ch == '*')
+                       {
+                               /* multiline comment */
+                               ch = lex_getch(lex);
+                               while (ch != EOF)
+                               {
+                                       if (ch != '*') {
+                                               ch = lex_getch(lex);
+                                               continue;
+                                       }
+                                       /* Saw a star: only a slash right behind it closes
+                                        * the comment. Any other character is looked at
+                                        * again by the loop instead of being dropped, as
+                                        * it may itself be the star in front of the
+                                        * closing slash (comments ending in several stars).
+                                        */
+                                       ch = lex_getch(lex);
+                                       if (ch == '/') {
+                                               ch = lex_getch(lex);
+                                               break;
+                                       }
+                               }
+                               if (ch == '/') /* allow *//* direct following comment */
+                               {
+                                       lex_ungetch(lex, ch);
+                                       ch = ' '; /* cause TRUE in the isspace check */
+                               }
+                               continue;
+                       }
+                       /* Otherwise roll back to the slash and break out of the loop */
+                       lex_ungetch(lex, ch);
+                       ch = '/';
+                       break;
+               }
+       } while (ch != EOF && isspace(ch));
+
+       return ch;
+}
+
+/* Add one character to the value of the current token.
+ * Reports "out of memory" and returns false if the buffer cannot grow.
+ */
+static bool GMQCC_WARN lex_tokench(lex_file *lex, int ch)
+{
+       if (token_value_add(lex->tok, ch))
+               return true;
+
+       lexerror(lex, "out of memory");
+       return false;
+}
+
+/* NUL-terminate the value of the current token.
+ * The terminator is appended and then taken out of value_count again,
+ * so it sits behind the value without counting as part of it.
+ */
+static bool GMQCC_WARN lex_endtoken(lex_file *lex)
+{
+       token *tok = lex->tok;
+
+       if (token_value_add(tok, 0)) {
+               tok->value_count--;
+               return true;
+       }
+
+       lexerror(lex, "out of memory");
+       return false;
+}
+
+/* Read the remaining characters of an identifier into the token buffer.
+ * The first character which is not part of the identifier (possibly EOF)
+ * is pushed back. Returns false, with the token type set to TOKEN_FATAL,
+ * when the buffer cannot grow; true otherwise.
+ */
+static bool GMQCC_WARN lex_finish_ident(lex_file *lex)
+{
+       int ch;
+
+       ch = lex_getch(lex);
+       while (ch != EOF && isident(ch))
+       {
+               if (!lex_tokench(lex, ch)) {
+                       /* Do not return the token type from here: TOKEN_FATAL
+                        * is non-zero and would read as success to the caller.
+                        */
+                       lex->tok->ttype = TOKEN_FATAL;
+                       return false;
+               }
+               ch = lex_getch(lex);
+       }
+
+       /* last ch was not an ident ch: */
+       lex_ungetch(lex, ch);
+
+       return true;
+}
+
+/* Read the body of a string or character constant up to the closing quote.
+ * Escape sequences are stored verbatim (the backslash plus the character
+ * after it); the lexer only cares about them to not end the constant early.
+ * Returns TOKEN_STRINGCONST once the closing quote was read. On end of file
+ * the token type is set to TOKEN_ERROR, on allocation failure to
+ * TOKEN_FATAL, and that type is returned.
+ */
+static int GMQCC_WARN lex_finish_string(lex_file *lex, int quote)
+{
+       int ch;
+
+       for (;;)
+       {
+               ch = lex_getch(lex);
+               if (ch == quote)
+                       return TOKEN_STRINGCONST;
+
+               /* stop here, before EOF gets stored as if it were a character */
+               if (ch == EOF)
+                       break;
+
+               if (!lex_tokench(lex, ch))
+                       return (lex->tok->ttype = TOKEN_FATAL);
+
+               /* as lexer we only care about \" to not terminate the string prematurely */
+               if (ch == '\\') {
+                       ch = lex_getch(lex);
+                       if (ch == EOF) {
+                               lexerror(lex, "unexpected end of file");
+                               lex_ungetch(lex, EOF); /* next token to be TOKEN_EOF */
+                               return (lex->tok->ttype = TOKEN_ERROR);
+                       }
+                       /* so we just add the next character no matter what it actually is */
+                       if (!lex_tokench(lex, ch))
+                               return (lex->tok->ttype = TOKEN_FATAL);
+               }
+       }
+       lexerror(lex, "unexpected end of file within string constant");
+       lex_ungetch(lex, EOF); /* next token to be TOKEN_EOF */
+       return (lex->tok->ttype = TOKEN_ERROR);
+}
+
+/* Lex a numeric constant whose first digit, lastch, was already read.
+ * Handles decimal integers, hex integers introduced by a lowercase "0x",
+ * and floats with a decimal point and an optional trailing 'f'.
+ * The number is stored in constval.i or constval.f; the token type
+ * (TOKEN_INTCONST, TOKEN_FLOATCONST, TOKEN_ERROR or TOKEN_FATAL) is both
+ * stored in the token and returned.
+ */
+static int GMQCC_WARN lex_finish_digit(lex_file *lex, int lastch)
+{
+       bool ishex = false;
+
+       int  ch = lastch;
+
+       /* parse a number... */
+       lex->tok->ttype = TOKEN_INTCONST;
+
+       if (!lex_tokench(lex, ch))
+               return (lex->tok->ttype = TOKEN_FATAL);
+
+       ch = lex_getch(lex);
+       if (ch != '.' && !isdigit(ch))
+       {
+               /* neither a second digit nor a decimal point: unless this is
+                * the 'x' of a "0x" prefix, the number is a single digit
+                */
+               if (lastch != '0' || ch != 'x')
+               {
+                       /* end of the number or EOF */
+                       lex_ungetch(lex, ch);
+                       if (!lex_endtoken(lex))
+                               return (lex->tok->ttype = TOKEN_FATAL);
+
+                       lex->tok->constval.i = lastch - '0';
+                       return lex->tok->ttype;
+               }
+
+               ishex = true;
+       }
+
+       /* EOF would have been caught above */
+
+       if (ch != '.')
+       {
+               /* ch is the second digit or the 'x'; store it and collect
+                * the remaining (hex) digits
+                */
+               if (!lex_tokench(lex, ch))
+                       return (lex->tok->ttype = TOKEN_FATAL);
+               ch = lex_getch(lex);
+               while (isdigit(ch) || (ishex && isxdigit_only(ch)))
+               {
+                       if (!lex_tokench(lex, ch))
+                               return (lex->tok->ttype = TOKEN_FATAL);
+                       ch = lex_getch(lex);
+               }
+       }
+       /* NOT else, '.' can come from above as well */
+       if (ch == '.' && !ishex)
+       {
+               /* Allow floating comma in non-hex mode */
+               lex->tok->ttype = TOKEN_FLOATCONST;
+               if (!lex_tokench(lex, ch))
+                       return (lex->tok->ttype = TOKEN_FATAL);
+
+               /* continue digits-only */
+               ch = lex_getch(lex);
+               while (isdigit(ch))
+               {
+                       if (!lex_tokench(lex, ch))
+                               return (lex->tok->ttype = TOKEN_FATAL);
+                       ch = lex_getch(lex);
+               }
+       }
+       /* put back the last character */
+       /* but do not put back the trailing 'f' or a float */
+       if (lex->tok->ttype == TOKEN_FLOATCONST && ch == 'f')
+               ch = lex_getch(lex);
+
+       /* generally we don't want words to follow numbers: */
+       if (isident(ch)) {
+               /* note: the offending character is not pushed back here */
+               lexerror(lex, "unexpected trailing characters after number");
+               return (lex->tok->ttype = TOKEN_ERROR);
+       }
+       lex_ungetch(lex, ch);
+
+       if (!lex_endtoken(lex))
+               return (lex->tok->ttype = TOKEN_FATAL);
+       /* the terminated token text is converted by the C library;
+        * base 0 lets strtol handle the "0x" prefix
+        */
+       if (lex->tok->ttype == TOKEN_FLOATCONST)
+               lex->tok->constval.f = strtod(lex->tok->value, NULL);
+       else
+               lex->tok->constval.i = strtol(lex->tok->value, NULL, 0);
+       return lex->tok->ttype;
+}
+
+/* Lex the next token of the file.
+ * Any previous token is deleted and a fresh one is stored in lex->tok; its
+ * context records the file name and the line on which the token starts.
+ * Returns the token type, which is also stored in lex->tok->ttype
+ * (except when not even the token could be allocated): a TOKEN_* value,
+ * or the character itself for single-character tokens.
+ */
+int lex_do(lex_file *lex)
+{
+       int ch, nextch;
+
+       if (lex->tok)
+               token_delete(lex->tok);
+       lex->tok = token_new();
+       if (!lex->tok)
+               return TOKEN_FATAL;
+
+       /* sline is the line of the first character after the whitespace */
+       ch = lex_skipwhite(lex);
+       lex->sline = lex->line;
+       lex->tok->ctx.line = lex->sline;
+       lex->tok->ctx.file = lex->name;
+
+       if (ch == EOF)
+               return (lex->tok->ttype = TOKEN_EOF);
+
+       /* single-character tokens */
+       switch (ch)
+       {
+               case ';':
+               case '(':
+               case ')':
+               case '{':
+               case '}':
+               case '[':
+               case ']':
+
+               case ',':
+
+                       return (lex->tok->ttype = ch);
+               default:
+                       break;
+       }
+
+       if (lex->flags.noops)
+       {
+               /* Detect characters early which are normally
+                * operators OR PART of an operator.
+                */
+               switch (ch)
+               {
+                       case '+':
+                       case '-':
+                       case '*':
+                       case '/':
+                       case '%':
+                       case '<':
+                       case '>':
+                       case '=':
+                       case '&':
+                       case '|':
+                       case '^':
+                       case '~':
+                               /* store the type as well, like every other path does */
+                               return (lex->tok->ttype = ch);
+                       default:
+                               break;
+               }
+       }
+
+       if (ch == '+' || ch == '-' || /* ++, --, +=, -=  and -> as well! */
+           ch == '>' || ch == '<' || /* <<, >>, <=, >=, <<=, >>= */
+           ch == '=' ||              /* == */
+           ch == '&' || ch == '|')   /* &&, ||, &=, |= */
+       {
+               if (!lex_tokench(lex, ch))
+                       return (lex->tok->ttype = TOKEN_FATAL);
+
+               nextch = lex_getch(lex);
+               if (nextch == ch || nextch == '=') {
+                       if (!lex_tokench(lex, nextch))
+                               return (lex->tok->ttype = TOKEN_FATAL);
+                       /* a doubled shift character may still take a '=',
+                        * both shift-assignments are in the operators table
+                        */
+                       if (nextch == ch && (ch == '<' || ch == '>')) {
+                               nextch = lex_getch(lex);
+                               if (nextch == '=') {
+                                       if (!lex_tokench(lex, nextch))
+                                               return (lex->tok->ttype = TOKEN_FATAL);
+                               } else
+                                       lex_ungetch(lex, nextch);
+                       }
+               } else if (ch == '-' && nextch == '>') {
+                       if (!lex_tokench(lex, nextch))
+                               return (lex->tok->ttype = TOKEN_FATAL);
+               } else
+                       lex_ungetch(lex, nextch);
+
+               if (!lex_endtoken(lex))
+                       return (lex->tok->ttype = TOKEN_FATAL);
+               return (lex->tok->ttype = TOKEN_OPERATOR);
+       }
+
+       if (ch == '~')
+       {
+               if (!lex_tokench(lex, ch) ||
+                       !lex_endtoken(lex))
+               {
+                       return (lex->tok->ttype = TOKEN_FATAL);
+               }
+               return (lex->tok->ttype = TOKEN_OPERATOR);
+       }
+
+       /* Operators which stand alone or take a single '=':
+        * *=, /=, %=, ^= and != next to their plain forms.
+        * '!' must not go through the doubling rule above, "!!" is
+        * two operators.
+        */
+       if (ch == '*' || ch == '/' || ch == '%' || ch == '^' || ch == '!')
+       {
+               if (!lex_tokench(lex, ch))
+                       return (lex->tok->ttype = TOKEN_FATAL);
+
+               nextch = lex_getch(lex);
+               if (nextch == '=') {
+                       if (!lex_tokench(lex, nextch))
+                               return (lex->tok->ttype = TOKEN_FATAL);
+               } else
+                       lex_ungetch(lex, nextch);
+
+               if (!lex_endtoken(lex))
+                       return (lex->tok->ttype = TOKEN_FATAL);
+               return (lex->tok->ttype = TOKEN_OPERATOR);
+       }
+
+       if (isident_start(ch))
+       {
+               const char *v;
+               if (!lex_tokench(lex, ch))
+                       return (lex->tok->ttype = TOKEN_FATAL);
+               /* the only way lex_finish_ident can fail is by running
+                * out of memory, which is fatal rather than a plain error
+                */
+               if (!lex_finish_ident(lex))
+                       return (lex->tok->ttype = TOKEN_FATAL);
+               if (!lex_endtoken(lex))
+                       return (lex->tok->ttype = TOKEN_FATAL);
+               lex->tok->ttype = TOKEN_IDENT;
+
+               v = lex->tok->value;
+               if (!strcmp(v, "void") ||
+                   !strcmp(v, "int") ||
+                   !strcmp(v, "float") ||
+                   !strcmp(v, "vector") )
+               {
+                       lex->tok->ttype = TOKEN_TYPENAME;
+                       /* the second letter is enough to tell the four names apart */
+                       switch (v[1]) {
+                               case 'o': lex->tok->constval.t = TYPE_VOID;    break;
+                               case 'n': lex->tok->constval.t = TYPE_INTEGER; break;
+                               case 'l': lex->tok->constval.t = TYPE_FLOAT;   break;
+                               case 'e': lex->tok->constval.t = TYPE_VECTOR;  break;
+                               default:  break;
+                       }
+               }
+               else if (!strcmp(v, "for") ||
+                        !strcmp(v, "while") ||
+                        !strcmp(v, "do"))
+                       lex->tok->ttype = TOKEN_KEYWORD;
+
+               return lex->tok->ttype;
+       }
+
+       if (ch == '"')
+       {
+               lex->tok->ttype = lex_finish_string(lex, '"');
+               while (lex->tok->ttype == TOKEN_STRINGCONST)
+               {
+                       /* Allow c style "string" "continuation" */
+                       ch = lex_skipwhite(lex);
+                       if (ch != '"') {
+                               lex_ungetch(lex, ch);
+                               break;
+                       }
+
+                       lex->tok->ttype = lex_finish_string(lex, '"');
+               }
+               if (!lex_endtoken(lex))
+                       return (lex->tok->ttype = TOKEN_FATAL);
+               return lex->tok->ttype;
+       }
+
+       if (ch == '\'')
+       {
+               /* we parse character constants like string,
+                * but return TOKEN_CHARCONST, or a vector type if it fits...
+                * Likewise actual unescaping has to be done by the parser.
+                * The difference is we don't allow 'char' 'continuation'.
+                */
+               lex->tok->ttype = lex_finish_string(lex, '\'');
+               if (!lex_endtoken(lex))
+                       return (lex->tok->ttype = TOKEN_FATAL);
+
+               /* lex_finish_string reports success as TOKEN_STRINGCONST;
+                * translate that, and leave error types untouched
+                */
+               if (lex->tok->ttype == TOKEN_STRINGCONST)
+               {
+                       lex->tok->ttype = TOKEN_CHARCONST;
+
+                       /* It's a vector if we can successfully scan 3 floats */
+                       if (sscanf(lex->tok->value, " %f %f %f ", &lex->tok->constval.v.x, &lex->tok->constval.v.y, &lex->tok->constval.v.z) == 3)
+                       {
+                               lex->tok->ttype = TOKEN_VECTORCONST;
+                       }
+               }
+
+               return lex->tok->ttype;
+       }
+
+       if (isdigit(ch))
+       {
+               lex->tok->ttype = lex_finish_digit(lex, ch);
+               /* lex_finish_digit does not terminate the value on its
+                * "trailing characters" error path, so terminate here too
+                */
+               if (!lex_endtoken(lex))
+                       return (lex->tok->ttype = TOKEN_FATAL);
+               return lex->tok->ttype;
+       }
+
+       lexerror(lex, "unknown token");
+       return (lex->tok->ttype = TOKEN_ERROR);
+}
diff --git a/lexer.h b/lexer.h

new file mode 100644 (file)

index 0000000..f49b8ff
--- /dev/null
+++ b/lexer.h
@@ -0,0 +1,207 @@
+#ifndef GMQCC_LEXER_HDR_
+#define GMQCC_LEXER_HDR_
+
+typedef struct token_s token;
+
+#include "ast.h"
+
+/* A single lexed token. Tokens can be chained into a doubly linked list
+ * through next/prev.
+ */
+struct token_s {
+       /* a TOKEN_* value, or the character itself for single-character tokens */
+       int ttype;
+
+       /* the token text; accessed as value, value_count and value_alloc */
+       MEM_VECTOR_MAKE(char, value);
+
+       /* parsed value of a constant, or the type behind a type name */
+       union {
+               vector v;
+               int    i;
+               double f;
+               int    t; /* type */
+       } constval;
+
+       struct token_s *next;
+       struct token_s *prev;
+
+       /* file name and line on which the token starts */
+       lex_ctx ctx;
+};
+
+token* token_new();
+void   token_delete(token*);
+token* token_copy(const token *cp);
+void   token_delete_all(token *t);
+token* token_copy_all(const token *cp);
+
+/* Lexer
+ *
+ */
+/* Token types. Single-character tokens use the character itself as their
+ * type (see lex_do); the named types follow TOKEN_START.
+ * Every entry from TOKEN_START on needs a matching string in _tokennames.
+ */
+enum {
+    /* Other tokens which we can return: */
+    TOKEN_NONE = 0,
+    TOKEN_START = 128,
+
+    TOKEN_IDENT,
+
+    TOKEN_TYPENAME,
+
+    TOKEN_OPERATOR,
+
+    TOKEN_KEYWORD, /* loop */
+
+    TOKEN_STRINGCONST, /* not the typename but an actual "string" */
+    TOKEN_CHARCONST,
+    TOKEN_VECTORCONST,
+    TOKEN_INTCONST,
+    TOKEN_FLOATCONST,
+
+    TOKEN_EOF,
+
+    /* We use '< TOKEN_ERROR', so TOKEN_FATAL must come after it and any
+     * other error related tokens as well
+     */
+    TOKEN_ERROR,
+    TOKEN_FATAL /* internal error, eg out of memory */
+};
+
+/* Printable names of the token types, in enum order beginning with
+ * TOKEN_START, so the name of type t is found at index (t - TOKEN_START).
+ */
+static const char *_tokennames[] = {
+    "TOKEN_START",
+    "TOKEN_IDENT",
+    "TOKEN_TYPENAME",
+    "TOKEN_OPERATOR",
+    "TOKEN_KEYWORD",
+    "TOKEN_STRINGCONST",
+    "TOKEN_CHARCONST",
+    "TOKEN_VECTORCONST",
+    "TOKEN_INTCONST",
+    "TOKEN_FLOATCONST",
+    "TOKEN_EOF",
+    "TOKEN_ERROR",
+    "TOKEN_FATAL",
+};
+/* Compile-time check: the array size below becomes -1, which does not
+ * compile, when the number of names differs from the number of entries
+ * in the range TOKEN_START..TOKEN_FATAL.
+ */
+typedef int
+_all_tokennames_added_[
+       ((TOKEN_FATAL - TOKEN_START + 1) ==
+        (sizeof(_tokennames)/sizeof(_tokennames[0])))
+       ? 1 : -1];
+
+/* Lexer state for one source file. */
+typedef struct {
+       FILE   *file;
+       char   *name;
+       size_t  line;  /* current line, counted while reading characters */
+       size_t  sline; /* line at the start of a token */
+
+       /* Push-back stack used by lex_ungetch and lex_getch. The entries are
+        * ints, like the values fgetc returns: a char cannot keep EOF apart
+        * from a data byte, and a pushed-back EOF has to come back as EOF.
+        */
+       int     peek[256];
+       size_t  peekpos;
+
+       token  *tok; /* the token produced by the most recent lex_do */
+
+       struct {
+           /* when set, lex_do returns operator characters as plain
+            * single-character tokens
+            */
+           bool noops;
+       } flags;
+} lex_file;
+
+/* Vector functions for token.value; lexer.c generates the definitions
+ * with MEM_VEC_FUNCTIONS(token, char, value). lex_file has no vector
+ * member, so there is nothing to declare for it.
+ */
+MEM_VECTOR_PROTO(token, char, value);
+
+lex_file* lex_open (const char *file);
+void      lex_close(lex_file   *lex);
+int       lex_do   (lex_file   *lex);
+
+/* Parser
+ *
+ */
+
+/* Associativity values for oper_info.assoc */
+enum {
+    ASSOC_LEFT,
+    ASSOC_RIGHT
+};
+
+/* Values used in oper_info.flags to tell suffix and prefix forms apart */
+#define OP_SUFFIX 1
+#define OP_PREFIX 2
+
+/* Description of one operator, as listed in the operators table. */
+typedef struct {
+    const char   *op;    /* operator text */
+    unsigned int assoc;  /* ASSOC_LEFT or ASSOC_RIGHT */
+    unsigned int prec;   /* precedence level */
+    unsigned int flags;  /* 0, OP_SUFFIX or OP_PREFIX */
+} oper_info;
+
+/* Table of the known operators, grouped by descending precedence value.
+ * The same text can be listed more than once ("+", "-", "++", "--");
+ * such entries differ in precedence and flags.
+ * NOTE(review): presumably a larger prec binds tighter, the ordering
+ * mirrors C - confirm against the parser once it uses this table.
+ */
+static const oper_info operators[] = {
+    { "++",  ASSOC_LEFT,  16, OP_SUFFIX},
+    { "--",  ASSOC_LEFT,  16, OP_SUFFIX},
+
+    { ".",   ASSOC_LEFT,  15, 0 },
+
+    { "!",   ASSOC_RIGHT, 14, 0 },
+    { "~",   ASSOC_RIGHT, 14, 0 },
+    { "+",   ASSOC_RIGHT, 14, OP_PREFIX },
+    { "-",   ASSOC_RIGHT, 14, OP_PREFIX },
+    { "++",  ASSOC_RIGHT, 14, OP_PREFIX },
+    { "--",  ASSOC_RIGHT, 14, OP_PREFIX },
+/*  { "&",   ASSOC_RIGHT, 14, OP_PREFIX }, */
+
+    { "*",   ASSOC_LEFT,  13, 0 },
+    { "/",   ASSOC_LEFT,  13, 0 },
+    { "%",   ASSOC_LEFT,  13, 0 },
+
+    { "+",   ASSOC_LEFT,  12, 0 },
+    { "-",   ASSOC_LEFT,  12, 0 },
+
+    { "<<",  ASSOC_LEFT,  11, 0 },
+    { ">>",  ASSOC_LEFT,  11, 0 },
+
+    { "<",   ASSOC_LEFT,  10, 0 },
+    { ">",   ASSOC_LEFT,  10, 0 },
+    { "<=",  ASSOC_LEFT,  10, 0 },
+    { ">=",  ASSOC_LEFT,  10, 0 },
+
+    { "==",  ASSOC_LEFT,  9,  0 },
+    { "!=",  ASSOC_LEFT,  9,  0 },
+
+    { "&",   ASSOC_LEFT,  8,  0 },
+
+    { "^",   ASSOC_LEFT,  7,  0 },
+
+    { "|",   ASSOC_LEFT,  6,  0 },
+
+    { "&&",  ASSOC_LEFT,  5,  0 },
+
+    { "||",  ASSOC_LEFT,  4,  0 },
+
+    { "?",   ASSOC_RIGHT, 3,  0 },
+
+    { "=",   ASSOC_RIGHT, 2,  0 },
+    { "+=",  ASSOC_RIGHT, 2,  0 },
+    { "-=",  ASSOC_RIGHT, 2,  0 },
+    { "*=",  ASSOC_RIGHT, 2,  0 },
+    { "/=",  ASSOC_RIGHT, 2,  0 },
+    { "%=",  ASSOC_RIGHT, 2,  0 },
+    { ">>=", ASSOC_RIGHT, 2,  0 },
+    { "<<=", ASSOC_RIGHT, 2,  0 },
+    { "&=",  ASSOC_RIGHT, 2,  0 },
+    { "^=",  ASSOC_RIGHT, 2,  0 },
+    { "|=",  ASSOC_RIGHT, 2,  0 },
+};
+
+/* Parser state for one source file.
+ * NOTE(review): no code using these fields is part of this change, the
+ * per-field notes below go by the declarations only - confirm once the
+ * parser is filled in.
+ */
+typedef struct
+{
+       lex_file *lex;   /* the lexer supplying the tokens */
+       int      error;
+       lex_ctx  ctx;
+
+       token    *tokens;
+       token    *lastok;
+
+       token    *tok; /* current token */
+
+       /* vector of global values; see the MEM_VECTOR_PROTO for it */
+       MEM_VECTOR_MAKE(ast_value*, globals);
+} parse_file;
+
+MEM_VECTOR_PROTO(parse_file, ast_value*, globals);
+
+parse_file* parse_open(const char *file);
+void        parse_file_close(parse_file*);
+
+bool        parse(parse_file*);
+
+bool        parse_iskey(parse_file *self, const char *ident);
+
+void lexerror(lex_file*, const char *fmt, ...);
+
+#endif
diff --git a/main.c b/main.c

index 62786d35acb325a9dd32952ef0783d24c5df6ec3..195574ea1538c737b65aeb0f7bb4207a5d3066c8 100644 (file)
--- a/main.c
+++ b/main.c
@@ -21,19 +21,18 @@
   * SOFTWARE.
   */
  #include "gmqcc.h"
-typedef struct { char *name, type; } argitem;
-VECTOR_MAKE(argitem, items);
  
+bool parser_compile(const char *filename);
  int main(int argc, char **argv) {
-    size_t itr;
-
      util_debug("COM", "starting ...\n");
  
+    if (argc == 2) {
+        if (!parser_compile(argv[1])) {
+            printf("There were compile errors\n");
+        }
+    }
+
      util_debug("COM", "cleaning ...\n");
-    /* clean list */
-    for (itr = 0; itr < items_elements; itr++)
-        mem_d(items_data[itr].name);
-    mem_d(items_data);
  
      util_meminfo();
      return 0;
diff --git a/parser.c b/parser.c

new file mode 100644 (file)

index 0000000..0dd972e
--- /dev/null
+++ b/parser.c
@@ -0,0 +1,7 @@
+#include "gmqcc.h"
+#include "lexer.h"
+
+/* Compile the given source file.
+ * Placeholder: nothing is parsed yet, filename is ignored and failure is
+ * always reported, which main() answers with "There were compile errors".
+ */
+bool parser_compile(const char *filename)
+{
+    return false;
+}
author	Wolfgang Bumiller <wolfgang.linux@bumiller.com>
	Mon, 16 Jul 2012 11:59:10 +0000 (13:59 +0200)
committer	Wolfgang Bumiller <wolfgang.linux@bumiller.com>
	Mon, 16 Jul 2012 11:59:10 +0000 (13:59 +0200)
Makefile		patch \| blob \| history
gmqcc.h		patch \| blob \| history
lexer.c	[new file with mode: 0644]	patch \| blob
lexer.h	[new file with mode: 0644]	patch \| blob
main.c		patch \| blob \| history
parser.c	[new file with mode: 0644]	patch \| blob