Mercurial > hg > index.cgi
changeset 498:1bd2d590d734
Rejig parser to eliminate lemon
No longer use lemon for building the parser. It adds too much complexity,
really, and a hand written recursive descent parser is far more flexible.
The current iteration can handle exactly one statement: "return <int>".
author | William Astle <lost@l-w.ca> |
---|---|
date | Thu, 08 Aug 2019 23:32:23 -0600 |
parents | 4b865c9d4371 |
children | c3099c5d9d3e |
files | Makefile lwcc/cc-parse.c lwcc/tree.c lwcc/tree.h |
diffstat | 4 files changed, 285 insertions(+), 14 deletions(-) [+] |
line wrap: on
line diff
--- a/Makefile Mon Aug 05 21:38:23 2019 -0600 +++ b/Makefile Thu Aug 08 23:32:23 2019 -0600 @@ -108,8 +108,7 @@ lwcc_cpp_objs := $(lwcc_cpp_srcs:.c=.o) lwcc_cpp_deps := $(lwcc_cpp_srcs:.c=.d) -# parse_c.c needs to be first here -lwcc_cc_srcs := parse_c.c cc-main.c tree.c parse.c token_names.c +lwcc_cc_srcs := cc-main.c tree.c cc-parse.c lwcc_cc_srcs := $(addprefix lwcc/,$(lwcc_cc_srcs)) lwcc_cc_objs := $(lwcc_cc_srcs:.c=.o) lwcc_cc_deps := $(lwcc_cc_srcs:.c=.d) @@ -180,17 +179,6 @@ extra_clean := $(extra_clean) *~ */*~ -lwcc/parse_c.c lwcc/parse_c.h: lwcc/parse_c.y - rm -f lwcc/parse_c.h lwcc/parse_c.c - lemon -q lwcc/parse_c.y - -lwcc/token_names.c: lwcc/parse_c.h - echo "char *ptoken_names[] = {" > $@ - echo '"TOKEN_NONE",' >> $@ - cat lwcc/parse_c.h | sed -e 's/#define \(.*\) .*$$/"\1",/g' -e 's/ //g' >> $@ - echo '"" };' >> $@ - - %.o: %.c @echo "Building dependencies for $@" @$(CC) -MM -MG $(CPPFLAGS) -o $*.d $<
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lwcc/cc-parse.c Thu Aug 08 23:32:23 2019 -0600 @@ -0,0 +1,278 @@ +/* +lwcc/cc-parse.c + +Copyright © 2019 William Astle + +This file is part of LWTOOLS. + +LWTOOLS is free software: you can redistribute it and/or modify it under the +terms of the GNU General Public License as published by the Free Software +Foundation, either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +more details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <string.h> + +#include <lw_alloc.h> +#include <lw_string.h> + +#include "cpp.h" +#include "tree.h" + +#define TOK_KW_IF -1 +#define TOK_KW_ELSE -2 +#define TOK_KW_WHILE -3 +#define TOK_KW_DO -4 +#define TOK_KW_FOR -5 +#define TOK_KW_VOID -6 +#define TOK_KW_INT -7 +#define TOK_KW_CHAR -8 +#define TOK_KW_SHORT -9 +#define TOK_KW_LONG -10 +#define TOK_KW_UNSIGNED -11 +#define TOK_KW_SIGNED -12 +#define TOK_KW_FLOAT -13 +#define TOK_KW_DOUBLE -14 +#define TOK_KW_STRUCT -15 +#define TOK_KW_UNION -16 +#define TOK_KW_TYPEDEF -17 +#define TOK_KW_STATIC -18 +#define TOK_KW_SWITCH -19 +#define TOK_KW_CASE -20 +#define TOK_KW_DEFAULT -21 +#define TOK_KW_BREAK -22 +#define TOK_KW_CONTINUE -23 +#define TOK_KW_CONST -24 +#define TOK_KW_AUTO -25 +#define TOK_KW_ENUM -26 +#define TOK_KW_REGISTER -27 +#define TOK_KW_SIZEOF -28 +#define TOK_KW_VOLATILE -29 +#define TOK_KW_RETURN -30 +#define TOK_KW_EXTERN -31 +#define TOK_KW_GOTO -32 +#define TOK_TYPENAME -100 +#define TOK_CONST_INT -150 + +static struct { int tok; char *word; } keyword_list[] = { + { TOK_KW_IF, "if" }, + { TOK_KW_ELSE, "else" }, + { TOK_KW_WHILE, "while" }, + { TOK_KW_DO, "do" }, + { TOK_KW_FOR, "for" }, + { TOK_KW_VOID, "void" }, + { TOK_KW_INT, "int" }, + { TOK_KW_CHAR, "char" }, + { TOK_KW_SHORT, "short" }, + { TOK_KW_LONG, "long" }, + { TOK_KW_UNSIGNED, "unsigned" }, + { TOK_KW_SIGNED, "signed" }, + { TOK_KW_FLOAT, "float" }, + { TOK_KW_DOUBLE, "double" }, + { TOK_KW_STRUCT, "struct" }, + { TOK_KW_UNION, "union" }, + { TOK_KW_TYPEDEF, "typedef" }, + { TOK_KW_STATIC, "static" }, + { TOK_KW_SWITCH, "switch" }, + { TOK_KW_CASE, "case" }, + { TOK_KW_DEFAULT, "default" }, + { TOK_KW_BREAK, "break" }, + { TOK_KW_CONTINUE, "continue" }, + { TOK_KW_CONST, "const" }, + { TOK_KW_AUTO, "auto" }, + { TOK_KW_ENUM, "enum" }, + { TOK_KW_REGISTER, "register" }, + { TOK_KW_SIZEOF, "sizeof" }, + { TOK_KW_VOLATILE, "volatile" }, + { TOK_KW_RETURN, "return" }, + { TOK_KW_EXTERN, "extern" }, + { TOK_KW_GOTO, "goto" }, + { TOK_NONE, "" } +}; + + +struct parser_state +{ + struct preproc_info *pp; // preprocessor data + struct token *curtok; // the current token +}; + + +struct token *parse_next(struct parser_state *ps) +{ + struct token *tok; + int i; + + for (;;) + { + tok = preproc_next(ps -> pp); + if (tok -> ttype == TOK_WSPACE) + continue; + if (tok -> ttype == TOK_EOL) + continue; + if (tok -> ttype == TOK_CHAR) + { + // random character + fprintf(stderr, "Random character %02x\n", tok -> strval[0]); + if (tok -> strval[0] < 32 || tok -> strval[0] > 126) + continue; + } + break; + } + if (tok -> ttype == TOK_IDENT) + { + // convert identifier tokens to their respective meanings + for (i = 0; keyword_list[i].tok != TOK_NONE; i++) + { + if (strcmp(keyword_list[i].word, tok -> strval) == 0) + { + tok -> ttype = keyword_list[i].tok; + goto out; + } + } + // check for registered types here + } + else if (tok -> ttype == TOK_NUMBER) + { + // look for anything that isn't 0-9 + for (i = 0; tok -> strval[i]; i++) + { + if (tok -> strval[i] < '0' || tok -> strval[i] > '9') + break; + } + if (tok -> strval[i] == 0) + tok -> ttype = TOK_CONST_INT; + } +out: + fprintf(stderr, "Lexed: "); + token_print(tok, stderr); + fprintf(stderr, " (%d)\n", tok -> ttype); + if (ps -> curtok) + token_free(ps -> curtok); + ps -> curtok = tok; + return tok; +} + +void parse_generr(struct parser_state *ps, char *tag) +{ + fprintf(stderr, "(%s) Unexpected token (%d): ", tag, ps -> curtok -> ttype); + token_print(ps -> curtok, stderr); + fprintf(stderr, "\n"); + +} + +node_t *parse_expr(struct parser_state *ps) +{ + node_t *rv; + if (ps -> curtok -> ttype == TOK_CONST_INT) + { + rv = node_create(NODE_CONST_INT, ps -> curtok -> strval); + parse_next(ps); + return rv; + } + parse_generr(ps, "expr"); + return NULL; +} + +node_t *parse_statement(struct parser_state *ps) +{ + node_t *rv; + node_t *n; + + switch (ps -> curtok -> ttype) + { + case TOK_KW_RETURN: + parse_next(ps); + n = parse_expr(ps); + if (!n) + { + parse_generr(ps, "statement"); + return NULL; + } + rv = node_create(NODE_STMT_RETURN); + node_addchild(rv, n); + break; + + default: + return NULL; + } + + if (ps -> curtok -> ttype != TOK_EOS) + parse_generr(ps, "statement"); + else + parse_next(ps); + return rv; +} + +node_t *parse_globaldecl(struct parser_state *ps) +{ + node_t *rv = NULL; + node_t *stmt; + char *fnname = NULL; + if (ps -> curtok -> ttype == TOK_KW_INT) + { + // variable name + parse_next(ps); + if (ps -> curtok -> ttype != TOK_IDENT) + goto error; + fnname = lw_strdup(ps -> curtok -> strval); + parse_next(ps); + if (ps -> curtok -> ttype != TOK_OPAREN) + goto error; + parse_next(ps); + if (ps -> curtok -> ttype != TOK_CPAREN) + goto error; + parse_next(ps); + if (ps -> curtok -> ttype != TOK_OBRACE) + goto error; + parse_next(ps); + stmt = parse_statement(ps); + if (!stmt) + goto error; + rv = node_create(NODE_FUNDEF, node_create(NODE_TYPE_INT), node_create(NODE_IDENT, fnname), node_create(NODE_FUNARGS), stmt); + if (ps -> curtok -> ttype != TOK_CBRACE) + goto error; + parse_next(ps); + lw_free(fnname); + return rv; + } + + +error: + if (fnname) + lw_free(fnname); + parse_generr(ps, "globaldecl"); + return rv; +} + +node_t *parse_program(struct preproc_info *pp) +{ + node_t *rv; + node_t *node; + struct parser_state ps; + + ps.pp = pp; + ps.curtok = NULL; + + rv = node_create(NODE_PROGRAM); + + // prime the parser + parse_next(&ps); + while (ps.curtok -> ttype != TOK_EOF) + { + node = parse_globaldecl(&ps); + if (!node) + break; + node_addchild(rv, node); + } + + return rv; +}
--- a/lwcc/tree.c Mon Aug 05 21:38:23 2019 -0600 +++ b/lwcc/tree.c Thu Aug 08 23:32:23 2019 -0600 @@ -51,6 +51,8 @@ "FUNDECL", "FUNARGS", "BLOCK", + "STMT_RETURN", + "CONST_INT", }; @@ -77,6 +79,7 @@ break; case NODE_IDENT: + case NODE_CONST_INT: r -> strval = lw_strdup(va_arg(args, char *)); break;
--- a/lwcc/tree.h Mon Aug 05 21:38:23 2019 -0600 +++ b/lwcc/tree.h Thu Aug 08 23:32:23 2019 -0600 @@ -49,7 +49,9 @@ #define NODE_FUNDECL 21 // function declaration #define NODE_FUNARGS 22 // list of function args #define NODE_BLOCK 23 // statement block -#define NODE_NUMTYPES 24 // the number of node types +#define NODE_STMT_RETURN 24 // return statement +#define NODE_CONST_INT 25 // constant integer +#define NODE_NUMTYPES 26 // the number of node types typedef struct node_s node_t;