Mercurial > hg > index.cgi
view lwcc/cc-parse.c @ 516:c33b4abff860
Fix bug related to parsing \x sequences under pragma cstrings
Fix the test for lower case letter digits to test for the full range of hex
digit values instead of just 0 to 9 when deciding to apply the correction
factor for lower case.
author | William Astle <lost@l-w.ca> |
---|---|
date | Thu, 11 Feb 2021 09:25:16 -0700 |
parents | 7e8298f7bc0a |
children |
line wrap: on
line source
/* lwcc/cc-parse.c Copyright © 2019 William Astle This file is part of LWTOOLS. LWTOOLS is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <string.h> #include <lw_alloc.h> #include <lw_string.h> #include "cpp.h" #include "tree.h" #define TOK_KW_IF -1 #define TOK_KW_ELSE -2 #define TOK_KW_WHILE -3 #define TOK_KW_DO -4 #define TOK_KW_FOR -5 #define TOK_KW_VOID -6 #define TOK_KW_INT -7 #define TOK_KW_CHAR -8 #define TOK_KW_SHORT -9 #define TOK_KW_LONG -10 #define TOK_KW_UNSIGNED -11 #define TOK_KW_SIGNED -12 #define TOK_KW_FLOAT -13 #define TOK_KW_DOUBLE -14 #define TOK_KW_STRUCT -15 #define TOK_KW_UNION -16 #define TOK_KW_TYPEDEF -17 #define TOK_KW_STATIC -18 #define TOK_KW_SWITCH -19 #define TOK_KW_CASE -20 #define TOK_KW_DEFAULT -21 #define TOK_KW_BREAK -22 #define TOK_KW_CONTINUE -23 #define TOK_KW_CONST -24 #define TOK_KW_AUTO -25 #define TOK_KW_ENUM -26 #define TOK_KW_REGISTER -27 #define TOK_KW_SIZEOF -28 #define TOK_KW_VOLATILE -29 #define TOK_KW_RETURN -30 #define TOK_KW_EXTERN -31 #define TOK_KW_GOTO -32 #define TOK_TYPENAME -100 #define TOK_CONST_INT -150 static struct { int tok; char *word; } keyword_list[] = { { TOK_KW_IF, "if" }, { TOK_KW_ELSE, "else" }, { TOK_KW_WHILE, "while" }, { TOK_KW_DO, "do" }, { TOK_KW_FOR, "for" }, { TOK_KW_VOID, "void" }, { TOK_KW_INT, "int" }, { TOK_KW_CHAR, "char" }, { TOK_KW_SHORT, "short" }, { TOK_KW_LONG, "long" }, { TOK_KW_UNSIGNED, "unsigned" }, { TOK_KW_SIGNED, "signed" }, { TOK_KW_FLOAT, "float" }, { TOK_KW_DOUBLE, "double" }, { TOK_KW_STRUCT, "struct" }, { TOK_KW_UNION, "union" }, { TOK_KW_TYPEDEF, "typedef" }, { TOK_KW_STATIC, "static" }, { TOK_KW_SWITCH, "switch" }, { TOK_KW_CASE, "case" }, { TOK_KW_DEFAULT, "default" }, { TOK_KW_BREAK, "break" }, { TOK_KW_CONTINUE, "continue" }, { TOK_KW_CONST, "const" }, { TOK_KW_AUTO, "auto" }, { TOK_KW_ENUM, "enum" }, { TOK_KW_REGISTER, "register" }, { TOK_KW_SIZEOF, "sizeof" }, { TOK_KW_VOLATILE, "volatile" }, { TOK_KW_RETURN, "return" }, { TOK_KW_EXTERN, "extern" }, { TOK_KW_GOTO, "goto" }, { TOK_NONE, "" } }; struct parser_state { struct preproc_info *pp; // preprocessor data struct token *curtok; // the current token }; struct token *parse_next(struct parser_state *ps) { struct token *tok; int i; for (;;) { tok = preproc_next(ps -> pp); if (tok -> ttype == TOK_WSPACE) continue; if (tok -> ttype == TOK_EOL) continue; if (tok -> ttype == TOK_CHAR) { // random character fprintf(stderr, "Random character %02x\n", tok -> strval[0]); if (tok -> strval[0] < 32 || tok -> strval[0] > 126) continue; } break; } if (tok -> ttype == TOK_IDENT) { // convert identifier tokens to their respective meanings for (i = 0; keyword_list[i].tok != TOK_NONE; i++) { if (strcmp(keyword_list[i].word, tok -> strval) == 0) { tok -> ttype = keyword_list[i].tok; goto out; } } // check for registered types here } else if (tok -> ttype == TOK_NUMBER) { // look for anything that isn't 0-9 for (i = 0; tok -> strval[i]; i++) { if (tok -> strval[i] < '0' || tok -> strval[i] > '9') break; } if (tok -> strval[i] == 0) tok -> ttype = TOK_CONST_INT; } out: fprintf(stderr, "Lexed: "); token_print(tok, stderr); fprintf(stderr, " (%d)\n", tok -> ttype); if (ps -> curtok) token_free(ps -> curtok); ps -> curtok = tok; return tok; } void parse_generr(struct parser_state *ps, char *tag) { fprintf(stderr, "(%s) Unexpected token (%d): ", tag, ps -> curtok -> ttype); token_print(ps -> curtok, stderr); fprintf(stderr, "\n"); } node_t *parse_expr_real(struct parser_state *ps, int prec); // parse an elementary type (int, etc.) node_t *parse_elem_type(struct parser_state *ps) { int sgn = -1; int nt = -1; int nn = 1; if (ps -> curtok -> ttype == TOK_KW_SIGNED) { sgn = 1; parse_next(ps); } else if (ps -> curtok -> ttype == TOK_KW_UNSIGNED) { sgn = 0; parse_next(ps); } switch (ps -> curtok -> ttype) { // NOTE: char is unsigned by default case TOK_KW_CHAR: if (sgn == -1 || sgn == 0) nt = NODE_TYPE_UCHAR; else nt = NODE_TYPE_CHAR; break; case TOK_KW_SHORT: nt = sgn ? NODE_TYPE_SHORT : NODE_TYPE_USHORT; break; case TOK_KW_INT: nt = sgn ? NODE_TYPE_INT : NODE_TYPE_UINT; break; case TOK_KW_LONG: parse_next(ps); if (ps -> curtok -> ttype == TOK_KW_LONG) { nt = sgn ? NODE_TYPE_LONGLONG : NODE_TYPE_ULONGLONG; break; } nn = 0; nt = sgn ? NODE_TYPE_LONG : NODE_TYPE_ULONG; break; } if (nt == -1) { if (sgn == -1) { return NULL; } else { nt = sgn ? NODE_TYPE_INT : NODE_TYPE_UINT; } } else if (nn) { parse_next(ps); } return node_create(nt); } // if ident is non-zero, accept an identifier as part of the type; otherwise // do not accept an identifier; currently a stub node_t *parse_type(struct parser_state *ps, int ident) { node_t *rv; // see if we have an elementary type rv = parse_elem_type(ps); // look for "struct", etc. // look for pointer indicator(s) // look for identifier if wanted/allowed // look for array indicator or function parameter list return rv; } node_t *parse_term_real(struct parser_state *ps) { node_t *rv, *rv2; switch (ps -> curtok -> ttype) { case TOK_CONST_INT: rv = node_create(NODE_CONST_INT, ps -> curtok -> strval); parse_next(ps); return rv; // opening paren: either grouping or type cast case TOK_OPAREN: parse_next(ps); // parse a type without an identifier rv2 = parse_type(ps, 0); if (rv2) { if (ps -> curtok -> ttype != TOK_CPAREN) { node_destroy(rv2); parse_generr(ps, "missing ) on type cast"); return NULL; } parse_next(ps); // detect C99 compound literal here rv = parse_expr_real(ps, 175); if (!rv) { node_destroy(rv); return NULL; } return node_create(NODE_TYPECAST, rv2, rv); } // grouping rv = parse_expr_real(ps, 0); if (ps -> curtok -> ttype != TOK_CPAREN) { node_destroy(rv); parse_generr(ps, "missing ) on expression grouping"); return NULL; } parse_next(ps); return rv; } parse_generr(ps, "term"); return NULL; } node_t *parse_expr_fncall(struct parser_state *ps, node_t *term1) { if (ps -> curtok -> ttype != TOK_CPAREN) { node_destroy(term1); parse_generr(ps, "missing )"); return NULL; } parse_next(ps); return node_create(NODE_OPER_FNCALL, term1, NULL); } node_t *parse_expr_postinc(struct parser_state *ps, node_t *term1) { return node_create(NODE_OPER_POSTINC, term1); } node_t *parse_expr_postdec(struct parser_state *ps, node_t *term1) { return node_create(NODE_OPER_POSTDEC, term1); } node_t *parse_expr_subscript(struct parser_state *ps, node_t *term1) { node_t *term2; term2 = parse_expr_real(ps, 0); if (!term2) { node_destroy(term1); return NULL; } if (ps -> curtok -> ttype != TOK_CSQUARE) { node_destroy(term2); node_destroy(term1); parse_generr(ps, "missing ]"); return NULL; } parse_next(ps); return node_create(NODE_OPER_SUBSCRIPT, term1, term2); } node_t *parse_expr_cond(struct parser_state *ps, node_t *term1) { node_t *term2, *term3; // conditional operator // NOTE: the middle operand is evaluated as though it is its own // independent expression because the : must appear. The third // operand is evaluated at the ternary operator precedence so that // subsequent operand binding behaves correctly (if surprisingly). This // would be less confusing if the ternary operator was fully bracketed // (that is, had a terminator) term2 = parse_expr_real(ps, 0); if (!term2) { node_destroy(term1); return NULL; } if (ps -> curtok -> ttype == TOK_COLON) { parse_next(ps); term3 = parse_expr_real(ps, 25); if (!term3) { node_destroy(term1); node_destroy(term2); return NULL; } return node_create(NODE_OPER_COND, term1, term2, term3); } else { node_destroy(term1); node_destroy(term2); parse_generr(ps, "missing :"); return NULL; } } node_t *parse_expr_real(struct parser_state *ps, int prec) { static struct { int tok; int nodetype; int prec; int ra; node_t *(*spec)(struct parser_state *, node_t *); } operlist[] = { // { TOK_OPAREN, NODE_OPER_FNCALL, 200, 0, parse_expr_fncall }, // { TOK_OSQUARE, NODE_OPER_SUBSCRIPT, 200, 0, parse_expr_subscript }, // { TOK_ARROW, NODE_OPER_PTRMEM, 200, 0 }, // { TOK_DOT, NODE_OPER_OBJMEM, 200, 0 }, // { TOK_DBLADD, NODE_OPER_POSTINC, 200, 0, parse_expr_postinc }, // { TOK_DBLSUB, NODE_OPER_POSTDEC, 200, 0, parse_expr_postdec }, { TOK_STAR, NODE_OPER_TIMES, 150 }, { TOK_DIV, NODE_OPER_DIVIDE, 150 }, { TOK_MOD, NODE_OPER_MOD, 150 }, { TOK_ADD, NODE_OPER_PLUS, 100 }, { TOK_SUB, NODE_OPER_MINUS, 100 }, { TOK_LSH, NODE_OPER_LSH, 90 }, { TOK_RSH, NODE_OPER_RSH, 90 }, { TOK_LT, NODE_OPER_LT, 80 }, { TOK_LE, NODE_OPER_LE, 80 }, { TOK_GT, NODE_OPER_GT, 80 }, { TOK_GE, NODE_OPER_GE, 80 }, { TOK_EQ, NODE_OPER_EQ, 70 }, { TOK_NE, NODE_OPER_NE, 70 }, { TOK_BWAND, NODE_OPER_BWAND, 60}, { TOK_XOR, NODE_OPER_BWXOR, 55 }, { TOK_BWOR, NODE_OPER_BWOR, 50 }, { TOK_BAND, NODE_OPER_BAND, 40 }, { TOK_BOR, NODE_OPER_BOR, 35 }, { TOK_QMARK, NODE_OPER_COND, 25, 1, parse_expr_cond }, // { TOK_ASS, NODE_OPER_ASS, 20, 1 }, // { TOK_ADDASS, NODE_OPER_ADDASS, 20, 1 }, // { TOK_SUBASS, NODE_OPER_SUBASS, 20, 1 }, // { TOK_MULASS, NODE_OPER_MULASS, 20, 1 }, // { TOK_DIVASS, NODE_OPER_DIVASS, 20, 1 }, // { TOK_MODASS, NODE_OPER_MODASS, 20, 1 }, // { TOK_LSHASS, NODE_OPER_LSHASS, 20, 1 }, // { TOK_RSHASS, NODE_OPER_RSHASS, 20, 1 }, // { TOK_BWANDASS, NODE_OPER_BWANDASS, 20, 1}, // { TOK_BWORASS, NODE_OPER_BWORASS, 20, 1 }, // { TOK_XORASS, NODE_OPER_BWXORASS, 20, 1 }, { TOK_COMMA, NODE_OPER_COMMA, 1 }, { 0, 0, 0 } }; node_t *term1, *term2; int i; term1 = parse_term_real(ps); if (!term1) return NULL; nextoper: for (i = 0; operlist[i].tok; i++) if (operlist[i].tok == ps -> curtok -> ttype) break; fprintf(stderr, "Matched operator: %d, %d\n", operlist[i].tok, operlist[i].prec); // if we hit the end of the expression, return if (operlist[i].tok == 0) return term1; // is previous operation higher precedence? If so, just return the first term if (operlist[i].prec < prec) return term1; // is this operator left associative and previous operation is same precedence? // if so, just return the first term if (operlist[i].ra == 0 && operlist[i].prec == prec) return term1; // consume the operator parse_next(ps); // special handling if (operlist[i].spec) { term2 = (operlist[i].spec)(ps, term1); if (!term2) { node_destroy(term1); return NULL; } term1 = term2; goto nextoper; } term2 = parse_expr_real(ps, operlist[i].prec); if (!term2) { parse_generr(ps, "expr"); node_destroy(term1); } term1 = node_create(operlist[i].nodetype, term1, term2); term2 = NULL; goto nextoper; } node_t *parse_expr(struct parser_state *ps) { return parse_expr_real(ps, 0); } node_t *parse_statement(struct parser_state *ps) { node_t *rv; node_t *n; switch (ps -> curtok -> ttype) { case TOK_KW_RETURN: parse_next(ps); n = parse_expr(ps); if (!n) { parse_generr(ps, "statement"); return NULL; } rv = node_create(NODE_STMT_RETURN); node_addchild(rv, n); break; default: return NULL; } if (ps -> curtok -> ttype != TOK_EOS) parse_generr(ps, "statement"); else parse_next(ps); return rv; } node_t *parse_globaldecl(struct parser_state *ps) { node_t *rv = NULL; node_t *stmt; char *fnname = NULL; if (ps -> curtok -> ttype == TOK_KW_INT) { // variable name parse_next(ps); if (ps -> curtok -> ttype != TOK_IDENT) goto error; fnname = lw_strdup(ps -> curtok -> strval); parse_next(ps); if (ps -> curtok -> ttype != TOK_OPAREN) goto error; parse_next(ps); if (ps -> curtok -> ttype != TOK_CPAREN) goto error; parse_next(ps); if (ps -> curtok -> ttype != TOK_OBRACE) goto error; parse_next(ps); stmt = parse_statement(ps); if (!stmt) goto error; rv = node_create(NODE_FUNDEF, node_create(NODE_TYPE_INT), node_create(NODE_IDENT, fnname), node_create(NODE_FUNARGS), stmt); if (ps -> curtok -> ttype != TOK_CBRACE) goto error; parse_next(ps); lw_free(fnname); return rv; } error: if (fnname) lw_free(fnname); parse_generr(ps, "globaldecl"); return rv; } node_t *parse_program(struct preproc_info *pp) { node_t *rv; node_t *node; struct parser_state ps; ps.pp = pp; ps.curtok = NULL; rv = node_create(NODE_PROGRAM); // prime the parser parse_next(&ps); while (ps.curtok -> ttype != TOK_EOF) { node = parse_globaldecl(&ps); if (!node) break; node_addchild(rv, node); } return rv; }