Mercurial > hg > index.cgi
diff lwcc/cpp/file.c @ 292:40ecbd5da481 ccdev
Part one of the C preprocessor
This is part one of the C preprocessor. It finds and then fails to intepret
directives. Also handles line splicing and trigraphs.
author | William Astle <lost@l-w.ca> |
---|---|
date | Sun, 08 Sep 2013 21:58:12 -0600 |
parents | |
children | c419b3b3d43f |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lwcc/cpp/file.c Sun Sep 08 21:58:12 2013 -0600 @@ -0,0 +1,636 @@ +/* +lwcc/cpp/file.c + +Copyright © 2013 William Astle + +This file is part of LWTOOLS. + +LWTOOLS is free software: you can redistribute it and/or modify it under the +terms of the GNU General Public License as published by the Free Software +Foundation, either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +more details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see <http://www.gnu.org/licenses/>. + + +NOTES: + +The function fetch_byte() grabs a byte from the input file. It returns +CPP_EOF if end of file has been reached. The resulting byte has passed +through three filters, in order: + +* All CRLF, LFCR, LF, and CR have been converted to CPP_EOL +* If enabled (--trigraphs), trigraphs have been interpreted +* \\n (backslash-newline) has been processed (eliminated) + +To obtain a byte without processing \\n, call fetch_byte_tg(). + +*/ + +#include <errno.h> +#include <stdio.h> +#include <string.h> + +#include <lw_alloc.h> + +#include "cpp.h" + +struct file_stack_e *file_stack = NULL; + +int is_whitespace(int c) +{ + switch (c) + { + case ' ': + case '\t': + case '\r': + case '\n': + return 1; + } + return 0; +} + +int is_sidchr(c) +{ + if (c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) + return 1; + return 0; +} + +int is_idchr(int c) +{ + if (c >= '0' && c <= '9') + return 1; + return is_sidchr(c); +} + +int is_ep(int c) +{ + if (c == 'e' || c == 'E' || c == 'p' || c == 'P') + return 1; + return 0; +} + +int is_hex(int c) +{ + if (c >= 'a' && c <= 'f') + return 1; + if (c >= 'A' && c <= 'F') + return 1; + if (c >= '0' && c <= '9') + return 1; + return 0; +} + +int is_dec(int c) +{ + if (c >= '0' && c <= '9') + return 1; + return 0; +} + +static void outchr(int c) +{ + fputc(c, output_fp); +} + +static void outstr(char *s) +{ + while (*s) + outchr(*s++); +} + +int fetch_byte_ll(struct file_stack_e *f) +{ + int c; + + if (f -> eolstate != 0) + { + f -> line++; + f -> col = 0; + } + c = getc(f -> fp); + f -> col++; + if (f -> eolstate == 1) + { + // just saw CR, munch LF + if (c == 10) + c = getc(f -> fp); + f -> eolstate = 0; + } + else if (f -> eolstate == 2) + { + // just saw LF, much CR + if (c == 13) + c = getc(f -> fp); + f -> eolstate = 0; + } + + if (c == 10) + { + // we have LF - end of line, flag to munch CR + f -> eolstate = 2; + c = CPP_EOL; + } + else if (c == 13) + { + // we have CR - end of line, flag to munch LF + f -> eolstate = 1; + c = CPP_EOL; + } + else if (c == EOF) + { + c = CPP_EOF; + } + return c; +} + +int fetch_byte_tg(struct file_stack_e *f) +{ + int c; + + if (!trigraphs) + { + c = fetch_byte_ll(f); + } + else + { + /* we have to do the trigraph shit here */ + if (f -> ra != CPP_NOUNG) + { + if (f -> qseen > 0) + { + c = '?'; + f -> qseen -= 1; + return c; + } + else + { + c = f -> ra; + f -> ra = CPP_NOUNG; + return c; + } + } + + c = fetch_byte_ll(f); + while (c == '?') + { + f -> qseen++; + c = fetch_byte_ll(f); + } + + if (f -> qseen >= 2) + { + // we have a trigraph + switch (c) + { + case '=': + c = '#'; + f -> qseen -= 2; + break; + + case '/': + c = '\\'; + f -> qseen -= 2; + break; + + case '\'': + c = '^'; + f -> qseen -= 2; + break; + + case '(': + c = '['; + f -> qseen -= 2; + break; + + case ')': + c = ']'; + f -> qseen -= 2; + break; + + case '!': + c = '|'; + f -> qseen -= 2; + break; + + case '<': + c = '{'; + f -> qseen -= 2; + break; + + case '>': + c = '}'; + f -> qseen -= 2; + break; + + case '~': + c = '~'; + f -> qseen -= 2; + break; + } + if (f -> qseen > 0) + { + f -> ra = c; + c = '?'; + f -> qseen--; + } + } + else if (f -> qseen > 0) + { + f -> ra = c; + c = '?'; + f -> qseen--; + } + } + return c; +} + +int fetch_byte(struct file_stack_e *f) +{ + int c; + +again: + if (f -> unget != CPP_NOUNG) + { + c = f -> unget; + f -> unget = CPP_NOUNG; + } + else + { + c = fetch_byte_tg(f); + } + if (c == '\\') + { + int c2; + c2 = fetch_byte_tg(f); + if (c2 == CPP_EOL) + goto again; + else + f -> unget = c2; + } + f -> curc = c; + return c; +} + +static void skip_line(struct file_stack_e *f) +{ + int c; + while ((c = fetch_byte(f)) != CPP_EOL && c != CPP_EOF) + /* do nothing */ ; +} + + +struct +{ + char *name; + void (*fn)(struct file_stack_e *); +} directives[] = +{ + { NULL, NULL }, + { NULL, NULL } +}; + +/* +This handles a preprocessing directive. Such a directive goes from the +next character to be retrieved from f until the first instance of CPP_EOL +or CPP_EOF. +*/ +void handle_directive(struct file_stack_e *f) +{ + int c, i; + char kw[20]; + +again: + while ((c = fetch_byte(f)) == ' ' || c == '\t') + /* do nothing */ ; + if (c == '/') + { + // maybe a comment // + c = fetch_byte(f); + if (c == '/') + { + // line comment + skip_line(f); + return; + } + if (c == '*') + { + // block comment + while (1) + { + c = fetch_byte(f); + if (c == CPP_EOF) + return; + if (c == '*') + { + c = fetch_byte(f); + if (c == '/') + { + // end of comment - try again for directive + goto again; + } + if (c == CPP_EOF) + return; + } + } + } + } + + // empty directive - do nothing + if (c == CPP_EOL) + return; + + if (c < 'a' || c > 'z') + goto out; + + i = 0; + do + { + kw[i++] = c; + if (i == sizeof(kw) - 1) + goto out; // keyword too long + c = fetch_byte(f); + } while ((c >= 'a' && c <= 'z') || (c == '_')); + kw[i++] = '\0'; + + /* we have a keyword here */ + for (i = 0; directives[i].name; i++) + { + if (strcmp(directives[i].name, kw) == 0) + { + (*directives[i].fn)(f); + return; + } + } + +/* if we fall through here, we have an unknown directive */ +out: + do_error("invalid preprocessor directive"); + skip_line(f); +} + +/* +Notes: + +Rather than tokenize the entire file, we run through it interpreting +things only as much as we need to in order to identify the following: + +preprocessing directives (#...) +identifiers which might need to be replaced with macros + +We have to interpret strings, character constants, and numbers to prevent +false positives in those situations. + +When we find a preprocessing directive, it is handled with a more +aggressive tokenization process and then intepreted accordingly. + +nlws is used to record the fact that only whitespace has occurred at the +start of a line. Whitespace is defined as comments or isspace(c). It gets +reset to 1 after each EOL character. If a non-whitespace character is +encountered, it is set to -1. If the character processing decides it really +is a whitespace character, it will set nlws back to 1 (block comment). +Elsewise, it will get set to 0 if it is still -1 when the loop starts again. + +This is needed so we can identify whitespace interposed before a +preprocessor directive. This is the only case where it matters for +the preprocessor. + +*/ +void preprocess_file(struct file_stack_e *f) +{ + int c; + int nlws = 1; + + while (1) + { + c = fetch_byte(f); +again: + if (nlws == -1) + nlws = 0; + if (c == CPP_EOF) + { + outchr('\n'); + return; + } + if (c == CPP_EOL) + { + nlws = 1; + outchr('\n'); + continue; + } + + if (!is_whitespace(c)) + nlws = -1; + + if (is_sidchr(c)) + { + // have identifier here - parse it off + char *ident = NULL; + int idlen = 0; + + do + { + ident = lw_realloc(ident, idlen + 1); + ident[idlen++] = c; + ident[idlen] = '\0'; + c = fetch_byte(f); + } while (is_idchr(c)); + + /* do something with the identifier here - macros, etc. */ + outstr(ident); + lw_free(ident); + + goto again; + } + + switch (c) + { + default: + outchr(c); + break; + + case '.': // a number - to prevent seeing an identifier in middle of number + outchr(c); + c = fetch_byte(f); + if (!is_dec(c)) + goto again; + /* fall through */ + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + do + { + outchr(c); + c = fetch_byte(f); + if (c == CPP_EOF) + return; + if (is_ep(c)) + { + outchr(c); + c = fetch_byte(f); + if (c == '-' || c == '+') + { + outchr(c); + c = fetch_byte(f); + } + } + } while ((is_idchr(c)) || (c == '.')); + goto again; + + case '#': + if (nlws) + { + handle_directive(f); + /* note: no need to reset nlws */ + } + else + outchr('#'); + break; + + case '\'': // character constant + outchr('\''); + while ((c = fetch_byte(f)) != '\'') + { + if (c == '\\') + { + outchr('\\'); + c = fetch_byte(f); + } + if (c == CPP_EOL) + { + do_warning("Unterminated character constant"); + goto again; + } + if (c == CPP_EOF) + return; + outchr(c); + } + outchr(c); + break; + + case '"': // strings + outchr(c); + while ((c = fetch_byte(f)) != '"') + { + if (c == '\\') + { + outchr('\\'); + c = fetch_byte(f); + } + if (c == CPP_EOL) + { + do_warning("unterminated string literal"); + goto again; + } + if (c == CPP_EOF) + return; + outchr(c); + } + outchr(c); + break; + + case '/': // comments + c = fetch_byte(f); + if (c == '/') + { + // line comment + outchr(' '); + do + { + c = fetch_byte(f); + } while (c != CPP_EOF && c != CPP_EOL); + } + else if (c == '*') + { + // block comment + for (;;) + { + c = fetch_byte(f); + if (c == CPP_EOF) + { + break; + } + if (c == CPP_EOL) + { + continue; + } + if (c == '*') + { + // maybe end of comment + c = fetch_byte(f); + if (c == '/') + { + // end of comment + break; + } + } + } + // replace comment with a single space + outchr(' '); + if (nlws == -1) + nlws = 1; + continue; + } + else + { + // restore eaten '/' + outchr('/'); + // process the character we just fetched + goto again; + } + } // switch + } // processing loop +} + +int process_file(const char *f) +{ + struct file_stack_e *nf; + FILE *fp; + + fprintf(stderr, "Processing %s\n", f); + + if (strcmp(f, "-") == 0) + fp = stdin; + else + fp = fopen(f, "rb"); + if (fp == NULL) + { + do_warning("Cannot open %s: %s", f, strerror(errno)); + return -1; + } + + /* push the file onto the file stack */ + nf = lw_alloc(sizeof(struct file_stack_e)); + nf -> fn = f; + nf -> fp = fp; + nf -> next = file_stack; + nf -> line = 1; + nf -> col = 0; + nf -> qseen = 0; + nf -> ra = CPP_NOUNG; + nf -> unget = CPP_NOUNG; + file_stack = nf; + + /* go preprocess the file */ + preprocess_file(nf); + + if (nf -> fp != stdin) + fclose(nf -> fp); + file_stack = nf -> next; + lw_free(nf); + return 0; +}