Mercurial > hg > index.cgi
diff lwcc/cpp/file.c @ 293:c419b3b3d43f ccdev
Checkpoint on lwcc-cpp development
This is a checkpoint with some substantial code cleanups on what is so far
implemented. This should avoid substantial code duplication later.
author | William Astle <lost@l-w.ca> |
---|---|
date | Mon, 09 Sep 2013 23:07:19 -0600 |
parents | 40ecbd5da481 |
children | 048adfee2933 |
line wrap: on
line diff
--- a/lwcc/cpp/file.c Sun Sep 08 21:58:12 2013 -0600 +++ b/lwcc/cpp/file.c Mon Sep 09 23:07:19 2013 -0600 @@ -18,19 +18,6 @@ You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. - -NOTES: - -The function fetch_byte() grabs a byte from the input file. It returns -CPP_EOF if end of file has been reached. The resulting byte has passed -through three filters, in order: - -* All CRLF, LFCR, LF, and CR have been converted to CPP_EOL -* If enabled (--trigraphs), trigraphs have been interpreted -* \\n (backslash-newline) has been processed (eliminated) - -To obtain a byte without processing \\n, call fetch_byte_tg(). - */ #include <errno.h> @@ -43,105 +30,74 @@ struct file_stack_e *file_stack = NULL; -int is_whitespace(int c) -{ - switch (c) - { - case ' ': - case '\t': - case '\r': - case '\n': - return 1; - } - return 0; -} - -int is_sidchr(c) -{ - if (c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) - return 1; - return 0; -} - -int is_idchr(int c) +/* output a byte to the current output stream as long as we aren't in the + middle of a false conditional. CPP_EOL will be converted to '\n' + on output. */ +void outchr(int c) { - if (c >= '0' && c <= '9') - return 1; - return is_sidchr(c); -} - -int is_ep(int c) -{ - if (c == 'e' || c == 'E' || c == 'p' || c == 'P') - return 1; - return 0; -} - -int is_hex(int c) -{ - if (c >= 'a' && c <= 'f') - return 1; - if (c >= 'A' && c <= 'F') - return 1; - if (c >= '0' && c <= '9') - return 1; - return 0; -} - -int is_dec(int c) -{ - if (c >= '0' && c <= '9') - return 1; - return 0; -} - -static void outchr(int c) -{ + if (skip_level) + return; + if (c == CPP_EOL) + c = '\n'; fputc(c, output_fp); } -static void outstr(char *s) +/* output a string to the current output stream as long as we aren't in the + middle of a false conditional */ +void outstr(char *s) { + if (skip_level) + return; while (*s) outchr(*s++); } -int fetch_byte_ll(struct file_stack_e *f) +/* fetch a raw input byte from the current file. Will return CPP_EOF if + EOF is encountered and CPP_EOL if an end of line sequence is encountered. + End of line is defined as either CR, CRLF, LF, or LFCR. CPP_EOL is + returned on the first CR or LF encountered. The complementary CR or LF + is munched, if present, when the *next* character is read. This always + operates on file_stack. + + This function also accounts for line numbers in input files and also + character columns. +*/ +int fetch_byte_ll(void) { int c; - if (f -> eolstate != 0) + if (file_stack -> eolstate != 0) { - f -> line++; - f -> col = 0; + file_stack -> line++; + file_stack -> col = 0; } - c = getc(f -> fp); - f -> col++; - if (f -> eolstate == 1) + c = getc(file_stack -> fp); + file_stack -> col++; + if (file_stack -> eolstate == 1) { // just saw CR, munch LF if (c == 10) - c = getc(f -> fp); - f -> eolstate = 0; + c = getc(file_stack -> fp); + file_stack -> eolstate = 0; } - else if (f -> eolstate == 2) + else if (file_stack -> eolstate == 2) { // just saw LF, much CR if (c == 13) - c = getc(f -> fp); - f -> eolstate = 0; + c = getc(file_stack -> fp); + file_stack -> eolstate = 0; } if (c == 10) { // we have LF - end of line, flag to munch CR - f -> eolstate = 2; + file_stack -> eolstate = 2; c = CPP_EOL; } else if (c == 13) { // we have CR - end of line, flag to munch LF - f -> eolstate = 1; + file_stack -> eolstate = 1; c = CPP_EOL; } else if (c == EOF) @@ -151,454 +107,174 @@ return c; } -int fetch_byte_tg(struct file_stack_e *f) +/* This function takes a sequence of bytes from the _ll function above + and does trigraph interpretation on it, but only if the global + trigraphs is nonzero. */ +int fetch_byte_tg(void) { int c; - + if (!trigraphs) { - c = fetch_byte_ll(f); + c = fetch_byte_ll(); } else { /* we have to do the trigraph shit here */ - if (f -> ra != CPP_NOUNG) + if (file_stack -> ra != CPP_NOUNG) { - if (f -> qseen > 0) + if (file_stack -> qseen > 0) { c = '?'; - f -> qseen -= 1; + file_stack -> qseen -= 1; return c; } else { - c = f -> ra; - f -> ra = CPP_NOUNG; + c = file_stack -> ra; + file_stack -> ra = CPP_NOUNG; return c; } } - c = fetch_byte_ll(f); + c = fetch_byte_ll(); while (c == '?') { - f -> qseen++; - c = fetch_byte_ll(f); + file_stack -> qseen++; + c = fetch_byte_ll(); } - if (f -> qseen >= 2) + if (file_stack -> qseen >= 2) { // we have a trigraph switch (c) { case '=': c = '#'; - f -> qseen -= 2; + file_stack -> qseen -= 2; break; case '/': c = '\\'; - f -> qseen -= 2; + file_stack -> qseen -= 2; break; case '\'': c = '^'; - f -> qseen -= 2; + file_stack -> qseen -= 2; break; case '(': c = '['; - f -> qseen -= 2; + file_stack -> qseen -= 2; break; case ')': c = ']'; - f -> qseen -= 2; + file_stack -> qseen -= 2; break; case '!': c = '|'; - f -> qseen -= 2; + file_stack -> qseen -= 2; break; case '<': c = '{'; - f -> qseen -= 2; + file_stack -> qseen -= 2; break; case '>': c = '}'; - f -> qseen -= 2; + file_stack -> qseen -= 2; break; case '~': c = '~'; - f -> qseen -= 2; + file_stack -> qseen -= 2; break; } - if (f -> qseen > 0) + if (file_stack -> qseen > 0) { - f -> ra = c; + file_stack -> ra = c; c = '?'; - f -> qseen--; + file_stack -> qseen--; } } - else if (f -> qseen > 0) + else if (file_stack -> qseen > 0) { - f -> ra = c; + file_stack -> ra = c; c = '?'; - f -> qseen--; + file_stack -> qseen--; } } return c; } -int fetch_byte(struct file_stack_e *f) +/* This function puts a byte back onto the front of the input stream used + by fetch_byte(). Theoretically, an unlimited number of characters can + be unfetched. Line and column counting may be incorrect if unfetched + characters cross a token boundary. */ +void unfetch_byte(int c) +{ + if (file_stack -> ungetbufl >= file_stack -> ungetbufs) + { + file_stack -> ungetbufs += 100; + file_stack -> ungetbuf = lw_realloc(file_stack -> ungetbuf, file_stack -> ungetbufs); + } + file_stack -> ungetbuf[file_stack -> ungetbufl++] = c; +} + +/* This function retrieves a byte from the input stream. It performs + backslash-newline splicing on the returned bytes. Any character + retrieved from the unfetch buffer is presumed to have already passed + the backslash-newline filter. */ +int fetch_byte(void) { int c; + + if (file_stack -> ungetbufl > 0) + { + file_stack -> ungetbufl--; + c = file_stack -> ungetbuf[file_stack -> ungetbufl]; + if (file_stack -> ungetbufl == 0) + { + lw_free(file_stack -> ungetbuf); + file_stack -> ungetbuf = NULL; + file_stack -> ungetbufs = 0; + } + return c; + } again: - if (f -> unget != CPP_NOUNG) + if (file_stack -> unget != CPP_NOUNG) { - c = f -> unget; - f -> unget = CPP_NOUNG; + c = file_stack -> unget; + file_stack -> unget = CPP_NOUNG; } else { - c = fetch_byte_tg(f); + c = fetch_byte_tg(); } if (c == '\\') { int c2; - c2 = fetch_byte_tg(f); + c2 = fetch_byte_tg(); if (c2 == CPP_EOL) goto again; else - f -> unget = c2; + file_stack -> unget = c2; } - f -> curc = c; + file_stack -> curc = c; return c; } -static void skip_line(struct file_stack_e *f) -{ - int c; - while ((c = fetch_byte(f)) != CPP_EOL && c != CPP_EOF) - /* do nothing */ ; -} - - -struct -{ - char *name; - void (*fn)(struct file_stack_e *); -} directives[] = -{ - { NULL, NULL }, - { NULL, NULL } -}; - -/* -This handles a preprocessing directive. Such a directive goes from the -next character to be retrieved from f until the first instance of CPP_EOL -or CPP_EOF. -*/ -void handle_directive(struct file_stack_e *f) -{ - int c, i; - char kw[20]; - -again: - while ((c = fetch_byte(f)) == ' ' || c == '\t') - /* do nothing */ ; - if (c == '/') - { - // maybe a comment // - c = fetch_byte(f); - if (c == '/') - { - // line comment - skip_line(f); - return; - } - if (c == '*') - { - // block comment - while (1) - { - c = fetch_byte(f); - if (c == CPP_EOF) - return; - if (c == '*') - { - c = fetch_byte(f); - if (c == '/') - { - // end of comment - try again for directive - goto again; - } - if (c == CPP_EOF) - return; - } - } - } - } - - // empty directive - do nothing - if (c == CPP_EOL) - return; - - if (c < 'a' || c > 'z') - goto out; - - i = 0; - do - { - kw[i++] = c; - if (i == sizeof(kw) - 1) - goto out; // keyword too long - c = fetch_byte(f); - } while ((c >= 'a' && c <= 'z') || (c == '_')); - kw[i++] = '\0'; - - /* we have a keyword here */ - for (i = 0; directives[i].name; i++) - { - if (strcmp(directives[i].name, kw) == 0) - { - (*directives[i].fn)(f); - return; - } - } - -/* if we fall through here, we have an unknown directive */ -out: - do_error("invalid preprocessor directive"); - skip_line(f); -} - -/* -Notes: - -Rather than tokenize the entire file, we run through it interpreting -things only as much as we need to in order to identify the following: - -preprocessing directives (#...) -identifiers which might need to be replaced with macros - -We have to interpret strings, character constants, and numbers to prevent -false positives in those situations. - -When we find a preprocessing directive, it is handled with a more -aggressive tokenization process and then intepreted accordingly. - -nlws is used to record the fact that only whitespace has occurred at the -start of a line. Whitespace is defined as comments or isspace(c). It gets -reset to 1 after each EOL character. If a non-whitespace character is -encountered, it is set to -1. If the character processing decides it really -is a whitespace character, it will set nlws back to 1 (block comment). -Elsewise, it will get set to 0 if it is still -1 when the loop starts again. - -This is needed so we can identify whitespace interposed before a -preprocessor directive. This is the only case where it matters for -the preprocessor. - -*/ -void preprocess_file(struct file_stack_e *f) -{ - int c; - int nlws = 1; - - while (1) - { - c = fetch_byte(f); -again: - if (nlws == -1) - nlws = 0; - if (c == CPP_EOF) - { - outchr('\n'); - return; - } - if (c == CPP_EOL) - { - nlws = 1; - outchr('\n'); - continue; - } - - if (!is_whitespace(c)) - nlws = -1; - - if (is_sidchr(c)) - { - // have identifier here - parse it off - char *ident = NULL; - int idlen = 0; - - do - { - ident = lw_realloc(ident, idlen + 1); - ident[idlen++] = c; - ident[idlen] = '\0'; - c = fetch_byte(f); - } while (is_idchr(c)); - - /* do something with the identifier here - macros, etc. */ - outstr(ident); - lw_free(ident); - - goto again; - } - - switch (c) - { - default: - outchr(c); - break; - - case '.': // a number - to prevent seeing an identifier in middle of number - outchr(c); - c = fetch_byte(f); - if (!is_dec(c)) - goto again; - /* fall through */ - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - do - { - outchr(c); - c = fetch_byte(f); - if (c == CPP_EOF) - return; - if (is_ep(c)) - { - outchr(c); - c = fetch_byte(f); - if (c == '-' || c == '+') - { - outchr(c); - c = fetch_byte(f); - } - } - } while ((is_idchr(c)) || (c == '.')); - goto again; - - case '#': - if (nlws) - { - handle_directive(f); - /* note: no need to reset nlws */ - } - else - outchr('#'); - break; - - case '\'': // character constant - outchr('\''); - while ((c = fetch_byte(f)) != '\'') - { - if (c == '\\') - { - outchr('\\'); - c = fetch_byte(f); - } - if (c == CPP_EOL) - { - do_warning("Unterminated character constant"); - goto again; - } - if (c == CPP_EOF) - return; - outchr(c); - } - outchr(c); - break; - - case '"': // strings - outchr(c); - while ((c = fetch_byte(f)) != '"') - { - if (c == '\\') - { - outchr('\\'); - c = fetch_byte(f); - } - if (c == CPP_EOL) - { - do_warning("unterminated string literal"); - goto again; - } - if (c == CPP_EOF) - return; - outchr(c); - } - outchr(c); - break; - - case '/': // comments - c = fetch_byte(f); - if (c == '/') - { - // line comment - outchr(' '); - do - { - c = fetch_byte(f); - } while (c != CPP_EOF && c != CPP_EOL); - } - else if (c == '*') - { - // block comment - for (;;) - { - c = fetch_byte(f); - if (c == CPP_EOF) - { - break; - } - if (c == CPP_EOL) - { - continue; - } - if (c == '*') - { - // maybe end of comment - c = fetch_byte(f); - if (c == '/') - { - // end of comment - break; - } - } - } - // replace comment with a single space - outchr(' '); - if (nlws == -1) - nlws = 1; - continue; - } - else - { - // restore eaten '/' - outchr('/'); - // process the character we just fetched - goto again; - } - } // switch - } // processing loop -} - +/* This function opens (if not stdin) the file f and pushes it onto the + top of the input file stack. It then proceeds to process the file + and return. Nonzero return means the file could not be opened. */ int process_file(const char *f) { - struct file_stack_e *nf; + struct file_stack_e nf; FILE *fp; fprintf(stderr, "Processing %s\n", f); @@ -614,23 +290,24 @@ } /* push the file onto the file stack */ - nf = lw_alloc(sizeof(struct file_stack_e)); - nf -> fn = f; - nf -> fp = fp; - nf -> next = file_stack; - nf -> line = 1; - nf -> col = 0; - nf -> qseen = 0; - nf -> ra = CPP_NOUNG; - nf -> unget = CPP_NOUNG; - file_stack = nf; - + nf.fn = f; + nf.fp = fp; + nf.next = file_stack; + nf.line = 1; + nf.col = 0; + nf.qseen = 0; + nf.ra = CPP_NOUNG; + nf.unget = CPP_NOUNG; + file_stack = &nf; + nf.ungetbuf = NULL; + nf.ungetbufs = 0; + nf.ungetbufl = 0; + /* go preprocess the file */ - preprocess_file(nf); + preprocess_file(); - if (nf -> fp != stdin) - fclose(nf -> fp); - file_stack = nf -> next; - lw_free(nf); + if (nf.fp != stdin) + fclose(nf.fp); + file_stack = nf.next; return 0; }