LWTools: lwcc/cpp/file.c comparison

comparison lwcc/cpp/file.c @ 293:c419b3b3d43f ccdev

Checkpoint on lwcc-cpp development. This is a checkpoint with some substantial code cleanups on what has been implemented so far. This should avoid substantial code duplication later.

author	William Astle <lost@l-w.ca>
date	Mon, 09 Sep 2013 23:07:19 -0600
parents	40ecbd5da481
children	048adfee2933

comparison

equal deleted inserted replaced

-:40ecbd5da481
+:c419b3b3d43f
 more details.
 You should have received a copy of the GNU General Public License along with
 this program. If not, see <http://www.gnu.org/licenses/>.
-NOTES:
-The function fetch_byte() grabs a byte from the input file. It returns
-CPP_EOF if end of file has been reached. The resulting byte has passed
-through three filters, in order:
-* All CRLF, LFCR, LF, and CR have been converted to CPP_EOL
-* If enabled (--trigraphs), trigraphs have been interpreted
-* \\n (backslash-newline) has been processed (eliminated)
-To obtain a byte without processing \\n, call fetch_byte_tg().
 */
 #include <errno.h>
 #include <stdio.h>
 #include <string.h>
 #include "cpp.h"
 struct file_stack_e *file_stack = NULL;
-int is_whitespace(int c)
+/* output a byte to the current output stream as long as we aren't in the
-{
+middle of a false conditional. CPP_EOL will be converted to '\n'
-	switch (c)
+on output. */
-	{
+void outchr(int c)
-	case ' ':
+{
-	case '\t':
+	if (skip_level)
-	case '\r':
+		return;
-	case '\n':
+	if (c == CPP_EOL)
-		return 1;
+		c = '\n';
-	}
-	return 0;
-}
-int is_sidchr(c)
-{
-	if (c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
-		return 1;
-	return 0;
-}
-int is_idchr(int c)
-{
-	if (c >= '0' && c <= '9')
-		return 1;
-	return is_sidchr(c);
-}
-int is_ep(int c)
-{
-	if (c == 'e' || c == 'E' || c == 'p' || c == 'P')
-		return 1;
-	return 0;
-}
-int is_hex(int c)
-{
-	if (c >= 'a' && c <= 'f')
-		return 1;
-	if (c >= 'A' && c <= 'F')
-		return 1;
-	if (c >= '0' && c <= '9')
-		return 1;
-	return 0;
-}
-int is_dec(int c)
-{
-	if (c >= '0' && c <= '9')
-		return 1;
-	return 0;
-}
-static void outchr(int c)
-{
 	fputc(c, output_fp);
 }
-static void outstr(char *s)
+/* output a string to the current output stream as long as we aren't in the
-{
+middle of a false conditional */
+void outstr(char *s)
+{
+	if (skip_level)
+		return;
 	while (*s)
 		outchr(*s++);
 }
-int fetch_byte_ll(struct file_stack_e *f)
+/* fetch a raw input byte from the current file. Will return CPP_EOF if
+EOF is encountered and CPP_EOL if an end of line sequence is encountered.
+End of line is defined as either CR, CRLF, LF, or LFCR. CPP_EOL is
+returned on the first CR or LF encountered. The complementary CR or LF
+is munched, if present, when the *next* character is read. This always
+operates on file_stack.
+This function also accounts for line numbers in input files and also
+character columns.
+*/
+int fetch_byte_ll(void)
 {
 	int c;
-	if (f -> eolstate != 0)
+	if (file_stack -> eolstate != 0)
 	{
-		f -> line++;
+		file_stack -> line++;
-		f -> col = 0;
+		file_stack -> col = 0;
 	}
-	c = getc(f -> fp);
+	c = getc(file_stack -> fp);
-	f -> col++;
+	file_stack -> col++;
-	if (f -> eolstate == 1)
+	if (file_stack -> eolstate == 1)
 	{
 		// just saw CR, munch LF
 		if (c == 10)
-			c = getc(f -> fp);
+			c = getc(file_stack -> fp);
-		f -> eolstate = 0;
+		file_stack -> eolstate = 0;
 	}
-	else if (f -> eolstate == 2)
+	else if (file_stack -> eolstate == 2)
 	{
 		// just saw LF, munch CR
 		if (c == 13)
-			c = getc(f -> fp);
+			c = getc(file_stack -> fp);
-		f -> eolstate = 0;
+		file_stack -> eolstate = 0;
 	}
 	if (c == 10)
 	{
 		// we have LF - end of line, flag to munch CR
-		f -> eolstate = 2;
+		file_stack -> eolstate = 2;
 		c = CPP_EOL;
 	}
 	else if (c == 13)
 	{
 		// we have CR - end of line, flag to munch LF
-		f -> eolstate = 1;
+		file_stack -> eolstate = 1;
 		c = CPP_EOL;
 	}
 	else if (c == EOF)
 	{
 		c = CPP_EOF;
 	}
 	return c;
 }
-int fetch_byte_tg(struct file_stack_e *f)
+/* This function takes a sequence of bytes from the _ll function above
+and does trigraph interpretation on it, but only if the global
+trigraphs is nonzero. */
+int fetch_byte_tg(void)
 {
 	int c;
 	if (!trigraphs)
 	{
-		c = fetch_byte_ll(f);
+		c = fetch_byte_ll();
 	}
 	else
 	{
 		/* we have to do the trigraph shit here */
-		if (f -> ra != CPP_NOUNG)
+		if (file_stack -> ra != CPP_NOUNG)
 		{
-			if (f -> qseen > 0)
+			if (file_stack -> qseen > 0)
 			{
 				c = '?';
-				f -> qseen -= 1;
+				file_stack -> qseen -= 1;
 				return c;
 			}
 			else
 			{
-				c = f -> ra;
+				c = file_stack -> ra;
-				f -> ra = CPP_NOUNG;
+				file_stack -> ra = CPP_NOUNG;
 				return c;
 			}
 		}
-		c = fetch_byte_ll(f);
+		c = fetch_byte_ll();
 		while (c == '?')
 		{
-			f -> qseen++;
+			file_stack -> qseen++;
-			c = fetch_byte_ll(f);
+			c = fetch_byte_ll();
 		}
-		if (f -> qseen >= 2)
+		if (file_stack -> qseen >= 2)
 		{
 			// we have a trigraph
 			switch (c)
 			{
 			case '=':
 				c = '#';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 			case '/':
 				c = '\\';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 			case '\'':
 				c = '^';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 			case '(':
 				c = '[';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 			case ')':
 				c = ']';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 			case '!':
 				c = '|';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 			case '<':
 				c = '{';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 			case '>':
 				c = '}';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 			case '~':
 				c = '~';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 			}
-			if (f -> qseen > 0)
+			if (file_stack -> qseen > 0)
 			{
-				f -> ra = c;
+				file_stack -> ra = c;
 				c = '?';
-				f -> qseen--;
+				file_stack -> qseen--;
 			}
 		}
-		else if (f -> qseen > 0)
+		else if (file_stack -> qseen > 0)
 		{
-			f -> ra = c;
+			file_stack -> ra = c;
 			c = '?';
-			f -> qseen--;
+			file_stack -> qseen--;
 		}
 	}
 	return c;
 }
-int fetch_byte(struct file_stack_e *f)
+/* This function puts a byte back onto the front of the input stream used
+by fetch_byte(). Theoretically, an unlimited number of characters can
+be unfetched. Line and column counting may be incorrect if unfetched
+characters cross a token boundary. */
+void unfetch_byte(int c)
+{
+	if (file_stack -> ungetbufl >= file_stack -> ungetbufs)
+	{
+		file_stack -> ungetbufs += 100;
+		file_stack -> ungetbuf = lw_realloc(file_stack -> ungetbuf, file_stack -> ungetbufs);
+	}
+	file_stack -> ungetbuf[file_stack -> ungetbufl++] = c;
+}
+/* This function retrieves a byte from the input stream. It performs
+backslash-newline splicing on the returned bytes. Any character
+retrieved from the unfetch buffer is presumed to have already passed
+the backslash-newline filter. */
+int fetch_byte(void)
 {
 	int c;
+	if (file_stack -> ungetbufl > 0)
+	{
+		file_stack -> ungetbufl--;
+		c = file_stack -> ungetbuf[file_stack -> ungetbufl];
+		if (file_stack -> ungetbufl == 0)
+		{
+			lw_free(file_stack -> ungetbuf);
+			file_stack -> ungetbuf = NULL;
+			file_stack -> ungetbufs = 0;
+		}
+		return c;
+	}
 again:
-	if (f -> unget != CPP_NOUNG)
+	if (file_stack -> unget != CPP_NOUNG)
 	{
-		c = f -> unget;
+		c = file_stack -> unget;
-		f -> unget = CPP_NOUNG;
+		file_stack -> unget = CPP_NOUNG;
 	}
 	else
 	{
-		c = fetch_byte_tg(f);
+		c = fetch_byte_tg();
 	}
 	if (c == '\\')
 	{
 		int c2;
-		c2 = fetch_byte_tg(f);
+		c2 = fetch_byte_tg();
 		if (c2 == CPP_EOL)
 			goto again;
 		else
-			f -> unget = c2;
+			file_stack -> unget = c2;
 	}
-	f -> curc = c;
+	file_stack -> curc = c;
 	return c;
 }
-static void skip_line(struct file_stack_e *f)
+/* This function opens (if not stdin) the file f and pushes it onto the
-{
+top of the input file stack. It then proceeds to process the file
-	int c;
+and return. Nonzero return means the file could not be opened. */
-	while ((c = fetch_byte(f)) != CPP_EOL && c != CPP_EOF)
-		/* do nothing */ ;
-}
-struct
-{
-	char *name;
-	void (*fn)(struct file_stack_e *);
-} directives[] =
-{
-	{ NULL, NULL },
-	{ NULL, NULL }
-};
-/*
-This handles a preprocessing directive. Such a directive goes from the
-next character to be retrieved from f until the first instance of CPP_EOL
-or CPP_EOF.
-*/
-void handle_directive(struct file_stack_e *f)
-{
-	int c, i;
-	char kw[20];
-again:
-	while ((c = fetch_byte(f)) == ' ' || c == '\t')
-		/* do nothing */ ;
-	if (c == '/')
-	{
-		// maybe a comment //
-		c = fetch_byte(f);
-		if (c == '/')
-		{
-			// line comment
-			skip_line(f);
-			return;
-		}
-		if (c == '*')
-		{
-			// block comment
-			while (1)
-			{
-				c = fetch_byte(f);
-				if (c == CPP_EOF)
-					return;
-				if (c == '*')
-				{
-					c = fetch_byte(f);
-					if (c == '/')
-					{
-						// end of comment - try again for directive
-						goto again;
-					}
-					if (c == CPP_EOF)
-						return;
-				}
-			}
-		}
-	}
-	// empty directive - do nothing
-	if (c == CPP_EOL)
-		return;
-	if (c < 'a' || c > 'z')
-		goto out;
-	i = 0;
-	do
-	{
-		kw[i++] = c;
-		if (i == sizeof(kw) - 1)
-			goto out;	// keyword too long
-		c = fetch_byte(f);
-	} while ((c >= 'a' && c <= 'z') || (c == '_'));
-	kw[i++] = '\0';
-	/* we have a keyword here */
-	for (i = 0; directives[i].name; i++)
-	{
-		if (strcmp(directives[i].name, kw) == 0)
-		{
-			(*directives[i].fn)(f);
-			return;
-		}
-	}
-/* if we fall through here, we have an unknown directive */
-out:
-	do_error("invalid preprocessor directive");
-	skip_line(f);
-}
-/*
-Notes:
-Rather than tokenize the entire file, we run through it interpreting
-things only as much as we need to in order to identify the following:
-preprocessing directives (#...)
-identifiers which might need to be replaced with macros
-We have to interpret strings, character constants, and numbers to prevent
-false positives in those situations.
-When we find a preprocessing directive, it is handled with a more
-aggressive tokenization process and then intepreted accordingly.
-nlws is used to record the fact that only whitespace has occurred at the
-start of a line. Whitespace is defined as comments or isspace(c). It gets
-reset to 1 after each EOL character. If a non-whitespace character is
-encountered, it is set to -1. If the character processing decides it really
-is a whitespace character, it will set nlws back to 1 (block comment).
-Elsewise, it will get set to 0 if it is still -1 when the loop starts again.
-This is needed so we can identify whitespace interposed before a
-preprocessor directive. This is the only case where it matters for
-the preprocessor.
-*/
-void preprocess_file(struct file_stack_e *f)
-{
-	int c;
-	int nlws = 1;
-	while (1)
-	{
-		c = fetch_byte(f);
-again:
-		if (nlws == -1)
-			nlws = 0;
-		if (c == CPP_EOF)
-		{
-			outchr('\n');
-			return;
-		}
-		if (c == CPP_EOL)
-		{
-			nlws = 1;
-			outchr('\n');
-			continue;
-		}
-		if (!is_whitespace(c))
-			nlws = -1;
-		if (is_sidchr(c))
-		{
-			// have identifier here - parse it off
-			char *ident = NULL;
-			int idlen = 0;
-			do
-			{
-				ident = lw_realloc(ident, idlen + 1);
-				ident[idlen++] = c;
-				ident[idlen] = '\0';
-				c = fetch_byte(f);
-			} while (is_idchr(c));
-			/* do something with the identifier here  - macros, etc. */
-			outstr(ident);
-			lw_free(ident);
-			goto again;
-		}
-		switch (c)
-		{
-		default:
-			outchr(c);
-			break;
-		case '.':	// a number - to prevent seeing an identifier in middle of number
-			outchr(c);
-			c = fetch_byte(f);
-			if (!is_dec(c))
-				goto again;
-			/* fall through */
-		case '0':
-		case '1':
-		case '2':
-		case '3':
-		case '4':
-		case '5':
-		case '6':
-		case '7':
-		case '8':
-		case '9':
-			do
-			{
-				outchr(c);
-				c = fetch_byte(f);
-				if (c == CPP_EOF)
-					return;
-				if (is_ep(c))
-				{
-					outchr(c);
-					c = fetch_byte(f);
-					if (c == '-' || c == '+')
-					{
-						outchr(c);
-						c = fetch_byte(f);
-					}
-				}
-			} while ((is_idchr(c)) || (c == '.'));
-			goto again;
-		case '#':
-			if (nlws)
-			{
-				handle_directive(f);
-				/* note: no need to reset nlws */
-			}
-			else
-				outchr('#');
-			break;
-		case '\'':	// character constant
-			outchr('\'');
-			while ((c = fetch_byte(f)) != '\'')
-			{
-				if (c == '\\')
-				{
-					outchr('\\');
-					c = fetch_byte(f);
-				}
-				if (c == CPP_EOL)
-				{
-					do_warning("Unterminated character constant");
-					goto again;
-				}
-				if (c == CPP_EOF)
-					return;
-				outchr(c);
-			}
-			outchr(c);
-			break;
-		case '"':	// strings
-			outchr(c);
-			while ((c = fetch_byte(f)) != '"')
-			{
-				if (c == '\\')
-				{
-					outchr('\\');
-					c = fetch_byte(f);
-				}
-				if (c == CPP_EOL)
-				{
-					do_warning("unterminated string literal");
-					goto again;
-				}
-				if (c == CPP_EOF)
-					return;
-				outchr(c);
-			}
-			outchr(c);
-			break;
-		case '/':	// comments
-			c = fetch_byte(f);
-			if (c == '/')
-			{
-				// line comment
-				outchr(' ');
-				do
-				{
-					c = fetch_byte(f);
-				} while (c != CPP_EOF && c != CPP_EOL);
-			}
-			else if (c == '*')
-			{
-				// block comment
-				for (;;)
-				{
-					c = fetch_byte(f);
-					if (c == CPP_EOF)
-					{
-						break;
-					}
-					if (c == CPP_EOL)
-					{
-						continue;
-					}
-					if (c == '*')
-					{
-						// maybe end of comment
-						c = fetch_byte(f);
-						if (c == '/')
-						{
-							// end of comment
-							break;
-						}
-					}
-				}
-				// replace comment with a single space
-				outchr(' ');
-				if (nlws == -1)
-					nlws = 1;
-				continue;
-			}
-			else
-			{
-				// restore eaten '/'
-				outchr('/');
-				// process the character we just fetched
-				goto again;
-			}
-		} // switch
-	} // processing loop
-}
 int process_file(const char *f)
 {
-	struct file_stack_e *nf;
+	struct file_stack_e nf;
 	FILE *fp;
 	fprintf(stderr, "Processing %s\n", f);
 	if (strcmp(f, "-") == 0)
 		do_warning("Cannot open %s: %s", f, strerror(errno));
 		return -1;
 	}
 	/* push the file onto the file stack */
-	nf = lw_alloc(sizeof(struct file_stack_e));
+	nf.fn = f;
-	nf -> fn = f;
+	nf.fp = fp;
-	nf -> fp = fp;
+	nf.next = file_stack;
-	nf -> next = file_stack;
+	nf.line = 1;
-	nf -> line = 1;
+	nf.col = 0;
-	nf -> col = 0;
+	nf.qseen = 0;
-	nf -> qseen = 0;
+	nf.ra = CPP_NOUNG;
-	nf -> ra = CPP_NOUNG;
+	nf.unget = CPP_NOUNG;
-	nf -> unget = CPP_NOUNG;
+	file_stack = &nf;
-	file_stack = nf;
+	nf.ungetbuf = NULL;
+	nf.ungetbufs = 0;
+	nf.ungetbufl = 0;
 	/* go preprocess the file */
-	preprocess_file(nf);
+	preprocess_file();
-	if (nf -> fp != stdin)
+	if (nf.fp != stdin)
-		fclose(nf -> fp);
+		fclose(nf.fp);
-	file_stack = nf -> next;
+	file_stack = nf.next;
-	lw_free(nf);
 	return 0;
 }

Mercurial > hg > index.cgi

comparison lwcc/cpp/file.c @ 293:c419b3b3d43f ccdev