Mercurial > hg > index.cgi

diff lwcc/cpp/file.c @ 293:c419b3b3d43f ccdev
Checkpoint on lwcc-cpp development. This is a checkpoint with some substantial code cleanups on what is so far implemented. This should avoid substantial code duplication later.
author: William Astle <lost@l-w.ca>
date: Mon, 09 Sep 2013 23:07:19 -0600
parents: 40ecbd5da481
children: 048adfee2933
--- a/lwcc/cpp/file.c	Sun Sep 08 21:58:12 2013 -0600
+++ b/lwcc/cpp/file.c	Mon Sep 09 23:07:19 2013 -0600
@@ -18,19 +18,6 @@
 You should have received a copy of the GNU General Public License along with
 this program. If not, see <http://www.gnu.org/licenses/>.
 
-
-NOTES:
-
-The function fetch_byte() grabs a byte from the input file. It returns
-CPP_EOF if end of file has been reached. The resulting byte has passed
-through three filters, in order:
-
-* All CRLF, LFCR, LF, and CR have been converted to CPP_EOL
-* If enabled (--trigraphs), trigraphs have been interpreted
-* \\n (backslash-newline) has been processed (eliminated)
-
-To obtain a byte without processing \\n, call fetch_byte_tg().
-
 */
 
 #include <errno.h>
@@ -43,105 +30,74 @@
 
 struct file_stack_e *file_stack = NULL;
 
-int is_whitespace(int c)
-{
-	switch (c)
-	{
-	case ' ':
-	case '\t':
-	case '\r':
-	case '\n':
-		return 1;
-	}
-	return 0;
-}
-
-int is_sidchr(c)
-{
-	if (c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
-		return 1;
-	return 0;
-}
-
-int is_idchr(int c)
+/* output a byte to the current output stream as long as we aren't in the
+   middle of a false conditional. CPP_EOL will be converted to '\n'
+   on output. */
+void outchr(int c)
 {
-	if (c >= '0' && c <= '9')
-		return 1;
-	return is_sidchr(c);
-}
-
-int is_ep(int c)
-{
-	if (c == 'e' || c == 'E' || c == 'p' || c == 'P')
-		return 1;
-	return 0;
-}
-
-int is_hex(int c)
-{
-	if (c >= 'a' && c <= 'f')
-		return 1;
-	if (c >= 'A' && c <= 'F')
-		return 1;
-	if (c >= '0' && c <= '9')
-		return 1;
-	return 0;
-}
-
-int is_dec(int c)
-{
-	if (c >= '0' && c <= '9')
-		return 1;
-	return 0;
-}
-
-static void outchr(int c)
-{
+	if (skip_level)
+		return;
+	if (c == CPP_EOL)
+		c = '\n';
 	fputc(c, output_fp);
 }
 
-static void outstr(char *s)
+/* output a string to the current output stream as long as we aren't in the
+   middle of a false conditional */
+void outstr(char *s)
 {
+	if (skip_level)
+		return;
 	while (*s)
 		outchr(*s++);
 }
 
-int fetch_byte_ll(struct file_stack_e *f)
+/* fetch a raw input byte from the current file. Will return CPP_EOF if
+   EOF is encountered and CPP_EOL if an end of line sequence is encountered.
+   End of line is defined as either CR, CRLF, LF, or LFCR. CPP_EOL is
+   returned on the first CR or LF encountered. The complementary CR or LF
+   is munched, if present, when the *next* character is read. This always
+   operates on file_stack.
+
+   This function also accounts for line numbers in input files and also
+   character columns.
+*/
+int fetch_byte_ll(void)
 {
 	int c;
 
-	if (f -> eolstate != 0)	
+	if (file_stack -> eolstate != 0)	
 	{
-		f -> line++;
-		f -> col = 0;
+		file_stack -> line++;
+		file_stack -> col = 0;
 	}
-	c = getc(f -> fp);
-	f -> col++;
-	if (f -> eolstate == 1)
+	c = getc(file_stack -> fp);
+	file_stack -> col++;
+	if (file_stack -> eolstate == 1)
 	{
 		// just saw CR, munch LF
 		if (c == 10)
-			c = getc(f -> fp);
-		f -> eolstate = 0;
+			c = getc(file_stack -> fp);
+		file_stack -> eolstate = 0;
 	}
-	else if (f -> eolstate == 2)
+	else if (file_stack -> eolstate == 2)
 	{
 		// just saw LF, munch CR
 		if (c == 13)
-			c = getc(f -> fp);
-		f -> eolstate = 0;
+			c = getc(file_stack -> fp);
+		file_stack -> eolstate = 0;
 	}
 	
 	if (c == 10)
 	{
 		// we have LF - end of line, flag to munch CR
-		f -> eolstate = 2;
+		file_stack -> eolstate = 2;
 		c = CPP_EOL;
 	}
 	else if (c == 13)
 	{
 		// we have CR - end of line, flag to munch LF
-		f -> eolstate = 1;
+		file_stack -> eolstate = 1;
 		c = CPP_EOL;
 	}
 	else if (c == EOF)
@@ -151,454 +107,174 @@
 	return c;
 }
 
-int fetch_byte_tg(struct file_stack_e *f)
+/* This function takes a sequence of bytes from the _ll function above
+   and does trigraph interpretation on it, but only if the global
+   trigraphs is nonzero. */
+int fetch_byte_tg(void)
 {
 	int c;
-
+	
 	if (!trigraphs)
 	{
-		c = fetch_byte_ll(f);
+		c = fetch_byte_ll();
 	}
 	else
 	{
 		/* we have to do the trigraph shit here */
-		if (f -> ra != CPP_NOUNG)
+		if (file_stack -> ra != CPP_NOUNG)
 		{
-			if (f -> qseen > 0)
+			if (file_stack -> qseen > 0)
 			{
 				c = '?';
-				f -> qseen -= 1;
+				file_stack -> qseen -= 1;
 				return c;
 			}
 			else
 			{
-				c = f -> ra;
-				f -> ra = CPP_NOUNG;
+				c = file_stack -> ra;
+				file_stack -> ra = CPP_NOUNG;
 				return c;
 			}
 		}
 	
-		c = fetch_byte_ll(f);
+		c = fetch_byte_ll();
 		while (c == '?')
 		{
-			f -> qseen++;
-			c = fetch_byte_ll(f);
+			file_stack -> qseen++;
+			c = fetch_byte_ll();
 		}
 	
-		if (f -> qseen >= 2)
+		if (file_stack -> qseen >= 2)
 		{
 			// we have a trigraph
 			switch (c)
 			{
 			case '=':
 				c = '#';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 			
 			case '/':
 				c = '\\';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 		
 			case '\'':
 				c = '^';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 		
 			case '(':
 				c = '[';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 		
 			case ')':
 				c = ']';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 		
 			case '!':
 				c = '|';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 		
 			case '<':
 				c = '{';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 		
 			case '>':
 				c = '}';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 		
 			case '~':
 				c = '~';
-				f -> qseen -= 2;
+				file_stack -> qseen -= 2;
 				break;
 			}
-			if (f -> qseen > 0)
+			if (file_stack -> qseen > 0)
 			{
-				f -> ra = c;
+				file_stack -> ra = c;
 				c = '?';
-				f -> qseen--;
+				file_stack -> qseen--;
 			}
 		}
-		else if (f -> qseen > 0)
+		else if (file_stack -> qseen > 0)
 		{
-			f -> ra = c;
+			file_stack -> ra = c;
 			c = '?';
-			f -> qseen--;
+			file_stack -> qseen--;
 		}
 	}
 	return c;
 }
 
-int fetch_byte(struct file_stack_e *f)
+/* This function puts a byte back onto the front of the input stream used
+   by fetch_byte(). Theoretically, an unlimited number of characters can
+   be unfetched. Line and column counting may be incorrect if unfetched
+   characters cross a token boundary. */
+void unfetch_byte(int c)
+{
+	if (file_stack -> ungetbufl >= file_stack -> ungetbufs)
+	{
+		file_stack -> ungetbufs += 100;
+		file_stack -> ungetbuf = lw_realloc(file_stack -> ungetbuf, file_stack -> ungetbufs);
+	}
+	file_stack -> ungetbuf[file_stack -> ungetbufl++] = c;
+}
+
+/* This function retrieves a byte from the input stream. It performs
+   backslash-newline splicing on the returned bytes. Any character
+   retrieved from the unfetch buffer is presumed to have already passed
+   the backslash-newline filter. */
+int fetch_byte(void)
 {
 	int c;
+
+	if (file_stack -> ungetbufl > 0)
+	{
+		file_stack -> ungetbufl--;
+		c = file_stack -> ungetbuf[file_stack -> ungetbufl];
+		if (file_stack -> ungetbufl == 0)
+		{
+			lw_free(file_stack -> ungetbuf);
+			file_stack -> ungetbuf = NULL;
+			file_stack -> ungetbufs = 0;
+		}
+		return c;
+	}
 	
 again:
-	if (f -> unget != CPP_NOUNG)
+	if (file_stack -> unget != CPP_NOUNG)
 	{
-		c = f -> unget;
-		f -> unget = CPP_NOUNG;
+		c = file_stack -> unget;
+		file_stack -> unget = CPP_NOUNG;
 	}
 	else
 	{
-		c = fetch_byte_tg(f);
+		c = fetch_byte_tg();
 	}
 	if (c == '\\')
 	{
 		int c2;
-		c2 = fetch_byte_tg(f);
+		c2 = fetch_byte_tg();
 		if (c2 == CPP_EOL)
 			goto again;
 		else
-			f -> unget = c2;
+			file_stack -> unget = c2;
 	}
-	f -> curc = c;
+	file_stack -> curc = c;
 	return c;
 }
 
-static void skip_line(struct file_stack_e *f)
-{
-	int c;
-	while ((c = fetch_byte(f)) != CPP_EOL && c != CPP_EOF)
-		/* do nothing */ ;
-}
-
-
-struct
-{
-	char *name;
-	void (*fn)(struct file_stack_e *);
-} directives[] =
-{
-	{ NULL, NULL },
-	{ NULL, NULL }
-};
-
-/*
-This handles a preprocessing directive. Such a directive goes from the
-next character to be retrieved from f until the first instance of CPP_EOL
-or CPP_EOF.
-*/
-void handle_directive(struct file_stack_e *f)
-{
-	int c, i;
-	char kw[20];
-	
-again:
-	while ((c = fetch_byte(f)) == ' ' || c == '\t')
-		/* do nothing */ ;
-	if (c == '/')
-	{
-		// maybe a comment //
-		c = fetch_byte(f);
-		if (c == '/')
-		{
-			// line comment
-			skip_line(f);
-			return;
-		}
-		if (c == '*')
-		{
-			// block comment
-			while (1)
-			{
-				c = fetch_byte(f);
-				if (c == CPP_EOF)
-					return;
-				if (c == '*')
-				{
-					c = fetch_byte(f);
-					if (c == '/')
-					{
-						// end of comment - try again for directive
-						goto again;
-					}
-					if (c == CPP_EOF)
-						return;
-				}
-			}
-		}
-	}
-	
-	// empty directive - do nothing
-	if (c == CPP_EOL)
-		return;
-	
-	if (c < 'a' || c > 'z')
-		goto out;
-	
-	i = 0;
-	do
-	{
-		kw[i++] = c;
-		if (i == sizeof(kw) - 1)
-			goto out;	// keyword too long
-		c = fetch_byte(f);
-	} while ((c >= 'a' && c <= 'z') || (c == '_'));
-	kw[i++] = '\0';
-	
-	/* we have a keyword here */
-	for (i = 0; directives[i].name; i++)
-	{
-		if (strcmp(directives[i].name, kw) == 0)
-		{
-			(*directives[i].fn)(f);
-			return;
-		}
-	}
-
-/* if we fall through here, we have an unknown directive */
-out:
-	do_error("invalid preprocessor directive");
-	skip_line(f);
-}
-
-/*
-Notes:
-
-Rather than tokenize the entire file, we run through it interpreting
-things only as much as we need to in order to identify the following:
-
-preprocessing directives (#...)
-identifiers which might need to be replaced with macros
-
-We have to interpret strings, character constants, and numbers to prevent
-false positives in those situations.
-
-When we find a preprocessing directive, it is handled with a more
-aggressive tokenization process and then intepreted accordingly.
-
-nlws is used to record the fact that only whitespace has occurred at the
-start of a line. Whitespace is defined as comments or isspace(c). It gets
-reset to 1 after each EOL character. If a non-whitespace character is
-encountered, it is set to -1. If the character processing decides it really
-is a whitespace character, it will set nlws back to 1 (block comment).
-Elsewise, it will get set to 0 if it is still -1 when the loop starts again.
-
-This is needed so we can identify whitespace interposed before a
-preprocessor directive. This is the only case where it matters for
-the preprocessor.
-
-*/
-void preprocess_file(struct file_stack_e *f)
-{
-	int c;
-	int nlws = 1;
-	
-	while (1)
-	{
-		c = fetch_byte(f);
-again:
-		if (nlws == -1)
-			nlws = 0;
-		if (c == CPP_EOF)
-		{
-			outchr('\n');
-			return;
-		}
-		if (c == CPP_EOL)
-		{
-			nlws = 1;
-			outchr('\n');
-			continue;
-		}
-		
-		if (!is_whitespace(c))
-			nlws = -1;
-
-		if (is_sidchr(c))
-		{
-			// have identifier here - parse it off
-			char *ident = NULL;
-			int idlen = 0;
-			
-			do
-			{
-				ident = lw_realloc(ident, idlen + 1);
-				ident[idlen++] = c;
-				ident[idlen] = '\0';
-				c = fetch_byte(f);
-			} while (is_idchr(c));
-			
-			/* do something with the identifier here  - macros, etc. */
-			outstr(ident);
-			lw_free(ident);
-			
-			goto again;
-		}
-		
-		switch (c)
-		{
-		default:
-			outchr(c);
-			break;
-
-		case '.':	// a number - to prevent seeing an identifier in middle of number
-			outchr(c);
-			c = fetch_byte(f);
-			if (!is_dec(c))
-				goto again;
-			/* fall through */
-		case '0':
-		case '1':
-		case '2':
-		case '3':
-		case '4':
-		case '5':
-		case '6':
-		case '7':
-		case '8':
-		case '9':
-			do
-			{
-				outchr(c);
-				c = fetch_byte(f);
-				if (c == CPP_EOF)
-					return;
-				if (is_ep(c))
-				{
-					outchr(c);
-					c = fetch_byte(f);
-					if (c == '-' || c == '+')
-					{
-						outchr(c);
-						c = fetch_byte(f);
-					}
-				}
-			} while ((is_idchr(c)) || (c == '.'));
-			goto again;
-
-		case '#':
-			if (nlws)
-			{
-				handle_directive(f);
-				/* note: no need to reset nlws */
-			}
-			else
-				outchr('#');
-			break;
-		
-		case '\'':	// character constant
-			outchr('\'');
-			while ((c = fetch_byte(f)) != '\'')
-			{
-				if (c == '\\')
-				{
-					outchr('\\');
-					c = fetch_byte(f);
-				}
-				if (c == CPP_EOL)
-				{
-					do_warning("Unterminated character constant");
-					goto again;
-				}
-				if (c == CPP_EOF)
-					return;
-				outchr(c);
-			}
-			outchr(c);
-			break;
-			
-		case '"':	// strings
-			outchr(c);
-			while ((c = fetch_byte(f)) != '"')
-			{
-				if (c == '\\')
-				{
-					outchr('\\');
-					c = fetch_byte(f);
-				}
-				if (c == CPP_EOL)
-				{
-					do_warning("unterminated string literal");
-					goto again;
-				}
-				if (c == CPP_EOF)
-					return;
-				outchr(c);
-			}
-			outchr(c);
-			break;
-			
-		case '/':	// comments
-			c = fetch_byte(f);
-			if (c == '/')
-			{
-				// line comment
-				outchr(' ');
-				do
-				{
-					c = fetch_byte(f);
-				} while (c != CPP_EOF && c != CPP_EOL);
-			}
-			else if (c == '*')
-			{
-				// block comment
-				for (;;)
-				{
-					c = fetch_byte(f);
-					if (c == CPP_EOF)
-					{
-						break;
-					}
-					if (c == CPP_EOL)
-					{
-						continue;
-					}
-					if (c == '*')
-					{
-						// maybe end of comment
-						c = fetch_byte(f);
-						if (c == '/')
-						{
-							// end of comment
-							break;
-						}
-					}
-				}
-				// replace comment with a single space
-				outchr(' ');
-				if (nlws == -1)
-					nlws = 1;
-				continue;
-			}
-			else
-			{
-				// restore eaten '/'
-				outchr('/');
-				// process the character we just fetched
-				goto again;
-			}
-		} // switch
-	} // processing loop
-}
-
+/* This function opens (if not stdin) the file f and pushes it onto the
+   top of the input file stack. It then proceeds to process the file
+   and return. Nonzero return means the file could not be opened. */
 int process_file(const char *f)
 {
-	struct file_stack_e *nf;
+	struct file_stack_e nf;
 	FILE *fp;
 
 	fprintf(stderr, "Processing %s\n", f);
@@ -614,23 +290,24 @@
 	}
 
 	/* push the file onto the file stack */	
-	nf = lw_alloc(sizeof(struct file_stack_e));
-	nf -> fn = f;
-	nf -> fp = fp;
-	nf -> next = file_stack;
-	nf -> line = 1;
-	nf -> col = 0;
-	nf -> qseen = 0;
-	nf -> ra = CPP_NOUNG;
-	nf -> unget = CPP_NOUNG;
-	file_stack = nf;
-
+	nf.fn = f;
+	nf.fp = fp;
+	nf.next = file_stack;
+	nf.line = 1;
+	nf.col = 0;
+	nf.qseen = 0;
+	nf.ra = CPP_NOUNG;
+	nf.unget = CPP_NOUNG;
+	file_stack = &nf;
+	nf.ungetbuf = NULL;
+	nf.ungetbufs = 0;
+	nf.ungetbufl = 0;
+	
 	/* go preprocess the file */
-	preprocess_file(nf);
+	preprocess_file();
 	
-	if (nf -> fp != stdin)
-		fclose(nf -> fp);
-	file_stack = nf -> next;
-	lw_free(nf);
+	if (nf.fp != stdin)
+		fclose(nf.fp);
+	file_stack = nf.next;
 	return 0;
 }
author	William Astle <lost@l-w.ca>
date	Mon, 09 Sep 2013 23:07:19 -0600
parents	40ecbd5da481
children	048adfee2933