diff lwcc/lex.c @ 295:4b17780f2777 ccdev

Checkpoint lwcc development Changed tactics with the preprocessor. Instead of getting clever and trying to do things the "fast" way, instead, just tokenize the whole input and process it that way. Also, set up so the preprocessor and compiler can be integrated instead of having to have a specifically correct output for the preprocessed file. Also removed the subdirectories in the lwcc directory. It made things more complicated than they needed to be.
author William Astle <lost@l-w.ca>
date Thu, 12 Sep 2013 22:06:26 -0600
parents
children 83fcc1ed6ad6
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lwcc/lex.c	Thu Sep 12 22:06:26 2013 -0600
@@ -0,0 +1,737 @@
+/*
+lwcc/lex.c
+
+Copyright © 2013 William Astle
+
+This file is part of LWTOOLS.
+
+LWTOOLS is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software
+Foundation, either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+more details.
+
+You should have received a copy of the GNU General Public License along with
+this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <ctype.h>
+#include <stdio.h>
+
+#include <lw_alloc.h>
+
+#include "cpp.h"
+#include "strbuf.h"
+#include "token.h"
+
+/* Read one raw byte from the current input file (pp -> fp).
+
+   Returns CPP_EOF when getc() reports EOF, CPP_EOL for a line ending and
+   the byte itself otherwise. A line ending is any of CR, LF, CRLF or
+   LFCR. CPP_EOL is reported as soon as the first CR or LF is seen;
+   pp -> eolstate remembers which of the two it was (1 for CR, 2 for LF)
+   so that the complementary byte, if it follows immediately, can be
+   discarded at the start of the *next* call.
+
+   Line and column bookkeeping is done here as well: the call that follows
+   an end of line increments pp -> lineno and restarts pp -> column, and
+   every call advances pp -> column by one.
+*/
+static int fetch_byte_ll(struct preproc_info *pp)
+{
+	int pending = pp -> eolstate;
+	int c;
+
+	/* the previous call ended a line, so this byte starts a new one */
+	if (pending != 0)
+	{
+		pp -> lineno++;
+		pp -> column = 0;
+	}
+
+	c = getc(pp -> fp);
+	pp -> column++;
+
+	if (pending == 1 || pending == 2)
+	{
+		/* after CR (state 1) munch an LF; after LF (state 2) munch a CR */
+		int partner = (pending == 1) ? 10 : 13;
+
+		if (c == partner)
+			c = getc(pp -> fp);
+		pp -> eolstate = 0;
+	}
+
+	switch (c)
+	{
+	case 10:
+		/* LF - end of line, flag to munch a following CR */
+		pp -> eolstate = 2;
+		return CPP_EOL;
+
+	case 13:
+		/* CR - end of line, flag to munch a following LF */
+		pp -> eolstate = 1;
+		return CPP_EOL;
+
+	case EOF:
+		return CPP_EOF;
+
+	default:
+		return c;
+	}
+}
+
+/* Map the final character of a "??x" trigraph onto the character the
+   trigraph stands for. Returns 0 when c does not complete a trigraph. */
+static int trigraph_replacement(int c)
+{
+	switch (c)
+	{
+	case '=':	return '#';
+	case '/':	return '\\';
+	case '\'':	return '^';
+	case '(':	return '[';
+	case ')':	return ']';
+	case '!':	return '|';
+	case '<':	return '{';
+	case '>':	return '}';
+	case '-':	return '~';
+	}
+	return 0;
+}
+
+/* Fetch a byte through fetch_byte_ll() and, when pp -> trigraphs is
+   nonzero, replace trigraph sequences on the way.
+
+   A run of question marks is counted in pp -> qseen. If at least two were
+   seen and the byte after the run completes a trigraph, the last two
+   question marks and that byte collapse into the replacement character.
+   Question marks that are left over are handed out one per call, ahead of
+   the byte that followed them; that byte waits in pp -> ra (CPP_NOUNG
+   means nothing is waiting). */
+static int fetch_byte_tg(struct preproc_info *pp)
+{
+	int c;
+	int repl;
+
+	/* trigraphs disabled: pass the low level byte straight through */
+	if (!pp -> trigraphs)
+		return fetch_byte_ll(pp);
+
+	/* drain what an earlier call left behind: surplus question marks
+	   first, then the byte parked in pp -> ra */
+	if (pp -> ra != CPP_NOUNG)
+	{
+		if (pp -> qseen > 0)
+		{
+			pp -> qseen -= 1;
+			return '?';
+		}
+		c = pp -> ra;
+		pp -> ra = CPP_NOUNG;
+		return c;
+	}
+
+	/* count a run of question marks; c ends up as the byte after the run */
+	for (c = fetch_byte_ll(pp); c == '?'; c = fetch_byte_ll(pp))
+		pp -> qseen++;
+
+	/* the last two question marks together with c may form a trigraph */
+	if (pp -> qseen >= 2)
+	{
+		repl = trigraph_replacement(c);
+		if (repl != 0)
+		{
+			c = repl;
+			pp -> qseen -= 2;
+		}
+	}
+
+	/* question marks still outstanding: return one now and park c */
+	if (pp -> qseen > 0)
+	{
+		pp -> ra = c;
+		c = '?';
+		pp -> qseen--;
+	}
+	return c;
+}
+
+/* Push a byte back onto the front of the input stream read by
+   fetch_byte(). The bytes are kept on a stack (pp -> ungetbuf, with
+   pp -> ungetbufl entries in use out of pp -> ungetbufs allocated), so the
+   byte pushed last is the one fetch_byte() returns first. The stack grows
+   on demand, so any number of bytes can be pushed back. Line and column
+   counting may be incorrect if unfetched characters cross a token
+   boundary.
+
+   c may be an ordinary byte or one of the CPP_EOL / CPP_EOF sentinels. */
+static void preproc_lex_unfetch_byte(struct preproc_info *pp, int c)
+{
+	if (pp -> ungetbufl >= pp -> ungetbufs)
+	{
+		/* Grow by 100 entries. ungetbufs counts entries while the
+		   allocator is given a size in bytes, so scale by the entry
+		   size; passing the bare count under-allocates whenever an
+		   entry is wider than one byte.
+		   NOTE(review): the element type of ungetbuf is declared in
+		   cpp.h - confirm it is wide enough to hold the sentinels. */
+		pp -> ungetbufs += 100;
+		pp -> ungetbuf = lw_realloc(pp -> ungetbuf, pp -> ungetbufs * sizeof(*pp -> ungetbuf));
+	}
+	pp -> ungetbuf[pp -> ungetbufl++] = c;
+}
+
+/* Fetch the next byte of input with backslash-newline splicing applied.
+
+   Bytes pushed back with preproc_lex_unfetch_byte() are returned first,
+   most recently pushed first, and are presumed to have been through the
+   splicing filter already; the push back buffer is released as soon as it
+   runs empty. Otherwise bytes come from fetch_byte_tg(). When a backslash
+   is read, the byte after it is examined: an end of line makes the pair
+   vanish, anything else is kept in pp -> unget and delivered by the next
+   call. */
+static int fetch_byte(struct preproc_info *pp)
+{
+	int c;
+	int next;
+
+	if (pp -> ungetbufl > 0)
+	{
+		c = pp -> ungetbuf[--pp -> ungetbufl];
+		if (pp -> ungetbufl == 0)
+		{
+			/* that was the last pushed back byte - drop the buffer */
+			lw_free(pp -> ungetbuf);
+			pp -> ungetbuf = NULL;
+			pp -> ungetbufs = 0;
+		}
+		return c;
+	}
+
+	for (;;)
+	{
+		/* a byte looked at after an earlier backslash goes first */
+		if (pp -> unget == CPP_NOUNG)
+		{
+			c = fetch_byte_tg(pp);
+		}
+		else
+		{
+			c = pp -> unget;
+			pp -> unget = CPP_NOUNG;
+		}
+
+		if (c != '\\')
+			return c;
+
+		next = fetch_byte_tg(pp);
+		if (next != CPP_EOL)
+		{
+			/* an ordinary backslash: remember what came after it */
+			pp -> unget = next;
+			return c;
+		}
+		/* backslash-newline: discard both and go round again */
+	}
+}
+
+
+
+/*
+Lex a token off the current input file.
+
+Returned tokens are as follows:
+
+* all words starting with [a-zA-Z_] are returned as TOK_IDENT
+* numbers are returned as their appropriate type
+* all whitespace in a sequence, including comments, is returned as
+  a single instance of TOK_WSPACE
+* TOK_EOL is returned in the case of the end of a line
+* TOK_EOF is returned when the end of the file is reached
+* If no TOK_EOL appears before TOK_EOF, a TOK_EOL will be synthesised
+* Any symbolic operator, etc., recognized by C will be returned as such
+  a token
+* TOK_HASH will be returned for a #
+* trigraphs will be interpreted
+* backslash-newline will be interpreted
+* any instance of CR, LF, CRLF, or LFCR will be interpreted as TOK_EOL
+*/
+
+
+/* Fetch the next byte for preproc_lex_next_token().
+
+   On top of fetch_byte() this layer does two things:
+
+   * Every comment is replaced by a single space. A // comment runs up to
+     the end of the line; the CPP_EOL (or CPP_EOF) that ends it is pushed
+     back so the caller still sees it. A block comment runs up to the
+     first star-slash that follows its opening and may span any number of
+     lines. A block comment that is still open at end of file is reported
+     as an error.
+   * The last line is always terminated: if CPP_EOF arrives directly after
+     something other than an end of line, a warning is issued and one
+     CPP_EOL is returned before the CPP_EOF. pp -> eolseen records whether
+     the most recently returned byte was an end of line.
+
+   Returns a byte value, CPP_EOL or CPP_EOF.
+*/
+static int preproc_lex_fetch_byte(struct preproc_info *pp)
+{
+	int c, c2;
+	int star;
+
+	c = fetch_byte(pp);
+	if (c == CPP_EOF)
+	{
+		if (pp -> eolseen == 0)
+		{
+			preproc_throw_warning(pp, "No newline at end of file");
+			pp -> eolseen = 1;
+			return CPP_EOL;
+		}
+		/* eolseen is deliberately left set: resetting it here would make
+		   a further call warn again and invent another end of line */
+		return CPP_EOF;
+	}
+
+	if (c == CPP_EOL)
+	{
+		pp -> eolseen = 1;
+		return c;
+	}
+
+	pp -> eolseen = 0;
+
+	/* only a slash can start a comment */
+	if (c != '/')
+		return c;
+
+	c2 = fetch_byte(pp);
+	if (c2 == '/')
+	{
+		/* single line comment: skip to the end of the line and leave the
+		   terminator in the stream */
+		do
+		{
+			c2 = fetch_byte(pp);
+		}
+		while (c2 != CPP_EOF && c2 != CPP_EOL);
+		preproc_lex_unfetch_byte(pp, c2);
+		return ' ';
+	}
+
+	if (c2 == '*')
+	{
+		/* block comment: read with fetch_byte() so that nothing inside
+		   the comment is itself treated as a comment, and carry on
+		   across line ends. "star" is set while the previous byte was a
+		   star, which also copes with a run of stars before the closing
+		   slash. It starts out clear so the star that opened the comment
+		   cannot also close it. */
+		star = 0;
+		for (;;)
+		{
+			c2 = fetch_byte(pp);
+			if (c2 == CPP_EOF)
+			{
+				preproc_throw_error(pp, "Unterminated comment");
+				/* keep the end of file visible to the next call */
+				preproc_lex_unfetch_byte(pp, c2);
+				break;
+			}
+			if (star && c2 == '/')
+				break;
+			star = (c2 == '*');
+		}
+		return ' ';
+	}
+
+	/* not a comment - restore lookahead character */
+	preproc_lex_unfetch_byte(pp, c2);
+	return c;
+}
+
+/* Nonzero when c is an ordinary byte that isspace() classifies as white
+   space. The range check keeps the CPP_EOL / CPP_EOF sentinels away from
+   isspace(), whose behaviour is undefined for arguments that are neither
+   EOF nor representable as unsigned char.
+   NOTE(review): the sentinel values are defined in cpp.h - this assumes
+   they lie outside 0..255. */
+static int lex_is_space(int c)
+{
+	return c >= 0 && c <= 255 && isspace(c);
+}
+
+/* Look at the next byte: if it equals "second" consume it and return
+   "matched"; otherwise push it back and return "single". */
+static int lex_pair(struct preproc_info *pp, int second, int matched, int single)
+{
+	int c = preproc_lex_fetch_byte(pp);
+
+	if (c == second)
+		return matched;
+	preproc_lex_unfetch_byte(pp, c);
+	return single;
+}
+
+/* Lex one token from the current input file and return it as a token
+   built by token_create(), tagged with the line and column at which the
+   token started and with pp -> fn.
+
+   * a run of white space (comments arrive here as spaces) is TOK_WSPACE
+   * an end of line is TOK_EOL and the end of the file is TOK_EOF; if the
+     end of the file is reached without a TOK_EOL having just been
+     returned, one TOK_EOL is produced first (tracked in pp -> nlseen)
+   * [A-Za-z_][A-Za-z0-9_]* is TOK_IDENT with the spelling as its text
+   * punctuators and operators get their own token types, the longest
+     match winning
+   * character and string constants are TOK_CHR_LIT and TOK_STR_LIT; the
+     text is the body without the quotes, escape sequences left as written
+   * a preprocessing number is collected with its spelling as text
+   * any other byte is TOK_CHAR with the byte as its text
+*/
+struct token *preproc_lex_next_token(struct preproc_info *pp)
+{
+	int sline = pp -> lineno;
+	int scol = pp -> column;
+	char *strval = NULL;
+	int ttype = TOK_NONE;
+	int c, c2;
+	int cl;
+	struct strbuf *strbuf;
+	struct token *t;
+
+	c = preproc_lex_fetch_byte(pp);
+	if (c == CPP_EOF)
+	{
+		/* make sure an end of line token precedes the end of file */
+		if (pp -> nlseen == 0)
+		{
+			c = CPP_EOL;
+		}
+	}
+
+	if (c == CPP_EOF)
+	{
+		ttype = TOK_EOF;
+		goto out;
+	}
+	if (c == CPP_EOL)
+	{
+		pp -> nlseen = 1;
+		ttype = TOK_EOL;
+		goto out;
+	}
+
+	pp -> nlseen = 0;
+	if (lex_is_space(c))
+	{
+		/* the loop can run into CPP_EOL / CPP_EOF, hence lex_is_space() */
+		while (lex_is_space(c))
+			c = preproc_lex_fetch_byte(pp);
+		preproc_lex_unfetch_byte(pp, c);
+		ttype = TOK_WSPACE;
+		goto out;
+	}
+
+	switch (c)
+	{
+	case '?':
+		ttype = TOK_QMARK;
+		goto out;
+
+	case ':':
+		ttype = TOK_COLON;
+		goto out;
+
+	case ',':
+		ttype = TOK_COMMA;
+		goto out;
+
+	case '(':
+		ttype = TOK_OPAREN;
+		goto out;
+
+	case ')':
+		ttype = TOK_CPAREN;
+		goto out;
+
+	case '{':
+		ttype = TOK_OBRACE;
+		goto out;
+
+	case '}':
+		ttype = TOK_CBRACE;
+		goto out;
+
+	case '[':
+		ttype = TOK_OSQUARE;
+		goto out;
+
+	case ']':
+		ttype = TOK_CSQUARE;
+		goto out;
+
+	case '~':
+		ttype = TOK_COM;
+		goto out;
+
+	case ';':
+		ttype = TOK_EOS;
+		goto out;
+
+	/* and now for the possible multi character tokens */
+	case '#':
+		ttype = lex_pair(pp, '#', TOK_DBLHASH, TOK_HASH);
+		goto out;
+
+	case '^':
+		ttype = lex_pair(pp, '=', TOK_XORASS, TOK_XOR);
+		goto out;
+
+	case '!':
+		ttype = lex_pair(pp, '=', TOK_NE, TOK_BNOT);
+		goto out;
+
+	case '*':
+		ttype = lex_pair(pp, '=', TOK_MULASS, TOK_STAR);
+		goto out;
+
+	case '/':
+		ttype = lex_pair(pp, '=', TOK_DIVASS, TOK_DIV);
+		goto out;
+
+	case '=':
+		ttype = lex_pair(pp, '=', TOK_EQ, TOK_ASS);
+		goto out;
+
+	case '%':
+		ttype = lex_pair(pp, '=', TOK_MODASS, TOK_MOD);
+		goto out;
+
+	case '-':
+		ttype = TOK_SUB;
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_SUBASS;
+		else if (c == '-')
+			ttype = TOK_DBLSUB;
+		else if (c == '>')
+			ttype = TOK_ARROW;
+		else
+			preproc_lex_unfetch_byte(pp, c);
+		goto out;
+
+	case '+':
+		ttype = TOK_ADD;
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_ADDASS;
+		else if (c == '+')
+			ttype = TOK_DBLADD;
+		else
+			preproc_lex_unfetch_byte(pp, c);
+		goto out;
+
+	case '&':
+		ttype = TOK_BWAND;
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_BWANDASS;
+		else if (c == '&')
+			ttype = TOK_BAND;
+		else
+			preproc_lex_unfetch_byte(pp, c);
+		goto out;
+
+	case '|':
+		ttype = TOK_BWOR;
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_BWORASS;
+		else if (c == '|')
+			ttype = TOK_BOR;
+		else
+			preproc_lex_unfetch_byte(pp, c);
+		goto out;
+
+	case '<':
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_LE;
+		else if (c == '<')
+			ttype = lex_pair(pp, '=', TOK_LSHASS, TOK_LSH);
+		else
+		{
+			preproc_lex_unfetch_byte(pp, c);
+			ttype = TOK_LT;
+		}
+		goto out;
+
+	case '>':
+		c = preproc_lex_fetch_byte(pp);
+		if (c == '=')
+			ttype = TOK_GE;
+		else if (c == '>')
+			ttype = lex_pair(pp, '=', TOK_RSHASS, TOK_RSH);
+		else
+		{
+			preproc_lex_unfetch_byte(pp, c);
+			ttype = TOK_GT;
+		}
+		goto out;
+
+	case '\'':
+		/* character constant - turns into a  uint */
+chrlit:
+		/* cl counts the characters between the quotes */
+		cl = 0;
+		strbuf = strbuf_new();
+		for (;;)
+		{
+			c = preproc_lex_fetch_byte(pp);
+			if (c == '\'')
+				break;
+			if (c == CPP_EOF || c == CPP_EOL)
+			{
+				/* unterminated constant: leave the line end in the
+				   stream so the caller still gets its TOK_EOL */
+				preproc_lex_unfetch_byte(pp, c);
+				break;
+			}
+			cl++;
+			if (c == '\\')
+			{
+				/* keep the escape as written: backslash plus next byte */
+				strbuf_add(strbuf, '\\');
+				c = preproc_lex_fetch_byte(pp);
+				if (c == CPP_EOF || c == CPP_EOL)
+				{
+					preproc_throw_error(pp, "Invalid character constant");
+					preproc_lex_unfetch_byte(pp, c);
+					break;
+				}
+				cl++;
+				strbuf_add(strbuf, c);
+				continue;
+			}
+			strbuf_add(strbuf, c);
+		}
+		if (cl == 0)
+			preproc_throw_error(pp, "Invalid character constant");
+		strval = strbuf_end(strbuf);
+		ttype = TOK_CHR_LIT;
+		goto out;
+
+	case '"':
+strlit:
+		/* string literal */
+		strbuf = strbuf_new();
+		for (;;)
+		{
+			c = preproc_lex_fetch_byte(pp);
+			if (c == '"')
+				break;
+			if (c == CPP_EOF || c == CPP_EOL)
+			{
+				/* unterminated string: leave the line end in the stream
+				   so the caller still gets its TOK_EOL */
+				preproc_lex_unfetch_byte(pp, c);
+				break;
+			}
+			if (c == '\\')
+			{
+				/* keep the escape as written: backslash plus next byte */
+				strbuf_add(strbuf, '\\');
+				c = preproc_lex_fetch_byte(pp);
+				if (c == CPP_EOF || c == CPP_EOL)
+				{
+					preproc_throw_error(pp, "Invalid string constant");
+					preproc_lex_unfetch_byte(pp, c);
+					break;
+				}
+				strbuf_add(strbuf, c);
+				continue;
+			}
+			strbuf_add(strbuf, c);
+		}
+		strval = strbuf_end(strbuf);
+		ttype = TOK_STR_LIT;
+		goto out;
+
+	case 'L':
+		/* check for wide string or wide char const */
+		/* NOTE(review): the L prefix is not recorded anywhere, so a wide
+		   constant yields the same token as a narrow one */
+		c2 = preproc_lex_fetch_byte(pp);
+		if (c2 == '\'')
+		{
+			goto chrlit;
+		}
+		else if (c2 == '"')
+		{
+			goto strlit;
+		}
+		preproc_lex_unfetch_byte(pp, c2);
+		/* fall through for identifier */
+	case '_':
+	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
+	case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
+	case 's': case 't': case 'u': case 'v': case 'w': case 'x':
+	case 'y': case 'z':
+	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+	case 'G': case 'H': case 'I': case 'J': case 'K':
+	case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
+	case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
+	case 'Y': case 'Z':
+		/* we have an identifier here */
+		strbuf = strbuf_new();
+		strbuf_add(strbuf, c);
+		for (;;)
+		{
+			c = preproc_lex_fetch_byte(pp);
+			if ((c == '_') || (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
+			{
+				strbuf_add(strbuf, c);
+				continue;
+			}
+			else
+			{
+				strbuf_add(strbuf, 0);
+				strval = strbuf_end(strbuf);
+				break;
+			}
+		}
+		preproc_lex_unfetch_byte(pp, c);
+		ttype = TOK_IDENT;
+		goto out;
+
+	case '.':
+		c = preproc_lex_fetch_byte(pp);
+		if (c >= '0' && c <= '9')
+		{
+			/* a dot followed by a digit starts a number */
+			strbuf = strbuf_new();
+			strbuf_add(strbuf, '.');
+			goto numlit;
+		}
+		else if (c == '.')
+		{
+			c2 = preproc_lex_fetch_byte(pp);
+			if (c2 == '.')
+			{
+				ttype = TOK_ELLIPSIS;
+				goto out;
+			}
+			/* only two dots: put the third byte back first, then (below)
+			   the second dot, so they are read again in source order */
+			preproc_lex_unfetch_byte(pp, c2);
+		}
+		preproc_lex_unfetch_byte(pp, c);
+		ttype = TOK_DOT;
+		goto out;
+
+	case '0': case '1': case '2': case '3': case '4':
+	case '5': case '6': case '7': case '8': case '9':
+		strbuf = strbuf_new();
+numlit:
+		/* Collect a preprocessing number: digits, letters, underscores
+		   and dots, plus a sign directly after e, E, p or P.
+		   NOTE(review): ttype is never assigned on this path, so numbers
+		   are created with TOK_NONE; set the numeric token type from
+		   token.h here once it is settled. */
+		strbuf_add(strbuf, c);
+		for (;;)
+		{
+			c = preproc_lex_fetch_byte(pp);
+			if (!((c == '_') || (c == '.') || (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')))
+				break;
+			strbuf_add(strbuf, c);
+			if (c == 'e' || c == 'E' || c == 'p' || c == 'P')
+			{
+				/* an exponent marker may carry a sign */
+				c = preproc_lex_fetch_byte(pp);
+				if (c == '+' || c == '-')
+				{
+					strbuf_add(strbuf, c);
+					continue;
+				}
+				preproc_lex_unfetch_byte(pp, c);
+			}
+		}
+		strval = strbuf_end(strbuf);
+		preproc_lex_unfetch_byte(pp, c);
+		goto out;
+
+	default:
+		/* anything else is passed through as a one character token */
+		ttype = TOK_CHAR;
+		strval = lw_alloc(2);
+		strval[0] = c;
+		strval[1] = 0;
+		break;
+	}
+out:
+	/* strval is released right after the call, so token_create() is
+	   presumably keeping its own copy of the text */
+	t = token_create(ttype, strval, sline, scol, pp -> fn);
+	lw_free(strval);
+	return t;
+}