changeset 25:87590f43e76d

Started lwbasic parser; checkpoint
author lost@l-w.ca
date Mon, 24 Jan 2011 20:08:09 -0700
parents 421d7ceb4d86
children 26aa76da75ad
files Makefile lwbasic/compiler.c lwbasic/input.c lwbasic/lexer.c lwbasic/lwbasic.h lwbasic/main.c lwbasic/rules.make
diffstat 7 files changed, 460 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/Makefile	Mon Jan 24 18:31:07 2011 -0700
+++ b/Makefile	Mon Jan 24 20:08:09 2011 -0700
@@ -19,6 +19,7 @@
 CPPFLAGS += -I lwlib -DPACKAGE_STRING='"lwtools 4.0-pre"'
 LDFLAGS += -L$(PWD)/lwlib -llw
 
+CFLAGS ?= -g -Wall
 
 MAIN_TARGETS := lwasm/lwasm$(PROGSUFFIX) \
 	lwlink/lwlink$(PROGSUFFIX) \
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lwbasic/compiler.c	Mon Jan 24 20:08:09 2011 -0700
@@ -0,0 +1,168 @@
+/*
+compiler.c
+
+Copyright © 2011 William Astle
+
+This file is part of LWTOOLS.
+
+LWTOOLS is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software
+Foundation, either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+more details.
+
+You should have received a copy of the GNU General Public License along with
+this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+This is the actual compiler bit; it drives the parser and code generation
+*/
+
+#include <stdio.h>
+
+#include "lwbasic.h"
+
+/*
+Parse a type specification beginning at the current token.
+
+Returns the internal type code (currently only 1, for INTEGER). Any other
+token is reported through lwb_error(). The lexer is always advanced past
+the type token so that callers see a consistent "next token" afterward.
+*/
+static int parse_type(cstate *state)
+{
+	int type_code;
+
+	if (state -> lexer_token == token_kw_integer)
+	{
+		type_code = 1;
+	}
+	else
+	{
+		type_code = -1;
+		lwb_error("Invalid type specification");
+	}
+
+	lexer(state);
+	/* an "unsigned" modifier for integer types would be looked for here; */
+	/* that is not implemented yet */
+	return type_code;
+}
+
+
+/*
+Return printable text for the current token. EOL and EOF tokens carry no
+string (the lexer leaves lexer_token_string NULL for them), and passing
+NULL to a %s conversion is undefined, so substitute a placeholder.
+*/
+static const char *parse_token_text(cstate *state)
+{
+	return state -> lexer_token_string ? state -> lexer_token_string : "(none)";
+}
+
+/*
+Parse the header line of a SUB or FUNCTION declaration. On entry the
+current token is the SUB or FUNCTION keyword itself. The accepted form is:
+
+	name [PUBLIC|PRIVATE] [PARAMS] [param AS type {, param AS type}]
+		[RETURNS [name AS] type] <EOL>
+
+issub means RETURNS is not allowed; !issub means RETURNS is required.
+
+Everything recognized is currently just printed to stdout. Any syntax error
+is reported through lwb_error(), which terminates the program.
+*/
+static void parse_subfunc(cstate *state, int issub)
+{
+	int pt;
+
+	lexer(state);
+	if (state -> lexer_token != token_identifier)
+	{
+		lwb_error("Invalid sub name '%s'\n", parse_token_text(state));
+	}
+
+	printf("<name> = %s\n", state -> lexer_token_string);
+
+	lexer(state);
+	if (state -> lexer_token == token_kw_public || state -> lexer_token == token_kw_private)
+	{
+		printf("<type> = %s\n", state -> lexer_token_string);
+		lexer(state);
+	}
+
+	/* ignore the "PARAMS" keyword if present */
+	if (state -> lexer_token == token_kw_params)
+		lexer(state);
+
+	/* there is no parameter list if the line ends here or if RETURNS */
+	/* follows immediately (RETURNS is a keyword, never a parameter name) */
+	if (state -> lexer_token != token_eol && state -> lexer_token != token_kw_returns)
+	{
+		for (;;)
+		{
+			if (state -> lexer_token != token_identifier)
+			{
+				lwb_error("Parameter name expected, got %d, %s\n", state -> lexer_token, parse_token_text(state));
+			}
+			printf("Got <param> = %s\n", state -> lexer_token_string);
+			lexer(state);
+
+			if (state -> lexer_token != token_kw_as)
+				lwb_error("Expecting AS\n");
+			lexer(state);
+
+			/* parse_type() leaves the token after the type as current */
+			pt = parse_type(state);
+			printf("Got <type> = %d\n", pt);
+
+			/* a comma means another parameter follows */
+			if (!(state -> lexer_token == token_char && state -> lexer_token_string[0] == ','))
+				break;
+			lexer(state);
+		}
+	}
+
+	if (!issub)
+	{
+		int rt;
+
+		if (state -> lexer_token != token_kw_returns)
+		{
+			lwb_error("FUNCTION must have RETURNS\n");
+		}
+		lexer(state);
+
+		/* the return value may optionally be given a name: "name AS type" */
+		if (state -> lexer_token == token_identifier)
+		{
+			printf("Return value named: %s\n", state -> lexer_token_string);
+			lexer(state);
+			if (state -> lexer_token != token_kw_as)
+				lwb_error("Expecting AS after RETURNS\n");
+			lexer(state);
+		}
+		rt = parse_type(state);
+		printf("Return type: %d\n", rt);
+	}
+	else
+	{
+		if (state -> lexer_token == token_kw_returns)
+		{
+			lwb_error("SUB cannot specify RETURNS\n");
+		}
+	}
+
+	if (state -> lexer_token != token_eol)
+	{
+		lwb_error("EOL expected; found %d, %s\n", state -> lexer_token, parse_token_text(state));
+	}
+}
+
+/*
+Top level driver: repeatedly fetch a token in the global parser state and
+dispatch on it until end of file. Only FUNCTION and SUB declarations and
+blank lines are accepted at this level; anything else is reported through
+lwb_error().
+*/
+void compiler(cstate *state)
+{
+	int tok;
+
+	/* -1 tells the lexer that no lookahead character has been read yet */
+	state -> lexer_curchar = -1;
+
+	while (1)
+	{
+		state -> parser_state = parser_state_global;
+		lexer(state);
+		tok = state -> lexer_token;
+
+		/* EOF is allowed - end of parsing */
+		if (tok == token_eof)
+			return;
+
+		/* blank lines are allowed */
+		if (tok == token_eol)
+			continue;
+
+		if (tok == token_kw_function)
+		{
+			printf("Function\n");
+			parse_subfunc(state, 0);
+		}
+		else if (tok == token_kw_sub)
+		{
+			printf("Sub\n");
+			parse_subfunc(state, 1);
+		}
+		else
+		{
+			lwb_error("Invalid token %d, %s in global state\n", state -> lexer_token, state -> lexer_token_string);
+		}
+	}
+}
--- a/lwbasic/input.c	Mon Jan 24 18:31:07 2011 -0700
+++ b/lwbasic/input.c	Mon Jan 24 20:08:09 2011 -0700
@@ -28,6 +28,7 @@
 #include <string.h>
 
 #include <lw_alloc.h>
+#include <lw_error.h>
 
 #define __input_c_seen__
 #include "lwbasic.h"
@@ -54,8 +55,7 @@
 		sp -> fp = fopen(state -> input_file, "rb");
 		if (!(sp -> fp))
 		{
-			fprintf(stderr, "Cannot open input file\n");
-			exit(1);
+			lwb_error("Cannot open input file\n");
 		}
 	}
 	
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lwbasic/lexer.c	Mon Jan 24 20:08:09 2011 -0700
@@ -0,0 +1,216 @@
+/*
+lexer.c
+
+Copyright © 2011 William Astle
+
+This file is part of LWTOOLS.
+
+LWTOOLS is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software
+Foundation, either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+more details.
+
+You should have received a copy of the GNU General Public License along with
+this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+This handles the gritty details of parsing tokens
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <lw_alloc.h>
+#include <lw_string.h>
+
+#define __lexer_c_seen__
+#include "lwbasic.h"
+
+/*
+lexer() stores a token identifier in state->lexer_token. The actual string
+value is found in state->lexer_token_string; if the token has an integer
+value, it will be found in state->lexer_token_number in the appropriate
+"value" slot.
+*/
+
+/* one keyword table entry: the keyword's lowercase spelling and its token */
+struct token_list
+{
+	const char *string;		/* lowercase keyword text; NULL ends a table */
+	int token;				/* token_kw_* value produced for the keyword */
+};
+
+/*
+Keywords recognized by lexer_word(). Matching is done against the
+lowercased word, so entries must be spelled in lowercase. The table is
+terminated by an entry with a NULL string.
+*/
+static struct token_list lexer_global_tokens[] = 
+{
+	{ "function",		token_kw_function },
+	{ "sub",			token_kw_sub },
+	{ "public",			token_kw_public },
+	{ "private",		token_kw_private },
+	{ "as",				token_kw_as },
+	{ "params",			token_kw_params },
+	{ "returns",		token_kw_returns },
+	/* required by the type parser, which only accepts token_kw_integer */
+	{ "integer",		token_kw_integer },
+	{ NULL }
+};
+
+/*
+Fetch one character from the input layer. A result of -2 from
+input_getchar() is treated as a read failure and reported through
+lwb_error(); every other value is passed back to the caller unchanged.
+*/
+static int lexer_getchar(cstate *state)
+{
+	int ch = input_getchar(state);
+
+	if (ch != -2)
+		return ch;
+
+	lwb_error("Error reading input stream.");
+	return ch;
+}
+
+/*
+Advance the lookahead character. If the character read equals the pending
+lexer_ignorechar, it is dropped and the one after it is used instead. The
+ignore request is single use: it is cleared on every call.
+*/
+static void lexer_nextchar(cstate *state)
+{
+	int ch = lexer_getchar(state);
+
+	if (ch == state -> lexer_ignorechar)
+		ch = lexer_getchar(state);
+
+	state -> lexer_curchar = ch;
+	state -> lexer_ignorechar = 0;
+}
+
+/*
+Return the current lookahead character. When the stored lookahead is -1,
+a character is read first.
+*/
+static int lexer_curchar(cstate *state)
+{
+	if (state -> lexer_curchar != -1)
+		return state -> lexer_curchar;
+
+	lexer_nextchar(state);
+	return state -> lexer_curchar;
+}
+
+/*
+Skip over whitespace in the input. NUL, space and tab count as whitespace;
+line terminators do not, since they form tokens of their own.
+*/
+static void lexer_skip_white(cstate *state)
+{
+	int ch = lexer_curchar(state);
+
+	while (ch == 0 || ch == ' ' || ch == '\t')
+	{
+		lexer_nextchar(state);
+		ch = lexer_curchar(state);
+	}
+}
+
+/*
+Collect a word (letters, '_' and any character >= 0x80) starting at the
+current character and classify it.
+
+The word exactly as written is stored in state->lexer_token_string. A
+lowercased copy is then looked up in the keyword table; on a match
+state->lexer_token is set to that keyword's token, otherwise to
+token_identifier.
+
+Callers are expected to call this only when the current character starts a
+word. If they do not, the result is an identifier with an empty string.
+*/
+static void lexer_word(cstate *state)
+{
+	int wordlen = 0;
+	int wordpos = 0;
+	char *word = NULL;
+	int c;
+	struct token_list *tok = NULL;
+
+	for (;;) {
+		/* grow first so there is always room for one more character */
+		/* plus the terminating NUL, even for an empty word */
+		if (wordpos + 1 >= wordlen)
+		{
+			wordlen += 32;
+			word = lw_realloc(word, wordlen);
+		}
+
+		c = lexer_curchar(state);
+		if (!(c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80))
+			break;
+
+		/* character is part of word */
+		word[wordpos++] = c;
+		lexer_nextchar(state);
+	}
+
+	word[wordpos] = 0;
+
+	/* the token string keeps the original case */
+	lw_free(state -> lexer_token_string);
+	state -> lexer_token_string = lw_strdup(word);
+
+	/* select the keyword table; every parser state uses the global one */
+	switch (state -> parser_state)
+	{
+	default:
+		tok = lexer_global_tokens;
+	}
+
+	/* check for tokens if appropriate */
+	if (tok)
+	{
+		/* force lowercase so keyword matching is case insensitive */
+		for (c = 0; word[c]; c++)
+			if (word[c] >= 'A' && word[c] <= 'Z')
+				word[c] = word[c] + 0x20;
+
+		/* stops on the matching entry or on the NULL terminator */
+		while (tok -> string)
+		{
+			if (strcmp(tok -> string, word) == 0)
+				break;
+			tok++;
+		}
+	}
+
+	lw_free(word);
+	if (tok && tok -> string)
+		state -> lexer_token = tok -> token;
+	else
+		state -> lexer_token = token_identifier;
+}
+
+/* Discard the text of the previous token, leaving the token string NULL. */
+static void lexer_empty_token(cstate *state)
+{
+	char *old_text = state -> lexer_token_string;
+
+	state -> lexer_token_string = NULL;
+	lw_free(old_text);
+}
+
+/*
+Fetch the next token from the input.
+
+On return state->lexer_token holds the token type. For words and single
+characters state->lexer_token_string holds the token text; for end of line
+and end of file it is NULL.
+
+A line terminator may be LF, CR, CR LF or LF CR; each produces exactly one
+token_eol.
+*/
+void lexer(cstate *state)
+{
+	int c;
+
+	lexer_skip_white(state);
+
+	lexer_empty_token(state);
+
+	c = lexer_curchar(state);
+	if (c == -1)
+	{
+		state -> lexer_token = token_eof;
+		return;
+	}
+
+	if (c == '\n' || c == '\r')
+	{
+		/* The partner character of a two character line ending must be */
+		/* registered BEFORE advancing: lexer_nextchar() reads the next */
+		/* character immediately and only skips it if the ignore */
+		/* character is already set at that point. */
+		state -> lexer_ignorechar = (c == '\n') ? '\r' : '\n';
+		lexer_nextchar(state);
+		state -> lexer_token = token_eol;
+		return;
+	}
+
+	if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80)
+	{
+		/* we have a word here; identifier, keyword, etc. */
+		lexer_word(state);
+		return;
+	}
+
+	/* return the character if all else fails */
+	state -> lexer_token_string = lw_realloc(state -> lexer_token_string, 2);
+	state -> lexer_token_string[0] = c;
+	state -> lexer_token_string[1] = 0;
+	lexer_nextchar(state);
+	state -> lexer_token = token_char;
+	return;
+}
--- a/lwbasic/lwbasic.h	Mon Jan 24 18:31:07 2011 -0700
+++ b/lwbasic/lwbasic.h	Mon Jan 24 20:08:09 2011 -0700
@@ -26,18 +26,73 @@
 #ifndef __lwbasic_h_seen__
 #define __lwbasic_h_seen__
 
+#include <stdint.h>
+
+/* note: integer and uinteger will be the same for positive values from 0
+through 0x7FFFFFFF; the unsigned type should be used for doing ascii
+conversions and then if a negative value was discovered, it should be
+negated IFF it is in range. */
+
+union lexer_numbers
+{
+	uint32_t uinteger;	/* the value viewed as an unsigned 32 bit integer */
+	int32_t integer;	/* the same storage viewed as a signed 32 bit integer */
+};
+
 typedef struct
 {
 	char *output_file;
 	char *input_file;
 	
 	int debug_level;
+
+	char *lexer_token_string;	/* text of the current token; NULL for EOL and EOF tokens */
+	union lexer_numbers lexer_token_number;	/* integer value slot for numeric tokens */
+	int lexer_token;	/* type of the current token (a token_* value) */
+	int lexer_curchar;	/* lookahead character; -1 means none read yet or end of input */
+	int lexer_ignorechar;	/* character to drop once if it is read next (CR/LF pairing); cleared to 0 after each read */
+	
+	int parser_state;	/* current parser_state_* value; selects the keyword table in the lexer */
 	
 	void *input_state;
 } cstate;
 
+/* parser states; stored in cstate.parser_state */
+enum
+{
+	parser_state_global = 0,			/* only global decls allowed */
+	parser_state_error	/* not referenced in the visible code; presumably an error state - TODO confirm */
+};
+
+/* token types; stored in cstate.lexer_token by lexer() */
+enum
+{
+	token_kw_sub,				/* SUB keyword */
+	token_kw_function,			/* FUNCTION keyword */
+	token_kw_as,				/* AS keyword */
+	token_kw_public,			/* PUBLIC keyword */
+	token_kw_private,			/* PRIVATE keyword */
+	token_kw_params,			/* PARAMS keyword */
+	token_kw_returns,			/* RETURNS keyword */
+	token_kw_integer,			/* INTEGER keyword */
+	token_identifier,			/* an identifier (variable, function, etc.) */
+	token_char,					/* single character; fallback */
+	token_uint,					/* unsigned integer up to 32 bits */
+	token_int,					/* signed integer up to 32 bits */
+	token_eol,					/* end of line */
+	token_eof					/* end of file */
+};
+
 #ifndef __input_c_seen__
 extern int input_getchar(cstate *state);
 #endif
 
+#ifndef __main_c_seen__
+extern void lwb_error(const char *fmt, ...);	/* printf-style message to stderr, then exit(1) */
+#endif
+
+#ifndef __lexer_c_seen__
+extern void lexer(cstate *state);	/* fetch the next token into state */
+#endif
+
 #endif /* __lwbasic_h_seen__ */
--- a/lwbasic/main.c	Mon Jan 24 18:31:07 2011 -0700
+++ b/lwbasic/main.c	Mon Jan 24 20:08:09 2011 -0700
@@ -25,11 +25,13 @@
 
 #include <stdlib.h>
 #include <stdio.h>
+#include <stdarg.h>
 
 #include <lw_cmdline.h>
 #include <lw_string.h>
 #include <lw_alloc.h>
 
+#define __main_c_seen__
 #include "lwbasic.h"
 
 #define PROGVER "lwbasic from " PACKAGE_STRING
@@ -90,11 +92,26 @@
 	PROGVER
 };
 
+extern void compiler(cstate *state);	/* defined in compiler.c */
+
 int main(int argc, char **argv)
 {
 	cstate state = { 0 };
 
 	lw_cmdline_parse(&cmdline_parser, argc, argv, 0, 0, &state);
 
+	compiler(&state);	/* parse the input; errors terminate the program via lwb_error() */
+
 	exit(0);
 }
+
+/*
+Report a fatal error: write the printf-style formatted message to stderr
+and terminate the process with exit status 1. This function does not
+return. No newline is appended; callers supply one in fmt if wanted.
+*/
+void lwb_error(const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	(void) vfprintf(stderr, fmt, ap);
+	va_end(ap);
+
+	exit(1);
+}
--- a/lwbasic/rules.make	Mon Jan 24 18:31:07 2011 -0700
+++ b/lwbasic/rules.make	Mon Jan 24 20:08:09 2011 -0700
@@ -1,7 +1,7 @@
 dirname := $(dir $(lastword $(MAKEFILE_LIST)))
 lwbasic_dir := $(dirname)
 
-lwbasic_lsrcs := main.c input.c
+lwbasic_lsrcs := main.c input.c compiler.c lexer.c
 
 lwbasic_srcs := $(addprefix $(dirname),$(lwbasic_lsrcs))
 lwbasic_objs := $(lwbasic_srcs:.c=.o)