view lwbasic/attic/lexer.c @ 258:ebda5c96665e

Improved stack handling for os9 target in lwlink. Added "stack" as a valid symbol in the __os9 section. All instances of __os9 are now polled for "stack" symbols and the values added to the stack size set in the linker script. The stack size is then added to the final data size of the module. Also set a default minimum stack size of 32 bytes.
author William Astle <lost@l-w.ca>
date Thu, 31 Jan 2013 19:34:54 -0700
parents cca933d32298
children
line wrap: on
line source

/*
lexer.c

Copyright © 2011 William Astle

This file is part of LWTOOLS.

LWTOOLS is free software: you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
*/

/*
This handles the gritty details of parsing tokens
*/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <lw_alloc.h>
#include <lw_string.h>

#define __lexer_c_seen__
#include "lwbasic.h"

/*
A token identifier is stored in state->lexer_token by lexer(). The actual
string value is found in state->lexer_token_string; if the token has an
integer value, it will be found in state->lexer_token_number in the
appropriate "value" slot.
*/

/*
One entry of a keyword lookup table: a keyword spelling and the token
identifier reported when a word matches it. The tables in this file end with
an entry whose string is NULL.
*/
struct token_list
{
	const char *string;	/* keyword spelling (a string literal, never modified); NULL ends a table */
	int token;			/* token identifier stored in state->lexer_token on a match */
};

/*
Keyword table used by lexer_word() when the lexer is not inside an
expression. Spellings are lower case because words are lower-cased before
comparison. The entry with a NULL string terminates the table.
*/
static struct token_list lexer_global_tokens[] =
{
	{ "function",    token_kw_function },
	{ "sub",         token_kw_sub },
	{ "public",      token_kw_public },
	{ "private",     token_kw_private },
	{ "as",          token_kw_as },
	{ "params",      token_kw_params },
	{ "returns",     token_kw_returns },
	{ "integer",     token_kw_integer },
	{ "endsub",      token_kw_endsub },
	{ "endfunction", token_kw_endfunction },
	{ "dim",         token_kw_dim },
	{ NULL,          0 }
};

/*
Keyword table used by lexer_word() while state->expression is set: the
word-spelled logical and bitwise operators. Spellings are lower case because
words are lower-cased before comparison. The entry with a NULL string
terminates the table.
*/
static struct token_list lexer_expr_tokens[] =
{
	{ "and",  token_op_and },
	{ "or",   token_op_or },
	{ "band", token_op_band },
	{ "bor",  token_op_bor },
	{ "bxor", token_op_bxor },
	{ "xor",  token_op_xor },
	{ "not",  token_op_not },
	{ "bnot", token_op_bnot },
	{ NULL,   0 }
};

/*
Printable names for tokens, indexed by token identifier.

NOTE(review): the order must match the numbering of the token identifiers,
which are declared outside this file (presumably in lwbasic.h); keep the two
in sync when adding tokens.
*/
static char *lexer_token_names[] =
{
	"SUB",
	"FUNCTION",
	"AS",
	"PUBLIC",
	"PRIVATE",
	"PARAMS",
	"RETURNS",
	"INTEGER",
	"ENDSUB",
	"ENDFUNCTION",
	"DIM",
	"<assignment>",
	"<equality>",
	"<greater>",
	"<less>",
	"<greaterequal>",
	"<lessequal>",
	"<notequal>",
	"<and>",
	"<or>",
	"<xor>",
	"<bitwiseand>",
	"<bitwiseor>",
	"<bitwisexor>",
	"<plus>",
	"<minus>",
	"<times>",
	"<divide>",
	"<modulus>",
	"<openparen>",
	"<closeparen>",
	"<not>",
	"<bitwisenot>",
	"<identifier>",
	"<char>",
	"<uint>",
	"<int>",
	"<eol>",
	"<eof>"
};

/*
Return a printable name for a token identifier.

Returns the matching entry of lexer_token_names, or "???" when the
identifier is negative or beyond the end of the table. The result points to
static storage and must not be modified or freed.

The upper bound is taken from the table itself so that every entry,
including the final "<eof>", is reachable and the check cannot drift out of
step with the table.
*/
char *lexer_token_name(int token)
{
	int count = (int)(sizeof(lexer_token_names) / sizeof(lexer_token_names[0]));

	if (token < 0 || token >= count)
		return "???";
	return lexer_token_names[token];
}

/*
Fetch the next raw character from the input via input_getchar().

A result of -2 is reported through lwb_error(); the value read is handed
back to the caller unchanged in every case.
*/
static int lexer_getchar(cstate *state)
{
	int ch = input_getchar(state);

	if (ch == -2)
		lwb_error("Error reading input stream.");

	return ch;
}

/*
Advance to the next input character and store it in state->lexer_curchar.

If the character just read equals state->lexer_ignorechar it is dropped and
the character after it is used instead. The ignore character is cleared on
every call, so it influences at most one read.
*/
static void lexer_nextchar(cstate *state)
{
	state->lexer_curchar = lexer_getchar(state);

	if (state->lexer_ignorechar == state->lexer_curchar)
	{
		/* drop the ignored character and take whatever follows it */
		state->lexer_curchar = lexer_getchar(state);
	}

	state->lexer_ignorechar = 0;
}

/*
Return the current input character without consuming it.

When the stored character is -1, a read is attempted first through
lexer_nextchar(); the (possibly still -1) result is then returned.
*/
static int lexer_curchar(cstate *state)
{
	if (state->lexer_curchar != -1)
		return state->lexer_curchar;

	lexer_nextchar(state);
	return state->lexer_curchar;
}

/*
Consume blanks in front of the next token.

Space, tab and the character value 0 are skipped; on return the current
character is the first one that is none of these. Line endings are not
skipped here.
*/
static void lexer_skip_white(cstate *state)
{
	int ch = lexer_curchar(state);

	while (ch == 0 || ch == ' ' || ch == '\t')
	{
		lexer_nextchar(state);
		ch = lexer_curchar(state);
	}
}

/*
Read an identifier or keyword starting at the current character.

Every consecutive word character (underscore, ASCII letter, ASCII digit, or
any character value >= 0x80) is consumed. The text is stored with its
original case in state->lexer_token_string, and state->lexer_token is set to
the matching keyword token or to token_identifier when no keyword matches.
Keyword matching is case insensitive: a lower-cased copy of the word is
compared against lexer_expr_tokens when state->expression is set and against
lexer_global_tokens otherwise.

Callers are expected to invoke this only when the current character starts a
word. If it does not, the result is an identifier token with an empty
string.
*/
static void lexer_word(cstate *state)
{
	int wordlen = 0;	/* bytes allocated for word */
	int wordpos = 0;	/* bytes used in word, not counting the terminator */
	char *word = NULL;
	int c;
	int i;
	struct token_list *tok = NULL;
	
	for (;;)
	{
		/*
		always keep room for one more character plus the terminating NUL;
		without the extra byte a word whose length is a multiple of 32
		would have its terminator written past the end of the buffer
		*/
		if (wordpos + 1 >= wordlen)
		{
			wordlen += 32;
			word = lw_realloc(word, wordlen);
		}
		
		c = lexer_curchar(state);
		if (!(c == '_' || (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80))
			break;
		
		/* character is part of word */
		word[wordpos++] = c;
		lexer_nextchar(state);
	}
	
	word[wordpos] = 0;
	
	/* the token string keeps the spelling exactly as written in the source */
	lw_free(state -> lexer_token_string);
	state -> lexer_token_string = lw_strdup(word);
	
	switch (state -> parser_state)
	{
	default:
		tok = lexer_global_tokens;
	}
	
	if (state -> expression)
	{
		tok = lexer_expr_tokens;
	}
	
	/* check for tokens if appropriate */
	if (tok)
	{
		/* force lowercase; the keyword tables are spelled in lower case */
		for (i = 0; word[i]; i++)
			if (word[i] >= 'A' && word[i] <= 'Z')
				word[i] = word[i] + 0x20;

		/* stop on the first match or on the NULL terminator entry */
		while (tok -> string)
		{
			if (strcmp(tok -> string, word) == 0)
				break;
			tok++;
		}
	}
	
	lw_free(word);
	if (tok && tok -> string)
		state -> lexer_token = tok -> token;
	else
		state -> lexer_token = token_identifier;
}

/*
Parse a run of decimal digits starting at the current character.

state: lexer state; digits are consumed up to the first non-digit, which is
       left as the current character.
neg:   nonzero when the number is to be negated (the caller has already
       consumed the leading '-').

When neg is set the token is token_int and the negated value is stored in
state->lexer_token_number.integer; a magnitude above 0x80000000 is reported
through lwb_error(). Otherwise the token is token_uint and the value is
stored in state->lexer_token_number.uinteger.

NOTE(review): the unsigned case is not range checked, and a very long digit
string can wrap the unsigned long accumulator before the negative range
check is applied.
*/
static void lexer_parse_number(cstate *state, int neg)
{
	unsigned long tint = 0;
	int c;
	
	for (;;)
	{
		c = lexer_curchar(state);
		if (c >= '0' && c <= '9')
		{
			/* shift the accumulated value one decimal place and add the new digit */
			tint = tint * 10 + (unsigned long)(c - '0');
		}
		else
		{
			/* end of the number here */
			if (neg)
			{
				if (tint > 0x80000000)
					lwb_error("Integer overflow\n");
				state -> lexer_token_number.integer = -tint;
				state -> lexer_token = token_int;
			}
			else
			{
				state -> lexer_token = token_uint;
				state -> lexer_token_number.uinteger = tint;
			}
			return;
		}
		lexer_nextchar(state);
	}
}

/*
Discard the text of the previous token: the string held in
state->lexer_token_string is released and the field is reset to NULL.
*/
static void lexer_empty_token(cstate *state)
{
	char *previous = state->lexer_token_string;

	state->lexer_token_string = NULL;
	lw_free(previous);
}

/*
Read the next token from the input.

The token identifier is stored in state->lexer_token. The token text is
stored in state->lexer_token_string for words (via lexer_word()) and for the
single-character fallback token_char; for all other tokens it is left NULL.
Numbers place their value in state->lexer_token_number (via
lexer_parse_number()).

Leading spaces, tabs and 0 characters are skipped. A line ending produces
token_eol; the two-character sequences CR LF and LF CR each count as a
single line ending. End of input (-1) produces token_eof.

Numbers and operators are only recognized while state->expression is set.
Outside expressions '=' is token_op_assignment; inside it is
token_op_equality. Anything not otherwise recognized is returned as
token_char.
*/
void lexer(cstate *state)
{
	int c;

	lexer_skip_white(state);
	
	lexer_empty_token(state);
	
	c = lexer_curchar(state);
	if (c == -1)
	{
		state -> lexer_token = token_eof;
		return;
	}

	if (c == '\n')
	{
		/*
		LF; the ignore character must be armed before advancing because
		lexer_nextchar() both applies and clears it, so this swallows a
		CR that directly follows
		*/
		state -> lexer_ignorechar = '\r';
		lexer_nextchar(state);
		state -> lexer_token = token_eol;
		return;
	}
	
	if (c == '\r')
	{
		/* CR; swallow a directly following LF (see the LF case above) */
		state -> lexer_ignorechar = '\n';
		lexer_nextchar(state);
		state -> lexer_token = token_eol;
		return;
	}
	
	if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80)
	{
		/* we have a word here; identifier, keyword, etc. */
		lexer_word(state);
		return;
	}

	if (state -> expression && c >= '0' && c <= '9')
	{
		/* we have a number */
		lexer_parse_number(state, 0);
		return;
	}
 
	/* consume c; from here on lexer_curchar() is the character after it */
	lexer_nextchar(state);	
	if (state -> expression)
	{
		if (c == '-' && lexer_curchar(state) >= '0' && lexer_curchar(state) <= '9')
		{
			/* we have a negative number here */
			lexer_parse_number(state, 1);
			return;
		}
		if (c == '=')
		{
			state -> lexer_token = token_op_equality;
			return;
		}
		if (c == '<')
		{
			/* "<=", "<>" or plain "<" */
			if (lexer_curchar(state) == '=')
			{
				lexer_nextchar(state);
				state -> lexer_token = token_op_lessequal;
				return;
			}
			if (lexer_curchar(state) == '>')
			{
				lexer_nextchar(state);
				state -> lexer_token = token_op_notequal;
				return;
			}
			state -> lexer_token = token_op_less;
			return;
		}
		if (c == '>')
		{
			/* ">=", "><" or plain ">" */
			if (lexer_curchar(state) == '=')
			{
				lexer_nextchar(state);
				state -> lexer_token = token_op_greaterequal;
				return;
			}
			if (lexer_curchar(state) == '<')
			{
				state -> lexer_token = token_op_notequal;
				lexer_nextchar(state);
				return;
			}
			state -> lexer_token = token_op_greater;
			return;
		}
		switch(c)
		{
		case '+':
			state -> lexer_token = token_op_plus;
			return;
		
		case '-':
			state -> lexer_token = token_op_minus;
			return;
		
		case '/':
			state -> lexer_token = token_op_divide;
			return;
		
		case '*':
			state -> lexer_token = token_op_times;
			return;
		
		case '%':
			state -> lexer_token = token_op_modulus;
			return;
		
		case '(':
			state -> lexer_token = token_op_oparen;
			return;
		
		case ')':
			state -> lexer_token = token_op_cparen;
			return;
		
		}
	}
	else
	{
		if (c == '=')
		{
			state -> lexer_token = token_op_assignment;
			return;
		}
	}
	
	/* return the character if all else fails */
	state -> lexer_token = token_char;
	state -> lexer_token_string = lw_realloc(state -> lexer_token_string, 2);
	state -> lexer_token_string[0] = c;
	state -> lexer_token_string[1] = 0;
	return;
}

/*
Format the current token as "<text> (<name>)" for use in messages.

<text> is state->lexer_token_string, or an empty string when the token has
no text (the field is NULL); <name> comes from lexer_token_name().

The result lives in a single static buffer that is grown as needed and
reused by every call: the caller must not free it, and the contents are
overwritten by the next call.
*/
char *lexer_return_token(cstate *state)
{
	static char *buffer = NULL;
	static int buflen = 0;
	const char *text;
	const char *name;
	int l;
	
	if (buflen == 0)
	{
		buffer = lw_alloc(128);
		buflen = 128;
	}

	/* passing a null pointer to %s is undefined behaviour, so substitute "" */
	text = state -> lexer_token_string;
	if (text == NULL)
		text = "";
	name = lexer_token_name(state -> lexer_token);

	l = snprintf(buffer, buflen, "%s (%s)", text, name);
	if (l >= buflen)
	{
		/* output was truncated; l is the length needed, so grow and format again */
		buffer = lw_realloc(buffer, l + 1);
		buflen = l + 1;
		snprintf(buffer, buflen, "%s (%s)", text, name);
	}
	return buffer;
}