view lwbasic/attic/lexer.c @ 207:07e1fac76321

Added pragma to allow non case sensitive symbols. Added "nosymbolcase" and "symbolnocase" pragmas to cause symbols defined while the pragma is in effect to be treated as case insensitive. Also documented the new pragma.
author William Astle <lost@l-w.ca>
date Sat, 09 Jun 2012 15:47:22 -0600
parents cca933d32298
children
line wrap: on
line source

/*
lexer.c

Copyright © 2011 William Astle

This file is part of LWTOOLS.

LWTOOLS is free software: you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
*/

/*
This handles the gritty details of parsing tokens
*/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <lw_alloc.h>
#include <lw_string.h>

#define __lexer_c_seen__
#include "lwbasic.h"

/*
lexer() stores a token identifier in state->lexer_token. The actual string
value is found in state->lexer_token_string; if the token has an integer
value, it will be found in state->lexer_token_number in the appropriate
"value" slot.
*/

/*
One entry of a keyword lookup table: maps the spelling of a word to the
token code reported for it. Tables of these are terminated by an entry
whose string is NULL.
*/
struct token_list
{
	char *string;	/* keyword spelling; compared against the lowercased input word */
	int token;		/* token code stored in state->lexer_token on a match */
};

/*
Statement level keywords. lexer_word() consults this table when
state->expression is not set. Spellings are lowercase because the input
word is lowercased before comparison. The table ends with a NULL string.
*/
static struct token_list lexer_global_tokens[] = 
{
	{ "function",		token_kw_function },
	{ "sub",			token_kw_sub },
	{ "public",			token_kw_public },
	{ "private",		token_kw_private },
	{ "as",				token_kw_as },
	{ "params",			token_kw_params },
	{ "returns",		token_kw_returns },
	{ "integer",		token_kw_integer },
	{ "endsub",			token_kw_endsub },
	{ "endfunction",	token_kw_endfunction },
	{ "dim",			token_kw_dim },
	{ NULL }
};

/*
Operator keywords (logical and bitwise). lexer_word() consults this table
instead of the statement keyword table when state->expression is set.
Spellings are lowercase because the input word is lowercased before
comparison. The table ends with a NULL string.
*/
static struct token_list lexer_expr_tokens[] =
{
	{ "and",			token_op_and },
	{ "or",				token_op_or },
	{ "band",			token_op_band },
	{ "bor", 			token_op_bor },
	{ "bxor",			token_op_bxor },
	{ "xor",			token_op_xor },
	{ "not",			token_op_not },
	{ "bnot",			token_op_bnot },
	{ NULL }
};

/*
Printable names for tokens, indexed directly by token code.

NOTE(review): the order here presumably has to match the token enumeration
declared in lwbasic.h, which is not visible from this file - confirm when
adding or reordering tokens.
*/
static char *lexer_token_names[] =
{
	"SUB",
	"FUNCTION",
	"AS",
	"PUBLIC",
	"PRIVATE",
	"PARAMS",
	"RETURNS",
	"INTEGER",
	"ENDSUB",
	"ENDFUNCTION",
	"DIM",
	"<assignment>",
	"<equality>",
	"<greater>",
	"<less>",
	"<greaterequal>",
	"<lessequal>",
	"<notequal>",
	"<and>",
	"<or>",
	"<xor>",
	"<bitwiseand>",
	"<bitwiseor>",
	"<bitwisexor>",
	"<plus>",
	"<minus>",
	"<times>",
	"<divide>",
	"<modulus>",
	"<openparen>",
	"<closeparen>",
	"<not>",
	"<bitwisenot>",
	"<identifier>",
	"<char>",
	"<uint>",
	"<int>",
	"<eol>",
	"<eof>"
};

/*
Return a printable name for a token code.

token: the token code to look up.

Returns the matching entry of lexer_token_names, or "???" when the code
does not correspond to any entry. The bound is derived from the table
itself so that every entry (including the last one, "<eof>") is reachable
and negative codes can never index outside the table. The returned string
must not be modified or freed by the caller.
*/
char *lexer_token_name(int token)
{
	int count = (int)(sizeof(lexer_token_names) / sizeof(lexer_token_names[0]));

	if (token < 0 || token >= count)
		return "???";
	return lexer_token_names[token];
}

/*
Fetch one character from the input via input_getchar(). A result of -2 is
reported through lwb_error() as a read error; whatever input_getchar()
produced is handed back to the caller either way.
*/
static int lexer_getchar(cstate *state)
{
	int ch = input_getchar(state);

	if (ch == -2)
		lwb_error("Error reading input stream.");

	return ch;
}

/*
Advance the lexer by one character, leaving the result in
state->lexer_curchar. When the character just read equals
state->lexer_ignorechar it is discarded and one further character is read
in its place. The ignore character is cleared on the way out, so it only
ever applies to a single call.
*/
static void lexer_nextchar(cstate *state)
{
	state->lexer_curchar = lexer_getchar(state);

	if (state->lexer_curchar == state->lexer_ignorechar)
	{
		/* drop the character we were told to ignore */
		state->lexer_curchar = lexer_getchar(state);
	}

	state->lexer_ignorechar = 0;
}

/*
Return the current lookahead character. A stored value of -1 means no
character is available yet, in which case one is read first.
*/
static int lexer_curchar(cstate *state)
{
	if (state->lexer_curchar != -1)
		return state->lexer_curchar;

	lexer_nextchar(state);
	return state->lexer_curchar;
}

/*
Consume characters while the current one is a NUL, a space or a tab. On
return the current character is the first one that is none of those.
*/
static void lexer_skip_white(cstate *state)
{
	int ch = lexer_curchar(state);

	while (ch == 0 || ch == ' ' || ch == '\t')
	{
		lexer_nextchar(state);
		ch = lexer_curchar(state);
	}
}

/*
Lex a word (identifier or keyword) starting at the current character.

Characters are collected while they are '_', a digit, an ASCII letter or
have a value of 0x80 or above. The word, with its original case, is stored
in state->lexer_token_string. A lowercased copy is then looked up in the
keyword table selected by the lexer state (the expression table when
state->expression is set, the statement table otherwise); on a match the
keyword's token is stored in state->lexer_token, otherwise the token is
token_identifier.

Callers are expected to invoke this only when the current character starts
a word. If no word character is present the result is an empty string with
token_identifier rather than a crash.
*/
static void lexer_word(cstate *state)
{
	int wordlen = 32;
	int wordpos = 0;
	char *word;
	int c;
	struct token_list *tok = NULL;
	
	/* allocate up front so that word is valid even for an empty word */
	word = lw_alloc(wordlen);
	
	for (;;) {
		c = lexer_curchar(state);
		if (c == '_' || (c >= '0' && c <= '9' ) || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80)
		{
			/*
			character is part of word; always keep one spare byte so the
			terminating NUL written after the loop stays inside the buffer
			*/
			if (wordpos + 1 >= wordlen)
			{
				word = lw_realloc(word, wordlen + 32);
				wordlen += 32;
			}
			word[wordpos++] = c;
		}
		else
			break;
		
		lexer_nextchar(state);
	}
	
	word[wordpos] = 0;
	
	/* the token string keeps the case used in the source text */
	lw_free(state -> lexer_token_string);
	state -> lexer_token_string = lw_strdup(word);
	
	/* select the keyword table; every parser state currently shares one */
	switch (state -> parser_state)
	{
	default:
		tok = lexer_global_tokens;
	}
	
	/* inside expressions only the operator keywords are recognized */
	if (state -> expression)
	{
		tok = lexer_expr_tokens;
	}
	
	/* check for tokens if appropriate */
	if (tok)
	{
		/* force lowercase: the tables are lowercase, making keywords case insensitive */
		for (c = 0; word[c]; c++)
			if (word[c] >= 'A' && word[c] <= 'Z')
				word[c] = word[c] + 0x20;

		/* linear search; stops on a match or on the NULL terminator entry */
		while (tok -> string)
		{
			if (strcmp(tok -> string, word) == 0)
				break;
			tok++;
		}
	}
	
	lw_free(word);
	if (tok && tok -> string)
		state -> lexer_token = tok -> token;
	else
		state -> lexer_token = token_identifier;
}

/*
Lex a run of decimal digits starting at the current character.

state: lexer state; the digits are consumed from its input.
neg:   non-zero when a leading '-' has already been consumed by the caller.

With neg set the result is stored negated in
state->lexer_token_number.integer and the token is token_int; magnitudes
above 0x80000000 are reported with lwb_error(). Otherwise the value is
stored in state->lexer_token_number.uinteger and the token is token_uint.
A literal too large to be accumulated in an unsigned long is also reported
as an overflow instead of silently wrapping around.
*/
static void lexer_parse_number(cstate *state, int neg)
{
	unsigned long tint = 0;
	unsigned long digit;
	int overflow = 0;
	int c;
	
	for (;;)
	{
		c = lexer_curchar(state);
		if (c < '0' || c > '9')
			break;
		
		digit = (unsigned long)(c - '0');
		/* tint * 10 + digit fits exactly when tint <= (max - digit) / 10 */
		if (tint > (~0UL - digit) / 10)
			overflow = 1;
		else
			tint = tint * 10 + digit;
		
		lexer_nextchar(state);
	}
	
	/* end of the number here */
	if (neg)
	{
		if (overflow || tint > 0x80000000)
			lwb_error("Integer overflow\n");
		state -> lexer_token_number.integer = -tint;
		state -> lexer_token = token_int;
	}
	else
	{
		if (overflow)
			lwb_error("Integer overflow\n");
		state -> lexer_token = token_uint;
		state -> lexer_token_number.uinteger = tint;
	}
}

/*
Discard the text of the previous token: the string is released and
state->lexer_token_string is left as NULL.
*/
static void lexer_empty_token(cstate *state)
{
	char *stale = state->lexer_token_string;

	state->lexer_token_string = NULL;
	lw_free(stale);
}

/*
Fetch the next token from the input.

Leading NULs, spaces and tabs are skipped and the previous token string is
discarded. The token code is stored in state->lexer_token:

- token_eof when the current character is -1;
- token_eol for LF or CR; the complementary character directly following
  (CR after LF, LF after CR) is swallowed so a two character line ending
  yields a single token_eol;
- whatever lexer_word() decides for a word starting with '_', an ASCII
  letter or a character of 0x80 and above;
- when state->expression is set: numbers (optionally negative) and the
  comparison, arithmetic and parenthesis operators;
- when state->expression is not set: '=' is token_op_assignment;
- anything else is token_char, with the character itself placed in
  state->lexer_token_string.
*/
void lexer(cstate *state)
{
	int c;

	lexer_skip_white(state);
	
	lexer_empty_token(state);
	
	c = lexer_curchar(state);
	if (c == -1)
	{
		state -> lexer_token = token_eof;
		return;
	}

	if (c == '\n')
	{
		/*
		LF: the ignore character must be armed before advancing because
		lexer_nextchar() applies it to the character it reads and then
		clears it
		*/
		state -> lexer_ignorechar = '\r';
		lexer_nextchar(state);
		state -> lexer_token = token_eol;
		return;
	}
	
	if (c == '\r')
	{
		/* CR: same as above with the roles of CR and LF swapped */
		state -> lexer_ignorechar = '\n';
		lexer_nextchar(state);
		state -> lexer_token = token_eol;
		return;
	}
	
	if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80)
	{
		/* we have a word here; identifier, keyword, etc. */
		lexer_word(state);
		return;
	}

	if (state -> expression && c >= '0' && c <= '9')
	{
		/* we have a number */
		lexer_parse_number(state, 0);
		return;
	}
 
	/* from here on c is consumed; the current character is the one after it */
	lexer_nextchar(state);	
	if (state -> expression)
	{
		if (c == '-' && lexer_curchar(state) >= '0' && lexer_curchar(state) <= '9')
		{
			/* we have a negative number here */
			lexer_parse_number(state, 1);
			return;
		}
		if (c == '=')
		{
			state -> lexer_token = token_op_equality;
			return;
		}
		if (c == '<')
		{
			/* "<=" and "<>" are two character operators */
			if (lexer_curchar(state) == '=')
			{
				lexer_nextchar(state);
				state -> lexer_token = token_op_lessequal;
				return;
			}
			if (lexer_curchar(state) == '>')
			{
				lexer_nextchar(state);
				state -> lexer_token = token_op_notequal;
				return;
			}
			state -> lexer_token = token_op_less;
			return;
		}
		if (c == '>')
		{
			/* ">=" and "><" are two character operators */
			if (lexer_curchar(state) == '=')
			{
				lexer_nextchar(state);
				state -> lexer_token = token_op_greaterequal;
				return;
			}
			if (lexer_curchar(state) == '<')
			{
				state -> lexer_token = token_op_notequal;
				lexer_nextchar(state);
				return;
			}
			state -> lexer_token = token_op_greater;
			return;
		}
		switch(c)
		{
		case '+':
			state -> lexer_token = token_op_plus;
			return;
		
		case '-':
			state -> lexer_token = token_op_minus;
			return;
		
		case '/':
			state -> lexer_token = token_op_divide;
			return;
		
		case '*':
			state -> lexer_token = token_op_times;
			return;
		
		case '%':
			state -> lexer_token = token_op_modulus;
			return;
		
		case '(':
			state -> lexer_token = token_op_oparen;
			return;
		
		case ')':
			state -> lexer_token = token_op_cparen;
			return;
		
		}
	}
	else
	{
		if (c == '=')
		{
			state -> lexer_token = token_op_assignment;
			return;
		}
	}
	
	/* return the character if all else fails */
	state -> lexer_token = token_char;
	state -> lexer_token_string = lw_realloc(state -> lexer_token_string, 2);
	state -> lexer_token_string[0] = c;
	state -> lexer_token_string[1] = 0;
	return;
}

/*
Build a printable description of the current token in the form
"text (name)", where text is state->lexer_token_string and name comes from
lexer_token_name().

Tokens that carry no text leave state->lexer_token_string as NULL; an
empty string is substituted in that case because passing NULL for a %s
conversion is undefined behaviour.

Returns a pointer to a static buffer that is reused (and possibly
reallocated) by the next call; the caller must not free it and must copy
the text if it is needed beyond the next call.
*/
char *lexer_return_token(cstate *state)
{
	static char *buffer = NULL;
	static int buflen = 0;
	const char *text;
	const char *name;
	int l;
	
	if (buflen == 0)
	{
		buffer = lw_alloc(128);
		buflen = 128;
	}

	text = state -> lexer_token_string ? state -> lexer_token_string : "";
	name = lexer_token_name(state -> lexer_token);

	l = snprintf(buffer, buflen, "%s (%s)", text, name);
	if (l >= buflen)
	{
		/* output was truncated: grow to the exact size needed and format again */
		buffer = lw_realloc(buffer, l + 1);
		buflen = l + 1;
		snprintf(buffer, buflen, "%s (%s)", text, name);
	}
	return buffer;
}