Mercurial > hg > index.cgi
diff lwbasic/attic/lexer.c @ 185:cca933d32298
Clean up some mess in lwbasic directory
author | lost@l-w.ca |
---|---|
date | Thu, 22 Dec 2011 18:03:38 -0700 |
parents | lwbasic/lexer.c@5325b640424d |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lwbasic/attic/lexer.c Thu Dec 22 18:03:38 2011 -0700 @@ -0,0 +1,440 @@ +/* +lexer.c + +Copyright © 2011 William Astle + +This file is part of LWTOOLS. + +LWTOOLS is free software: you can redistribute it and/or modify it under the +terms of the GNU General Public License as published by the Free Software +Foundation, either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +more details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* +This handles the gritty details of parsing tokens +*/ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#include <lw_alloc.h> +#include <lw_string.h> + +#define __lexer_c_seen__ +#include "lwbasic.h" + +/* +A token idenfier is returned by lexer(). The actual string value +is found in state->lexer_lexer_token_string; if the token as an integer value, +it will be found in state->lexer_token_number in the appropriate "value" +slot. +*/ + +struct token_list +{ + char *string; + int token; +}; + +/* keywords that appear as part of normal expressions */ +static struct token_list lexer_global_tokens[] = +{ + { "function", token_kw_function }, + { "sub", token_kw_sub }, + { "public", token_kw_public }, + { "private", token_kw_private }, + { "as", token_kw_as }, + { "params", token_kw_params }, + { "returns", token_kw_returns }, + { "integer", token_kw_integer }, + { "endsub", token_kw_endsub }, + { "endfunction", token_kw_endfunction }, + { "dim", token_kw_dim }, + { NULL } +}; + +/* contains "built in" function names */ +static struct token_list lexer_expr_tokens[] = +{ + { "and", token_op_and }, + { "or", token_op_or }, + { "band", token_op_band }, + { "bor", token_op_bor }, + { "bxor", token_op_bxor }, + { "xor", token_op_xor }, + { "not", token_op_not }, + { "bnot", token_op_bnot }, + { NULL } +}; + +static char *lexer_token_names[] = +{ + "SUB", + "FUNCTION", + "AS", + "PUBLIC", + "PRIVATE", + "PARAMS", + "RETURNS", + "INTEGER", + "ENDSUB", + "ENDFUNCTION", + "DIM", + "<assignment>", + "<equality>", + "<greater>", + "<less>", + "<greaterequal>", + "<lessequal>", + "<notequal>", + "<and>", + "<or>", + "<xor>", + "<bitwiseand>", + "<bitwiseor>", + "<bitwisexor>", + "<plus>", + "<minus>", + "<times>", + "<divide>", + "<modulus>", + "<openparen>", + "<closeparen>", + "<not>", + "<bitwisenot>", + "<identifier>", + "<char>", + "<uint>", + "<int>", + "<eol>", + "<eof>" +}; + +char *lexer_token_name(int token) +{ + if (token > token_eol) + return "???"; + return lexer_token_names[token]; +} + +static int lexer_getchar(cstate *state) +{ + int c; + c = input_getchar(state); + if (c == -2) + { + lwb_error("Error reading input stream."); + } + return c; +} + +static void lexer_nextchar(cstate *state) +{ + state -> lexer_curchar = lexer_getchar(state); + if (state -> lexer_curchar == state -> lexer_ignorechar) + state -> lexer_curchar = lexer_getchar(state); + state -> lexer_ignorechar = 0; +} + +static int lexer_curchar(cstate *state) +{ + if (state -> lexer_curchar == -1) + { + lexer_nextchar(state); + } + + return state -> lexer_curchar; +} + +static void lexer_skip_white(cstate *state) +{ + int c; + + for (;;) + { + c = lexer_curchar(state); + if (!(c == 0 || c == ' ' || c == '\t')) + return; + lexer_nextchar(state); + } +} + +/* must not be called unless the word will be non-zero length */ +static void lexer_word(cstate *state) +{ + int wordlen = 0; + int wordpos = 0; + char *word = NULL; + int c; + struct token_list *tok = NULL; + + for (;;) { + c = lexer_curchar(state); + if (c == '_' || (c >= '0' && c <= '9' ) || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80) + { + /* character is part of word */ + if (wordpos >= wordlen) + { + word = lw_realloc(word, wordlen + 32); + wordlen += 32; + } + word[wordpos++] = c; + } + else + break; + + lexer_nextchar(state); + } + + word[wordpos] = 0; + lw_free(state -> lexer_token_string); + state -> lexer_token_string = lw_strdup(word); + + switch (state -> parser_state) + { + default: + tok = lexer_global_tokens; + } + + if (state -> expression) + { + tok = lexer_expr_tokens; + } + + /* check for tokens if appropriate */ + /* force uppercase */ + if (tok) + { + for (c = 0; word[c]; c++) + if (word[c] >= 'A' && word[c] <= 'Z') + word[c] = word[c] + 0x20; + + while (tok -> string) + { + if (strcmp(tok -> string, word) == 0) + break; + tok++; + } + } + + lw_free(word); + if (tok && tok -> string) + state -> lexer_token = tok -> token; + else + state -> lexer_token = token_identifier; +} + +static void lexer_parse_number(cstate *state, int neg) +{ + unsigned long tint = 0; + int c; + + for (;;) + { + c = lexer_curchar(state); + if (c >= '0' && c <= '9') + { + tint *= 10 + (c - '0'); + } + else + { + /* end of the number here */ + if (neg) + { + if (tint > 0x80000000) + lwb_error("Integer overflow\n"); + state -> lexer_token_number.integer = -tint; + state -> lexer_token = token_int; + } + else + { + state -> lexer_token = token_uint; + state -> lexer_token_number.uinteger = tint; + } + return; + } + lexer_nextchar(state); + } +} + +static void lexer_empty_token(cstate *state) +{ + lw_free(state -> lexer_token_string); + state -> lexer_token_string = NULL; +} + +void lexer(cstate *state) +{ + int c; + + lexer_skip_white(state); + + lexer_empty_token(state); + + c = lexer_curchar(state); + if (c == -1) + { + state -> lexer_token = token_eof; + return; + } + + if (c == '\n') + { + /* LF */ + lexer_nextchar(state); + state -> lexer_ignorechar = '\r'; + state -> lexer_token = token_eol; + return; + } + + if (c == '\r') + { + /* CR */ + lexer_nextchar(state); + state -> lexer_ignorechar = '\n'; + state -> lexer_token = token_eol; + return; + } + + if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80) + { + /* we have a word here; identifier, keyword, etc. */ + lexer_word(state); + return; + } + + if (state -> expression && c >= '0' && c <= '9') + { + /* we have a number */ + lexer_parse_number(state, 0); + return; + } + + lexer_nextchar(state); + if (state -> expression) + { + if (c == '-' && lexer_curchar(state) >= '0' && lexer_curchar(state) <= '9') + { + /* we have a negative number here */ + lexer_parse_number(state, 1); + return; + } + if (c == '=') + { + state -> lexer_token = token_op_equality; + return; + } + if (c == '<') + { + if (lexer_curchar(state) == '=') + { + lexer_nextchar(state); + state -> lexer_token = token_op_lessequal; + return; + } + if (lexer_curchar(state) == '>') + { + lexer_nextchar(state); + state -> lexer_token = token_op_notequal; + return; + } + state -> lexer_token = token_op_less; + return; + } + if (c == '>') + { + if (lexer_curchar(state) == '>') + { + lexer_nextchar(state); + state -> lexer_token = token_op_greaterequal; + return; + } + if (lexer_curchar(state) == '<') + { + state -> lexer_token = token_op_notequal; + lexer_nextchar(state); + return; + } + state -> lexer_token = token_op_greater; + return; + } + switch(c) + { + case '+': + state -> lexer_token = token_op_plus; + return; + + case '-': + state -> lexer_token = token_op_minus; + return; + + case '/': + state -> lexer_token = token_op_divide; + return; + + case '*': + state -> lexer_token = token_op_times; + return; + + case '%': + state -> lexer_token = token_op_modulus; + return; + + case '(': + state -> lexer_token = token_op_oparen; + return; + + case ')': + state -> lexer_token = token_op_cparen; + return; + + } + } + else + { + if (c == '=') + { + state -> lexer_token = token_op_assignment; + return; + } + } + + /* return the character if all else fails */ + state -> lexer_token = token_char; + state -> lexer_token_string = lw_realloc(state -> lexer_token_string, 2); + state -> lexer_token_string[0] = c; + state -> lexer_token_string[1] = 0; + return; +} + +char *lexer_return_token(cstate *state) +{ + static char *buffer = NULL; + static int buflen = 0; + int l; + + if (buflen == 0) + { + buffer = lw_alloc(128); + buflen = 128; + } + + l = snprintf(buffer, buflen, "%s (%s)", state -> lexer_token_string, lexer_token_name(state -> lexer_token)); + if (l >= buflen) + { + buffer = lw_realloc(buffer, l + 1); + buflen = l + 1; + snprintf(buffer, buflen, "%s (%s)", state -> lexer_token_string, lexer_token_name(state -> lexer_token)); + } + return buffer; +}