Mercurial > hg > index.cgi
diff lwbasic/lexer.c @ 25:87590f43e76d
Started lwbasic parser; checkpoint
author | lost@l-w.ca |
---|---|
date | Mon, 24 Jan 2011 20:08:09 -0700 |
parents | |
children | 26aa76da75ad |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lwbasic/lexer.c Mon Jan 24 20:08:09 2011 -0700 @@ -0,0 +1,216 @@ +/* +lexer.c + +Copyright © 2011 William Astle + +This file is part of LWTOOLS. + +LWTOOLS is free software: you can redistribute it and/or modify it under the +terms of the GNU General Public License as published by the Free Software +Foundation, either version 3 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +more details. + +You should have received a copy of the GNU General Public License along with +this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* +This handles the gritty details of parsing tokens +*/ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#include <lw_alloc.h> +#include <lw_string.h> + +#define __lexer_c_seen__ +#include "lwbasic.h" + +/* +A token idenfier is returned by lexer(). The actual string value +is found in state->lexer_lexer_token_string; if the token as an integer value, +it will be found in state->lexer_token_number in the appropriate "value" +slot. +*/ + +struct token_list +{ + char *string; + int token; +}; + +static struct token_list lexer_global_tokens[] = +{ + { "function", token_kw_function }, + { "sub", token_kw_sub }, + { "public", token_kw_public }, + { "private", token_kw_private }, + { "as", token_kw_as }, + { "params", token_kw_params }, + { "returns", token_kw_returns }, + { NULL } +}; + +static int lexer_getchar(cstate *state) +{ + int c; + c = input_getchar(state); + if (c == -2) + { + lwb_error("Error reading input stream."); + } + return c; +} + +static void lexer_nextchar(cstate *state) +{ + state -> lexer_curchar = lexer_getchar(state); + if (state -> lexer_curchar == state -> lexer_ignorechar) + state -> lexer_curchar = lexer_getchar(state); + state -> lexer_ignorechar = 0; +} + +static int lexer_curchar(cstate *state) +{ + if (state -> lexer_curchar == -1) + { + lexer_nextchar(state); + } + + return state -> lexer_curchar; +} + +static void lexer_skip_white(cstate *state) +{ + int c; + + for (;;) + { + c = lexer_curchar(state); + if (!(c == 0 || c == ' ' || c == '\t')) + return; + lexer_nextchar(state); + } +} + +/* must not be called unless the word will be non-zero length */ +static void lexer_word(cstate *state) +{ + int wordlen = 0; + int wordpos = 0; + char *word = NULL; + int c; + struct token_list *tok = NULL; + + for (;;) { + c = lexer_curchar(state); + if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80) + { + /* character is part of word */ + if (wordpos >= wordlen) + { + word = lw_realloc(word, wordlen + 32); + wordlen += 32; + } + word[wordpos++] = c; + } + else + break; + + lexer_nextchar(state); + } + + word[wordpos] = 0; + lw_free(state -> lexer_token_string); + state -> lexer_token_string = lw_strdup(word); + + switch (state -> parser_state) + { + default: + tok = lexer_global_tokens; + } + + /* check for tokens if appropriate */ + /* force uppercase */ + if (tok) + { + for (c = 0; word[c]; c++) + if (word[c] >= 'A' && word[c] <= 'Z') + word[c] = word[c] + 0x20; + + while (tok -> string) + { + if (strcmp(tok -> string, word) == 0) + break; + tok++; + } + } + + lw_free(word); + if (tok && tok -> string) + state -> lexer_token = tok -> token; + else + state -> lexer_token = token_identifier; +} + +static void lexer_empty_token(cstate *state) +{ + lw_free(state -> lexer_token_string); + state -> lexer_token_string = NULL; +} + +void lexer(cstate *state) +{ + int c; + + lexer_skip_white(state); + + lexer_empty_token(state); + + c = lexer_curchar(state); + if (c == -1) + { + state -> lexer_token = token_eof; + return; + } + + if (c == '\n') + { + /* LF */ + lexer_nextchar(state); + state -> lexer_ignorechar = '\r'; + state -> lexer_token = token_eol; + return; + } + + if (c == '\r') + { + /* CR */ + lexer_nextchar(state); + state -> lexer_ignorechar = '\n'; + state -> lexer_token = token_eol; + return; + } + + if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80) + { + /* we have a word here; identifier, keyword, etc. */ + lexer_word(state); + return; + } + + /* return the character if all else fails */ + state -> lexer_token_string = lw_realloc(state -> lexer_token_string, 2); + state -> lexer_token_string[0] = c; + state -> lexer_token_string[1] = 0; + lexer_nextchar(state); + state -> lexer_token = token_char; + return; +}