Mercurial > hg > index.cgi
view lwbasic/lexer.c @ 30:bcd532a90e53
Renamed "compiler" to "parser" for more consistent terminology
author | lost@l-w.ca |
---|---|
date | Thu, 03 Feb 2011 21:19:11 -0700 |
parents | 26aa76da75ad |
children | 574931d87abd |
line wrap: on
line source
/* lexer.c Copyright © 2011 William Astle This file is part of LWTOOLS. LWTOOLS is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* This handles the gritty details of parsing tokens */ #include <stdlib.h> #include <stdio.h> #include <string.h> #include <lw_alloc.h> #include <lw_string.h> #define __lexer_c_seen__ #include "lwbasic.h" /* A token idenfier is returned by lexer(). The actual string value is found in state->lexer_lexer_token_string; if the token as an integer value, it will be found in state->lexer_token_number in the appropriate "value" slot. */ struct token_list { char *string; int token; }; static struct token_list lexer_global_tokens[] = { { "function", token_kw_function }, { "sub", token_kw_sub }, { "public", token_kw_public }, { "private", token_kw_private }, { "as", token_kw_as }, { "params", token_kw_params }, { "returns", token_kw_returns }, { "integer", token_kw_integer }, { "endsub", token_kw_endsub }, { "endfunction", token_kw_endfunction }, { NULL } }; static int lexer_getchar(cstate *state) { int c; c = input_getchar(state); if (c == -2) { lwb_error("Error reading input stream."); } return c; } static void lexer_nextchar(cstate *state) { state -> lexer_curchar = lexer_getchar(state); if (state -> lexer_curchar == state -> lexer_ignorechar) state -> lexer_curchar = lexer_getchar(state); state -> lexer_ignorechar = 0; } static int lexer_curchar(cstate *state) { if (state -> lexer_curchar == -1) { lexer_nextchar(state); } return state -> lexer_curchar; } static void lexer_skip_white(cstate *state) { int c; for (;;) { c = lexer_curchar(state); if (!(c == 0 || c == ' ' || c == '\t')) return; lexer_nextchar(state); } } /* must not be called unless the word will be non-zero length */ static void lexer_word(cstate *state) { int wordlen = 0; int wordpos = 0; char *word = NULL; int c; struct token_list *tok = NULL; for (;;) { c = lexer_curchar(state); if (c == '_' || (c >= '0' && c <= '9' ) || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80) { /* character is part of word */ if (wordpos >= wordlen) { word = lw_realloc(word, wordlen + 32); wordlen += 32; } word[wordpos++] = c; } else break; lexer_nextchar(state); } word[wordpos] = 0; lw_free(state -> lexer_token_string); state -> lexer_token_string = lw_strdup(word); switch (state -> parser_state) { default: tok = lexer_global_tokens; } /* check for tokens if appropriate */ /* force uppercase */ if (tok) { for (c = 0; word[c]; c++) if (word[c] >= 'A' && word[c] <= 'Z') word[c] = word[c] + 0x20; while (tok -> string) { if (strcmp(tok -> string, word) == 0) break; tok++; } } lw_free(word); if (tok && tok -> string) state -> lexer_token = tok -> token; else state -> lexer_token = token_identifier; } static void lexer_empty_token(cstate *state) { lw_free(state -> lexer_token_string); state -> lexer_token_string = NULL; } void lexer(cstate *state) { int c; lexer_skip_white(state); lexer_empty_token(state); c = lexer_curchar(state); if (c == -1) { state -> lexer_token = token_eof; return; } if (c == '\n') { /* LF */ lexer_nextchar(state); state -> lexer_ignorechar = '\r'; state -> lexer_token = token_eol; return; } if (c == '\r') { /* CR */ lexer_nextchar(state); state -> lexer_ignorechar = '\n'; state -> lexer_token = token_eol; return; } if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80) { /* we have a word here; identifier, keyword, etc. */ lexer_word(state); return; } /* return the character if all else fails */ state -> lexer_token_string = lw_realloc(state -> lexer_token_string, 2); state -> lexer_token_string[0] = c; state -> lexer_token_string[1] = 0; lexer_nextchar(state); state -> lexer_token = token_char; return; }