comparison lwbasic/lexer.c @ 25:87590f43e76d

Started lwbasic parser; checkpoint
author lost@l-w.ca
date Mon, 24 Jan 2011 20:08:09 -0700
parents
children 26aa76da75ad
comparison
equal deleted inserted replaced
24:421d7ceb4d86 25:87590f43e76d
1 /*
2 lexer.c
3
4 Copyright © 2011 William Astle
5
6 This file is part of LWTOOLS.
7
8 LWTOOLS is free software: you can redistribute it and/or modify it under the
9 terms of the GNU General Public License as published by the Free Software
10 Foundation, either version 3 of the License, or (at your option) any later
11 version.
12
13 This program is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 more details.
17
18 You should have received a copy of the GNU General Public License along with
19 this program. If not, see <http://www.gnu.org/licenses/>.
20 */
21
22 /*
23 This handles the gritty details of parsing tokens
24 */
25
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29
30 #include <lw_alloc.h>
31 #include <lw_string.h>
32
33 #define __lexer_c_seen__
34 #include "lwbasic.h"
35
36 /*
37 A token identifier is stored in state->lexer_token by lexer(). The actual
38 string value is found in state->lexer_token_string; if the token has an integer value,
39 it will be found in state->lexer_token_number in the appropriate "value"
40 slot.
41 */
42
/*
One entry of a keyword table: the keyword spelling and the token value it
maps to. A table is terminated by an entry whose string is NULL.

The spelling is const-qualified because entries point at string literals,
which must never be written through.
*/
struct token_list
{
	const char *string;
	int token;
};
48
49 static struct token_list lexer_global_tokens[] =
50 {
51 { "function", token_kw_function },
52 { "sub", token_kw_sub },
53 { "public", token_kw_public },
54 { "private", token_kw_private },
55 { "as", token_kw_as },
56 { "params", token_kw_params },
57 { "returns", token_kw_returns },
58 { NULL }
59 };
60
61 static int lexer_getchar(cstate *state)
62 {
63 int c;
64 c = input_getchar(state);
65 if (c == -2)
66 {
67 lwb_error("Error reading input stream.");
68 }
69 return c;
70 }
71
72 static void lexer_nextchar(cstate *state)
73 {
74 state -> lexer_curchar = lexer_getchar(state);
75 if (state -> lexer_curchar == state -> lexer_ignorechar)
76 state -> lexer_curchar = lexer_getchar(state);
77 state -> lexer_ignorechar = 0;
78 }
79
80 static int lexer_curchar(cstate *state)
81 {
82 if (state -> lexer_curchar == -1)
83 {
84 lexer_nextchar(state);
85 }
86
87 return state -> lexer_curchar;
88 }
89
90 static void lexer_skip_white(cstate *state)
91 {
92 int c;
93
94 for (;;)
95 {
96 c = lexer_curchar(state);
97 if (!(c == 0 || c == ' ' || c == '\t'))
98 return;
99 lexer_nextchar(state);
100 }
101 }
102
103 /* must not be called unless the word will be non-zero length */
104 static void lexer_word(cstate *state)
105 {
106 int wordlen = 0;
107 int wordpos = 0;
108 char *word = NULL;
109 int c;
110 struct token_list *tok = NULL;
111
112 for (;;) {
113 c = lexer_curchar(state);
114 if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80)
115 {
116 /* character is part of word */
117 if (wordpos >= wordlen)
118 {
119 word = lw_realloc(word, wordlen + 32);
120 wordlen += 32;
121 }
122 word[wordpos++] = c;
123 }
124 else
125 break;
126
127 lexer_nextchar(state);
128 }
129
130 word[wordpos] = 0;
131 lw_free(state -> lexer_token_string);
132 state -> lexer_token_string = lw_strdup(word);
133
134 switch (state -> parser_state)
135 {
136 default:
137 tok = lexer_global_tokens;
138 }
139
140 /* check for tokens if appropriate */
141 /* force uppercase */
142 if (tok)
143 {
144 for (c = 0; word[c]; c++)
145 if (word[c] >= 'A' && word[c] <= 'Z')
146 word[c] = word[c] + 0x20;
147
148 while (tok -> string)
149 {
150 if (strcmp(tok -> string, word) == 0)
151 break;
152 tok++;
153 }
154 }
155
156 lw_free(word);
157 if (tok && tok -> string)
158 state -> lexer_token = tok -> token;
159 else
160 state -> lexer_token = token_identifier;
161 }
162
163 static void lexer_empty_token(cstate *state)
164 {
165 lw_free(state -> lexer_token_string);
166 state -> lexer_token_string = NULL;
167 }
168
169 void lexer(cstate *state)
170 {
171 int c;
172
173 lexer_skip_white(state);
174
175 lexer_empty_token(state);
176
177 c = lexer_curchar(state);
178 if (c == -1)
179 {
180 state -> lexer_token = token_eof;
181 return;
182 }
183
184 if (c == '\n')
185 {
186 /* LF */
187 lexer_nextchar(state);
188 state -> lexer_ignorechar = '\r';
189 state -> lexer_token = token_eol;
190 return;
191 }
192
193 if (c == '\r')
194 {
195 /* CR */
196 lexer_nextchar(state);
197 state -> lexer_ignorechar = '\n';
198 state -> lexer_token = token_eol;
199 return;
200 }
201
202 if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80)
203 {
204 /* we have a word here; identifier, keyword, etc. */
205 lexer_word(state);
206 return;
207 }
208
209 /* return the character if all else fails */
210 state -> lexer_token_string = lw_realloc(state -> lexer_token_string, 2);
211 state -> lexer_token_string[0] = c;
212 state -> lexer_token_string[1] = 0;
213 lexer_nextchar(state);
214 state -> lexer_token = token_char;
215 return;
216 }