Mercurial > hg > index.cgi
annotate lwbasic/lexer.c @ 35:cdb0175e1063
More work on expressions
author | Lost Wizard (lost@starbug3) |
---|---|
date | Sat, 05 Feb 2011 14:22:54 -0700 |
parents | bfea77812e64 |
children | 5325b640424d |
rev | line source |
---|---|
25 | 1 /* |
2 lexer.c | |
3 | |
4 Copyright © 2011 William Astle | |
5 | |
6 This file is part of LWTOOLS. | |
7 | |
8 LWTOOLS is free software: you can redistribute it and/or modify it under the | |
9 terms of the GNU General Public License as published by the Free Software | |
10 Foundation, either version 3 of the License, or (at your option) any later | |
11 version. | |
12 | |
13 This program is distributed in the hope that it will be useful, but WITHOUT | |
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | |
16 more details. | |
17 | |
18 You should have received a copy of the GNU General Public License along with | |
19 this program. If not, see <http://www.gnu.org/licenses/>. | |
20 */ | |
21 | |
22 /* | |
23 This handles the gritty details of parsing tokens | |
24 */ | |
25 | |
26 #include <stdlib.h> | |
27 #include <stdio.h> | |
28 #include <string.h> | |
29 | |
30 #include <lw_alloc.h> | |
31 #include <lw_string.h> | |
32 | |
33 #define __lexer_c_seen__ | |
34 #include "lwbasic.h" | |
35 | |
36 /* | |
37 A token identifier is stored in state->lexer_token by lexer(). The actual | |
38 string value is found in state->lexer_token_string; if the token has an integer | |
39 value, it will be found in state->lexer_token_number in the appropriate "value" | |
40 slot. | |
41 */ | |
42 | |
/*
One entry of a keyword lookup table: the spelling of a word and the token
code that the lexer reports for it. Tables built from this structure are
terminated by an entry whose string member is NULL.
*/
struct token_list {
	char *string;	/* keyword spelling, compared against the lower-cased word */
	int token;	/* token code stored in state->lexer_token on a match */
};
48 | |
35 | 49 /* keywords that appear as part of normal expressions */ |
25 | 50 static struct token_list lexer_global_tokens[] = |
51 { | |
52 { "function", token_kw_function }, | |
53 { "sub", token_kw_sub }, | |
54 { "public", token_kw_public }, | |
55 { "private", token_kw_private }, | |
56 { "as", token_kw_as }, | |
57 { "params", token_kw_params }, | |
58 { "returns", token_kw_returns }, | |
26
26aa76da75ad
Additional parsing in function/sub; emission of prolog/epilog code
lost@l-w.ca
parents:
25
diff
changeset
|
59 { "integer", token_kw_integer }, |
26aa76da75ad
Additional parsing in function/sub; emission of prolog/epilog code
lost@l-w.ca
parents:
25
diff
changeset
|
60 { "endsub", token_kw_endsub }, |
26aa76da75ad
Additional parsing in function/sub; emission of prolog/epilog code
lost@l-w.ca
parents:
25
diff
changeset
|
61 { "endfunction", token_kw_endfunction }, |
33 | 62 { "dim", token_kw_dim }, |
35 | 63 { NULL } |
64 }; | |
65 | |
66 /* contains "built in" function names */ | |
67 static struct token_list lexer_expr_tokens[] = | |
68 { | |
69 { "and", token_op_and }, | |
70 { "or", token_op_or }, | |
71 { "band", token_op_band }, | |
72 { "bor", token_op_bor }, | |
73 { "bxor", token_op_bxor }, | |
74 { "xor", token_op_xor }, | |
25 | 75 { NULL } |
76 }; | |
77 | |
/*
Printable names for token codes, indexed by token number.

NOTE(review): the order is assumed to mirror the token enumeration in
lwbasic.h, which is not visible from this file -- confirm when adding or
reordering tokens.
*/
static char *lexer_token_names[] =
{
	"SUB",
	"FUNCTION",
	"AS",
	"PUBLIC",
	"PRIVATE",
	"PARAMS",
	"RETURNS",
	"INTEGER",
	"ENDSUB",
	"ENDFUNCTION",
	"DIM",
	"<assignment>",
	"<equality>",
	"<greater>",
	"<less>",
	"<greaterequal>",
	"<lessequal>",
	"<notequal>",
	"<and>",
	"<or>",
	"<xor>",
	"<bitwiseand>",
	"<bitwiseor>",
	"<bitwisexor>",
	"<plus>",
	"<minus>",
	"<times>",
	"<divide>",
	"<modulus>",
	"<identifier>",
	"<char>",
	"<uint>",
	"<int>",
	"<eol>",
	"<eof>"
};

/*
Return a printable name for a token code.

token: index into lexer_token_names[].

Returns the table entry for token, or "???" when token is negative or lies
past the last entry. The limit is derived from the table itself so that
the final entry ("<eof>") is reachable and the check cannot drift out of
step with the table contents.
*/
char *lexer_token_name(int token)
{
	int count = (int)(sizeof(lexer_token_names) / sizeof(lexer_token_names[0]));

	if (token < 0 || token >= count)
		return "???";
	return lexer_token_names[token];
}
123 | |
25 | 124 static int lexer_getchar(cstate *state) |
125 { | |
126 int c; | |
127 c = input_getchar(state); | |
128 if (c == -2) | |
129 { | |
130 lwb_error("Error reading input stream."); | |
131 } | |
132 return c; | |
133 } | |
134 | |
135 static void lexer_nextchar(cstate *state) | |
136 { | |
137 state -> lexer_curchar = lexer_getchar(state); | |
138 if (state -> lexer_curchar == state -> lexer_ignorechar) | |
139 state -> lexer_curchar = lexer_getchar(state); | |
140 state -> lexer_ignorechar = 0; | |
141 } | |
142 | |
143 static int lexer_curchar(cstate *state) | |
144 { | |
145 if (state -> lexer_curchar == -1) | |
146 { | |
147 lexer_nextchar(state); | |
148 } | |
149 | |
150 return state -> lexer_curchar; | |
151 } | |
152 | |
153 static void lexer_skip_white(cstate *state) | |
154 { | |
155 int c; | |
156 | |
157 for (;;) | |
158 { | |
159 c = lexer_curchar(state); | |
160 if (!(c == 0 || c == ' ' || c == '\t')) | |
161 return; | |
162 lexer_nextchar(state); | |
163 } | |
164 } | |
165 | |
166 /* must not be called unless the word will be non-zero length */ | |
167 static void lexer_word(cstate *state) | |
168 { | |
169 int wordlen = 0; | |
170 int wordpos = 0; | |
171 char *word = NULL; | |
172 int c; | |
173 struct token_list *tok = NULL; | |
174 | |
175 for (;;) { | |
176 c = lexer_curchar(state); | |
26
26aa76da75ad
Additional parsing in function/sub; emission of prolog/epilog code
lost@l-w.ca
parents:
25
diff
changeset
|
177 if (c == '_' || (c >= '0' && c <= '9' ) || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80) |
25 | 178 { |
179 /* character is part of word */ | |
180 if (wordpos >= wordlen) | |
181 { | |
182 word = lw_realloc(word, wordlen + 32); | |
183 wordlen += 32; | |
184 } | |
185 word[wordpos++] = c; | |
186 } | |
187 else | |
188 break; | |
189 | |
190 lexer_nextchar(state); | |
191 } | |
192 | |
193 word[wordpos] = 0; | |
194 lw_free(state -> lexer_token_string); | |
195 state -> lexer_token_string = lw_strdup(word); | |
196 | |
197 switch (state -> parser_state) | |
198 { | |
199 default: | |
200 tok = lexer_global_tokens; | |
201 } | |
202 | |
35 | 203 if (state -> expression) |
204 { | |
205 tok = lexer_expr_tokens; | |
206 } | |
207 | |
25 | 208 /* check for tokens if appropriate */ |
209 /* force uppercase */ | |
210 if (tok) | |
211 { | |
212 for (c = 0; word[c]; c++) | |
213 if (word[c] >= 'A' && word[c] <= 'Z') | |
214 word[c] = word[c] + 0x20; | |
215 | |
216 while (tok -> string) | |
217 { | |
218 if (strcmp(tok -> string, word) == 0) | |
219 break; | |
220 tok++; | |
221 } | |
222 } | |
223 | |
224 lw_free(word); | |
225 if (tok && tok -> string) | |
226 state -> lexer_token = tok -> token; | |
227 else | |
228 state -> lexer_token = token_identifier; | |
229 } | |
230 | |
35 | 231 static void lexer_parse_number(cstate *state, int neg) |
232 { | |
233 unsigned long tint = 0; | |
234 int c; | |
235 | |
236 for (;;) | |
237 { | |
238 c = lexer_curchar(state); | |
239 if (c >= '0' && c <= '9') | |
240 { | |
241 tint *= 10 + (c - '0'); | |
242 } | |
243 else | |
244 { | |
245 /* end of the number here */ | |
246 if (neg) | |
247 { | |
248 if (tint > 0x80000000) | |
249 lwb_error("Integer overflow\n"); | |
250 state -> lexer_token_number.integer = -tint; | |
251 state -> lexer_token = token_int; | |
252 } | |
253 else | |
254 { | |
255 state -> lexer_token = token_uint; | |
256 state -> lexer_token_number.uinteger = tint; | |
257 } | |
258 return; | |
259 } | |
260 lexer_nextchar(state); | |
261 } | |
262 } | |
263 | |
25 | 264 static void lexer_empty_token(cstate *state) |
265 { | |
266 lw_free(state -> lexer_token_string); | |
267 state -> lexer_token_string = NULL; | |
268 } | |
269 | |
270 void lexer(cstate *state) | |
271 { | |
272 int c; | |
273 | |
274 lexer_skip_white(state); | |
275 | |
276 lexer_empty_token(state); | |
277 | |
278 c = lexer_curchar(state); | |
279 if (c == -1) | |
280 { | |
281 state -> lexer_token = token_eof; | |
282 return; | |
283 } | |
284 | |
285 if (c == '\n') | |
286 { | |
287 /* LF */ | |
288 lexer_nextchar(state); | |
289 state -> lexer_ignorechar = '\r'; | |
290 state -> lexer_token = token_eol; | |
291 return; | |
292 } | |
293 | |
294 if (c == '\r') | |
295 { | |
296 /* CR */ | |
297 lexer_nextchar(state); | |
298 state -> lexer_ignorechar = '\n'; | |
299 state -> lexer_token = token_eol; | |
300 return; | |
301 } | |
302 | |
303 if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80) | |
304 { | |
305 /* we have a word here; identifier, keyword, etc. */ | |
306 lexer_word(state); | |
307 return; | |
308 } | |
35 | 309 |
310 if (state -> expression && c >= '0' && c <= '9') | |
311 { | |
312 /* we have a number */ | |
313 lexer_parse_number(state, 0); | |
314 return; | |
315 } | |
316 | |
317 lexer_nextchar(state); | |
318 if (state -> expression) | |
319 { | |
320 if (c == '-' && lexer_curchar(state) >= '0' && lexer_curchar(state) <= '9') | |
321 { | |
322 /* we have a negative number here */ | |
323 lexer_parse_number(state, 1); | |
324 return; | |
325 } | |
326 if (c == '=') | |
327 { | |
328 state -> lexer_token = token_op_equality; | |
329 return; | |
330 } | |
331 if (c == '<') | |
332 { | |
333 if (lexer_curchar(state) == '=') | |
334 { | |
335 lexer_nextchar(state); | |
336 state -> lexer_token = token_op_lessequal; | |
337 return; | |
338 } | |
339 if (lexer_curchar(state) == '>') | |
340 { | |
341 lexer_nextchar(state); | |
342 state -> lexer_token = token_op_notequal; | |
343 return; | |
344 } | |
345 state -> lexer_token = token_op_less; | |
346 return; | |
347 } | |
348 if (c == '>') | |
349 { | |
350 if (lexer_curchar(state) == '>') | |
351 { | |
352 lexer_nextchar(state); | |
353 state -> lexer_token = token_op_greaterequal; | |
354 return; | |
355 } | |
356 if (lexer_curchar(state) == '<') | |
357 { | |
358 state -> lexer_token = token_op_notequal; | |
359 lexer_nextchar(state); | |
360 return; | |
361 } | |
362 state -> lexer_token = token_op_greater; | |
363 return; | |
364 } | |
365 switch(c) | |
366 { | |
367 case '+': | |
368 state -> lexer_token = token_op_plus; | |
369 return; | |
370 | |
371 case '-': | |
372 state -> lexer_token = token_op_minus; | |
373 return; | |
374 | |
375 case '/': | |
376 state -> lexer_token = token_op_divide; | |
377 return; | |
378 | |
379 case '*': | |
380 state -> lexer_token = token_op_times; | |
381 return; | |
382 | |
383 case '%': | |
384 state -> lexer_token = token_op_modulus; | |
385 return; | |
386 | |
387 | |
388 } | |
389 } | |
390 else | |
391 { | |
392 if (c == '=') | |
393 { | |
394 state -> lexer_token = token_op_assignment; | |
395 return; | |
396 } | |
397 } | |
25 | 398 |
399 /* return the character if all else fails */ | |
35 | 400 state -> lexer_token = token_char; |
25 | 401 state -> lexer_token_string = lw_realloc(state -> lexer_token_string, 2); |
402 state -> lexer_token_string[0] = c; | |
403 state -> lexer_token_string[1] = 0; | |
404 return; | |
405 } | |
31
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
406 |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
407 char *lexer_return_token(cstate *state) |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
408 { |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
409 static char *buffer = NULL; |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
410 static int buflen = 0; |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
411 int l; |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
412 |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
413 if (buflen == 0) |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
414 { |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
415 buffer = lw_alloc(128); |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
416 buflen = 128; |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
417 } |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
418 |
34 | 419 l = snprintf(buffer, buflen, "%s (%s)", state -> lexer_token_string, lexer_token_name(state -> lexer_token)); |
31
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
420 if (l >= buflen) |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
421 { |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
422 buffer = lw_realloc(buffer, l + 1); |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
423 buflen = l + 1; |
34 | 424 snprintf(buffer, buflen, "%s (%s)", state -> lexer_token_string, lexer_token_name(state -> lexer_token)); |
31
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
425 } |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
426 return buffer; |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
427 } |