Mercurial > hg > index.cgi
annotate lwbasic/lexer.c @ 100:7ce01324e391
Added tag lwtools-4.2 for changeset 1cb23a4e1e7d
author | lost@l-w.ca |
---|---|
date | Sat, 06 Aug 2011 10:51:33 -0600 |
parents | 5325b640424d |
children |
rev | line source |
---|---|
25 | 1 /* |
2 lexer.c | |
3 | |
4 Copyright © 2011 William Astle | |
5 | |
6 This file is part of LWTOOLS. | |
7 | |
8 LWTOOLS is free software: you can redistribute it and/or modify it under the | |
9 terms of the GNU General Public License as published by the Free Software | |
10 Foundation, either version 3 of the License, or (at your option) any later | |
11 version. | |
12 | |
13 This program is distributed in the hope that it will be useful, but WITHOUT | |
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | |
16 more details. | |
17 | |
18 You should have received a copy of the GNU General Public License along with | |
19 this program. If not, see <http://www.gnu.org/licenses/>. | |
20 */ | |
21 | |
22 /* | |
23 This handles the gritty details of parsing tokens | |
24 */ | |
25 | |
26 #include <stdlib.h> | |
27 #include <stdio.h> | |
28 #include <string.h> | |
29 | |
30 #include <lw_alloc.h> | |
31 #include <lw_string.h> | |
32 | |
33 #define __lexer_c_seen__ | |
34 #include "lwbasic.h" | |
35 | |
36 /* | |
37 A token identifier is stored in state->lexer_token by lexer(). The actual | |
38 string value is found in state->lexer_token_string; if the token has an | |
39 integer value, it will be found in state->lexer_token_number in the | |
40 appropriate "value" slot. | |
41 */ | |
42 | |
/*
One entry of a keyword lookup table. The tables in this file are scanned
linearly and end at the first entry whose string is NULL.
*/
struct token_list
{
	char *string;	/* keyword spelling, compared against the lower-cased word */
	int token;	/* token identifier reported when the spelling matches */
};
48 | |
35 | 49 /* keywords that appear as part of normal expressions */ |
25 | 50 static struct token_list lexer_global_tokens[] = |
51 { | |
52 { "function", token_kw_function }, | |
53 { "sub", token_kw_sub }, | |
54 { "public", token_kw_public }, | |
55 { "private", token_kw_private }, | |
56 { "as", token_kw_as }, | |
57 { "params", token_kw_params }, | |
58 { "returns", token_kw_returns }, | |
26
26aa76da75ad
Additional parsing in function/sub; emission of prolog/epilog code
lost@l-w.ca
parents:
25
diff
changeset
|
59 { "integer", token_kw_integer }, |
26aa76da75ad
Additional parsing in function/sub; emission of prolog/epilog code
lost@l-w.ca
parents:
25
diff
changeset
|
60 { "endsub", token_kw_endsub }, |
26aa76da75ad
Additional parsing in function/sub; emission of prolog/epilog code
lost@l-w.ca
parents:
25
diff
changeset
|
61 { "endfunction", token_kw_endfunction }, |
33 | 62 { "dim", token_kw_dim }, |
35 | 63 { NULL } |
64 }; | |
65 | |
66 /* contains "built in" function names */ | |
67 static struct token_list lexer_expr_tokens[] = | |
68 { | |
69 { "and", token_op_and }, | |
70 { "or", token_op_or }, | |
71 { "band", token_op_band }, | |
72 { "bor", token_op_bor }, | |
73 { "bxor", token_op_bxor }, | |
74 { "xor", token_op_xor }, | |
36 | 75 { "not", token_op_not }, |
76 { "bnot", token_op_bnot }, | |
25 | 77 { NULL } |
78 }; | |
79 | |
/*
Printable names for tokens, indexed directly by token identifier (see
lexer_token_name()).

NOTE(review): the token identifiers are declared outside this file; the order
here has to match their numeric order - confirm when adding tokens.
*/
static char *lexer_token_names[] =
{
	/* keywords */
	"SUB",
	"FUNCTION",
	"AS",
	"PUBLIC",
	"PRIVATE",
	"PARAMS",
	"RETURNS",
	"INTEGER",
	"ENDSUB",
	"ENDFUNCTION",
	"DIM",

	/* assignment and comparison */
	"<assignment>",
	"<equality>",
	"<greater>",
	"<less>",
	"<greaterequal>",
	"<lessequal>",
	"<notequal>",

	/* logical and bitwise operators */
	"<and>",
	"<or>",
	"<xor>",
	"<bitwiseand>",
	"<bitwiseor>",
	"<bitwisexor>",

	/* arithmetic and grouping */
	"<plus>",
	"<minus>",
	"<times>",
	"<divide>",
	"<modulus>",
	"<openparen>",
	"<closeparen>",
	"<not>",
	"<bitwisenot>",

	/* everything else */
	"<identifier>",
	"<char>",
	"<uint>",
	"<int>",
	"<eol>",
	"<eof>"
};
122 | |
123 char *lexer_token_name(int token) | |
124 { | |
125 if (token > token_eol) | |
126 return "???"; | |
127 return lexer_token_names[token]; | |
128 } | |
129 | |
25 | 130 static int lexer_getchar(cstate *state) |
131 { | |
132 int c; | |
133 c = input_getchar(state); | |
134 if (c == -2) | |
135 { | |
136 lwb_error("Error reading input stream."); | |
137 } | |
138 return c; | |
139 } | |
140 | |
141 static void lexer_nextchar(cstate *state) | |
142 { | |
143 state -> lexer_curchar = lexer_getchar(state); | |
144 if (state -> lexer_curchar == state -> lexer_ignorechar) | |
145 state -> lexer_curchar = lexer_getchar(state); | |
146 state -> lexer_ignorechar = 0; | |
147 } | |
148 | |
149 static int lexer_curchar(cstate *state) | |
150 { | |
151 if (state -> lexer_curchar == -1) | |
152 { | |
153 lexer_nextchar(state); | |
154 } | |
155 | |
156 return state -> lexer_curchar; | |
157 } | |
158 | |
159 static void lexer_skip_white(cstate *state) | |
160 { | |
161 int c; | |
162 | |
163 for (;;) | |
164 { | |
165 c = lexer_curchar(state); | |
166 if (!(c == 0 || c == ' ' || c == '\t')) | |
167 return; | |
168 lexer_nextchar(state); | |
169 } | |
170 } | |
171 | |
172 /* must not be called unless the word will be non-zero length */ | |
173 static void lexer_word(cstate *state) | |
174 { | |
175 int wordlen = 0; | |
176 int wordpos = 0; | |
177 char *word = NULL; | |
178 int c; | |
179 struct token_list *tok = NULL; | |
180 | |
181 for (;;) { | |
182 c = lexer_curchar(state); | |
26
26aa76da75ad
Additional parsing in function/sub; emission of prolog/epilog code
lost@l-w.ca
parents:
25
diff
changeset
|
183 if (c == '_' || (c >= '0' && c <= '9' ) || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80) |
25 | 184 { |
185 /* character is part of word */ | |
186 if (wordpos >= wordlen) | |
187 { | |
188 word = lw_realloc(word, wordlen + 32); | |
189 wordlen += 32; | |
190 } | |
191 word[wordpos++] = c; | |
192 } | |
193 else | |
194 break; | |
195 | |
196 lexer_nextchar(state); | |
197 } | |
198 | |
199 word[wordpos] = 0; | |
200 lw_free(state -> lexer_token_string); | |
201 state -> lexer_token_string = lw_strdup(word); | |
202 | |
203 switch (state -> parser_state) | |
204 { | |
205 default: | |
206 tok = lexer_global_tokens; | |
207 } | |
208 | |
35 | 209 if (state -> expression) |
210 { | |
211 tok = lexer_expr_tokens; | |
212 } | |
213 | |
25 | 214 /* check for tokens if appropriate */ |
215 /* force uppercase */ | |
216 if (tok) | |
217 { | |
218 for (c = 0; word[c]; c++) | |
219 if (word[c] >= 'A' && word[c] <= 'Z') | |
220 word[c] = word[c] + 0x20; | |
221 | |
222 while (tok -> string) | |
223 { | |
224 if (strcmp(tok -> string, word) == 0) | |
225 break; | |
226 tok++; | |
227 } | |
228 } | |
229 | |
230 lw_free(word); | |
231 if (tok && tok -> string) | |
232 state -> lexer_token = tok -> token; | |
233 else | |
234 state -> lexer_token = token_identifier; | |
235 } | |
236 | |
35 | 237 static void lexer_parse_number(cstate *state, int neg) |
238 { | |
239 unsigned long tint = 0; | |
240 int c; | |
241 | |
242 for (;;) | |
243 { | |
244 c = lexer_curchar(state); | |
245 if (c >= '0' && c <= '9') | |
246 { | |
247 tint *= 10 + (c - '0'); | |
248 } | |
249 else | |
250 { | |
251 /* end of the number here */ | |
252 if (neg) | |
253 { | |
254 if (tint > 0x80000000) | |
255 lwb_error("Integer overflow\n"); | |
256 state -> lexer_token_number.integer = -tint; | |
257 state -> lexer_token = token_int; | |
258 } | |
259 else | |
260 { | |
261 state -> lexer_token = token_uint; | |
262 state -> lexer_token_number.uinteger = tint; | |
263 } | |
264 return; | |
265 } | |
266 lexer_nextchar(state); | |
267 } | |
268 } | |
269 | |
25 | 270 static void lexer_empty_token(cstate *state) |
271 { | |
272 lw_free(state -> lexer_token_string); | |
273 state -> lexer_token_string = NULL; | |
274 } | |
275 | |
276 void lexer(cstate *state) | |
277 { | |
278 int c; | |
279 | |
280 lexer_skip_white(state); | |
281 | |
282 lexer_empty_token(state); | |
283 | |
284 c = lexer_curchar(state); | |
285 if (c == -1) | |
286 { | |
287 state -> lexer_token = token_eof; | |
288 return; | |
289 } | |
290 | |
291 if (c == '\n') | |
292 { | |
293 /* LF */ | |
294 lexer_nextchar(state); | |
295 state -> lexer_ignorechar = '\r'; | |
296 state -> lexer_token = token_eol; | |
297 return; | |
298 } | |
299 | |
300 if (c == '\r') | |
301 { | |
302 /* CR */ | |
303 lexer_nextchar(state); | |
304 state -> lexer_ignorechar = '\n'; | |
305 state -> lexer_token = token_eol; | |
306 return; | |
307 } | |
308 | |
309 if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80) | |
310 { | |
311 /* we have a word here; identifier, keyword, etc. */ | |
312 lexer_word(state); | |
313 return; | |
314 } | |
35 | 315 |
316 if (state -> expression && c >= '0' && c <= '9') | |
317 { | |
318 /* we have a number */ | |
319 lexer_parse_number(state, 0); | |
320 return; | |
321 } | |
322 | |
323 lexer_nextchar(state); | |
324 if (state -> expression) | |
325 { | |
326 if (c == '-' && lexer_curchar(state) >= '0' && lexer_curchar(state) <= '9') | |
327 { | |
328 /* we have a negative number here */ | |
329 lexer_parse_number(state, 1); | |
330 return; | |
331 } | |
332 if (c == '=') | |
333 { | |
334 state -> lexer_token = token_op_equality; | |
335 return; | |
336 } | |
337 if (c == '<') | |
338 { | |
339 if (lexer_curchar(state) == '=') | |
340 { | |
341 lexer_nextchar(state); | |
342 state -> lexer_token = token_op_lessequal; | |
343 return; | |
344 } | |
345 if (lexer_curchar(state) == '>') | |
346 { | |
347 lexer_nextchar(state); | |
348 state -> lexer_token = token_op_notequal; | |
349 return; | |
350 } | |
351 state -> lexer_token = token_op_less; | |
352 return; | |
353 } | |
354 if (c == '>') | |
355 { | |
356 if (lexer_curchar(state) == '>') | |
357 { | |
358 lexer_nextchar(state); | |
359 state -> lexer_token = token_op_greaterequal; | |
360 return; | |
361 } | |
362 if (lexer_curchar(state) == '<') | |
363 { | |
364 state -> lexer_token = token_op_notequal; | |
365 lexer_nextchar(state); | |
366 return; | |
367 } | |
368 state -> lexer_token = token_op_greater; | |
369 return; | |
370 } | |
371 switch(c) | |
372 { | |
373 case '+': | |
374 state -> lexer_token = token_op_plus; | |
375 return; | |
376 | |
377 case '-': | |
378 state -> lexer_token = token_op_minus; | |
379 return; | |
380 | |
381 case '/': | |
382 state -> lexer_token = token_op_divide; | |
383 return; | |
384 | |
385 case '*': | |
386 state -> lexer_token = token_op_times; | |
387 return; | |
388 | |
389 case '%': | |
390 state -> lexer_token = token_op_modulus; | |
391 return; | |
392 | |
36 | 393 case '(': |
394 state -> lexer_token = token_op_oparen; | |
395 return; | |
396 | |
397 case ')': | |
398 state -> lexer_token = token_op_cparen; | |
399 return; | |
35 | 400 |
401 } | |
402 } | |
403 else | |
404 { | |
405 if (c == '=') | |
406 { | |
407 state -> lexer_token = token_op_assignment; | |
408 return; | |
409 } | |
410 } | |
25 | 411 |
412 /* return the character if all else fails */ | |
35 | 413 state -> lexer_token = token_char; |
25 | 414 state -> lexer_token_string = lw_realloc(state -> lexer_token_string, 2); |
415 state -> lexer_token_string[0] = c; | |
416 state -> lexer_token_string[1] = 0; | |
417 return; | |
418 } | |
31
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
419 |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
420 char *lexer_return_token(cstate *state) |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
421 { |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
422 static char *buffer = NULL; |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
423 static int buflen = 0; |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
424 int l; |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
425 |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
426 if (buflen == 0) |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
427 { |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
428 buffer = lw_alloc(128); |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
429 buflen = 128; |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
430 } |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
431 |
34 | 432 l = snprintf(buffer, buflen, "%s (%s)", state -> lexer_token_string, lexer_token_name(state -> lexer_token)); |
31
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
433 if (l >= buflen) |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
434 { |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
435 buffer = lw_realloc(buffer, l + 1); |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
436 buflen = l + 1; |
34 | 437 snprintf(buffer, buflen, "%s (%s)", state -> lexer_token_string, lexer_token_name(state -> lexer_token)); |
31
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
438 } |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
439 return buffer; |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
440 } |