Mercurial > hg > index.cgi
annotate lwbasic/attic/lexer.c @ 207:07e1fac76321
Added pragma to allow non case sensitive symbols
Added "nosymbolcase" and "symbolnocase" pragmas to cause symbols defined
while the pragma is in effect to be treated as case insensitive. Also
documented the new pragma.
author | William Astle <lost@l-w.ca> |
---|---|
date | Sat, 09 Jun 2012 15:47:22 -0600 |
parents | cca933d32298 |
children |
rev | line source |
---|---|
25 | 1 /* |
2 lexer.c | |
3 | |
4 Copyright © 2011 William Astle | |
5 | |
6 This file is part of LWTOOLS. | |
7 | |
8 LWTOOLS is free software: you can redistribute it and/or modify it under the | |
9 terms of the GNU General Public License as published by the Free Software | |
10 Foundation, either version 3 of the License, or (at your option) any later | |
11 version. | |
12 | |
13 This program is distributed in the hope that it will be useful, but WITHOUT | |
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | |
16 more details. | |
17 | |
18 You should have received a copy of the GNU General Public License along with | |
19 this program. If not, see <http://www.gnu.org/licenses/>. | |
20 */ | |
21 | |
22 /* | |
23 This handles the gritty details of parsing tokens | |
24 */ | |
25 | |
26 #include <stdlib.h> | |
27 #include <stdio.h> | |
28 #include <string.h> | |
29 | |
30 #include <lw_alloc.h> | |
31 #include <lw_string.h> | |
32 | |
33 #define __lexer_c_seen__ | |
34 #include "lwbasic.h" | |
35 | |
36 /* | |
37 A token identifier is stored in state->lexer_token by lexer(). The actual string value | |
38 is found in state->lexer_token_string; if the token has an integer value, | |
39 it will be found in state->lexer_token_number in the appropriate "value" | |
40 slot. | |
41 */ | |
42 | |
/*
One row of a keyword lookup table: the spelling of a word paired with the
token value reported when that word is matched.  Tables built from this
type end with a row whose string is NULL.
*/
struct token_list {
	char *string;	/* spelling to compare against; NULL ends a table */
	int token;	/* token value stored in state->lexer_token on a match */
};
48 | |
35 | 49 /* keywords that appear as part of normal expressions */ |
25 | 50 static struct token_list lexer_global_tokens[] = |
51 { | |
52 { "function", token_kw_function }, | |
53 { "sub", token_kw_sub }, | |
54 { "public", token_kw_public }, | |
55 { "private", token_kw_private }, | |
56 { "as", token_kw_as }, | |
57 { "params", token_kw_params }, | |
58 { "returns", token_kw_returns }, | |
26
26aa76da75ad
Additional parsing in function/sub; emission of prolog/epilog code
lost@l-w.ca
parents:
25
diff
changeset
|
59 { "integer", token_kw_integer }, |
26aa76da75ad
Additional parsing in function/sub; emission of prolog/epilog code
lost@l-w.ca
parents:
25
diff
changeset
|
60 { "endsub", token_kw_endsub }, |
26aa76da75ad
Additional parsing in function/sub; emission of prolog/epilog code
lost@l-w.ca
parents:
25
diff
changeset
|
61 { "endfunction", token_kw_endfunction }, |
33 | 62 { "dim", token_kw_dim }, |
35 | 63 { NULL } |
64 }; | |
65 | |
66 /* contains "built in" function names */ | |
67 static struct token_list lexer_expr_tokens[] = | |
68 { | |
69 { "and", token_op_and }, | |
70 { "or", token_op_or }, | |
71 { "band", token_op_band }, | |
72 { "bor", token_op_bor }, | |
73 { "bxor", token_op_bxor }, | |
74 { "xor", token_op_xor }, | |
36 | 75 { "not", token_op_not }, |
76 { "bnot", token_op_bnot }, | |
25 | 77 { NULL } |
78 }; | |
79 | |
/*
Printable names for tokens, indexed by token value (see lexer_token_name()).
The order of the entries is significant and must not be changed.
*/
static char *lexer_token_names[] =
{
	/* keywords */
	"SUB", "FUNCTION", "AS", "PUBLIC", "PRIVATE", "PARAMS",
	"RETURNS", "INTEGER", "ENDSUB", "ENDFUNCTION", "DIM",

	/* assignment and comparisons */
	"<assignment>", "<equality>", "<greater>", "<less>",
	"<greaterequal>", "<lessequal>", "<notequal>",

	/* logical and bitwise operators */
	"<and>", "<or>", "<xor>",
	"<bitwiseand>", "<bitwiseor>", "<bitwisexor>",

	/* arithmetic */
	"<plus>", "<minus>", "<times>", "<divide>", "<modulus>",

	/* grouping and unary operators */
	"<openparen>", "<closeparen>", "<not>", "<bitwisenot>",

	/* everything else */
	"<identifier>", "<char>", "<uint>", "<int>", "<eol>", "<eof>"
};
122 | |
123 char *lexer_token_name(int token) | |
124 { | |
125 if (token > token_eol) | |
126 return "???"; | |
127 return lexer_token_names[token]; | |
128 } | |
129 | |
25 | 130 static int lexer_getchar(cstate *state) |
131 { | |
132 int c; | |
133 c = input_getchar(state); | |
134 if (c == -2) | |
135 { | |
136 lwb_error("Error reading input stream."); | |
137 } | |
138 return c; | |
139 } | |
140 | |
141 static void lexer_nextchar(cstate *state) | |
142 { | |
143 state -> lexer_curchar = lexer_getchar(state); | |
144 if (state -> lexer_curchar == state -> lexer_ignorechar) | |
145 state -> lexer_curchar = lexer_getchar(state); | |
146 state -> lexer_ignorechar = 0; | |
147 } | |
148 | |
149 static int lexer_curchar(cstate *state) | |
150 { | |
151 if (state -> lexer_curchar == -1) | |
152 { | |
153 lexer_nextchar(state); | |
154 } | |
155 | |
156 return state -> lexer_curchar; | |
157 } | |
158 | |
159 static void lexer_skip_white(cstate *state) | |
160 { | |
161 int c; | |
162 | |
163 for (;;) | |
164 { | |
165 c = lexer_curchar(state); | |
166 if (!(c == 0 || c == ' ' || c == '\t')) | |
167 return; | |
168 lexer_nextchar(state); | |
169 } | |
170 } | |
171 | |
172 /* must not be called unless the word will be non-zero length */ | |
173 static void lexer_word(cstate *state) | |
174 { | |
175 int wordlen = 0; | |
176 int wordpos = 0; | |
177 char *word = NULL; | |
178 int c; | |
179 struct token_list *tok = NULL; | |
180 | |
181 for (;;) { | |
182 c = lexer_curchar(state); | |
26
26aa76da75ad
Additional parsing in function/sub; emission of prolog/epilog code
lost@l-w.ca
parents:
25
diff
changeset
|
183 if (c == '_' || (c >= '0' && c <= '9' ) || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80) |
25 | 184 { |
185 /* character is part of word */ | |
186 if (wordpos >= wordlen) | |
187 { | |
188 word = lw_realloc(word, wordlen + 32); | |
189 wordlen += 32; | |
190 } | |
191 word[wordpos++] = c; | |
192 } | |
193 else | |
194 break; | |
195 | |
196 lexer_nextchar(state); | |
197 } | |
198 | |
199 word[wordpos] = 0; | |
200 lw_free(state -> lexer_token_string); | |
201 state -> lexer_token_string = lw_strdup(word); | |
202 | |
203 switch (state -> parser_state) | |
204 { | |
205 default: | |
206 tok = lexer_global_tokens; | |
207 } | |
208 | |
35 | 209 if (state -> expression) |
210 { | |
211 tok = lexer_expr_tokens; | |
212 } | |
213 | |
25 | 214 /* check for tokens if appropriate */ |
215 /* force uppercase */ | |
216 if (tok) | |
217 { | |
218 for (c = 0; word[c]; c++) | |
219 if (word[c] >= 'A' && word[c] <= 'Z') | |
220 word[c] = word[c] + 0x20; | |
221 | |
222 while (tok -> string) | |
223 { | |
224 if (strcmp(tok -> string, word) == 0) | |
225 break; | |
226 tok++; | |
227 } | |
228 } | |
229 | |
230 lw_free(word); | |
231 if (tok && tok -> string) | |
232 state -> lexer_token = tok -> token; | |
233 else | |
234 state -> lexer_token = token_identifier; | |
235 } | |
236 | |
35 | 237 static void lexer_parse_number(cstate *state, int neg) |
238 { | |
239 unsigned long tint = 0; | |
240 int c; | |
241 | |
242 for (;;) | |
243 { | |
244 c = lexer_curchar(state); | |
245 if (c >= '0' && c <= '9') | |
246 { | |
247 tint *= 10 + (c - '0'); | |
248 } | |
249 else | |
250 { | |
251 /* end of the number here */ | |
252 if (neg) | |
253 { | |
254 if (tint > 0x80000000) | |
255 lwb_error("Integer overflow\n"); | |
256 state -> lexer_token_number.integer = -tint; | |
257 state -> lexer_token = token_int; | |
258 } | |
259 else | |
260 { | |
261 state -> lexer_token = token_uint; | |
262 state -> lexer_token_number.uinteger = tint; | |
263 } | |
264 return; | |
265 } | |
266 lexer_nextchar(state); | |
267 } | |
268 } | |
269 | |
25 | 270 static void lexer_empty_token(cstate *state) |
271 { | |
272 lw_free(state -> lexer_token_string); | |
273 state -> lexer_token_string = NULL; | |
274 } | |
275 | |
276 void lexer(cstate *state) | |
277 { | |
278 int c; | |
279 | |
280 lexer_skip_white(state); | |
281 | |
282 lexer_empty_token(state); | |
283 | |
284 c = lexer_curchar(state); | |
285 if (c == -1) | |
286 { | |
287 state -> lexer_token = token_eof; | |
288 return; | |
289 } | |
290 | |
291 if (c == '\n') | |
292 { | |
293 /* LF */ | |
294 lexer_nextchar(state); | |
295 state -> lexer_ignorechar = '\r'; | |
296 state -> lexer_token = token_eol; | |
297 return; | |
298 } | |
299 | |
300 if (c == '\r') | |
301 { | |
302 /* CR */ | |
303 lexer_nextchar(state); | |
304 state -> lexer_ignorechar = '\n'; | |
305 state -> lexer_token = token_eol; | |
306 return; | |
307 } | |
308 | |
309 if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80) | |
310 { | |
311 /* we have a word here; identifier, keyword, etc. */ | |
312 lexer_word(state); | |
313 return; | |
314 } | |
35 | 315 |
316 if (state -> expression && c >= '0' && c <= '9') | |
317 { | |
318 /* we have a number */ | |
319 lexer_parse_number(state, 0); | |
320 return; | |
321 } | |
322 | |
323 lexer_nextchar(state); | |
324 if (state -> expression) | |
325 { | |
326 if (c == '-' && lexer_curchar(state) >= '0' && lexer_curchar(state) <= '9') | |
327 { | |
328 /* we have a negative number here */ | |
329 lexer_parse_number(state, 1); | |
330 return; | |
331 } | |
332 if (c == '=') | |
333 { | |
334 state -> lexer_token = token_op_equality; | |
335 return; | |
336 } | |
337 if (c == '<') | |
338 { | |
339 if (lexer_curchar(state) == '=') | |
340 { | |
341 lexer_nextchar(state); | |
342 state -> lexer_token = token_op_lessequal; | |
343 return; | |
344 } | |
345 if (lexer_curchar(state) == '>') | |
346 { | |
347 lexer_nextchar(state); | |
348 state -> lexer_token = token_op_notequal; | |
349 return; | |
350 } | |
351 state -> lexer_token = token_op_less; | |
352 return; | |
353 } | |
354 if (c == '>') | |
355 { | |
356 if (lexer_curchar(state) == '>') | |
357 { | |
358 lexer_nextchar(state); | |
359 state -> lexer_token = token_op_greaterequal; | |
360 return; | |
361 } | |
362 if (lexer_curchar(state) == '<') | |
363 { | |
364 state -> lexer_token = token_op_notequal; | |
365 lexer_nextchar(state); | |
366 return; | |
367 } | |
368 state -> lexer_token = token_op_greater; | |
369 return; | |
370 } | |
371 switch(c) | |
372 { | |
373 case '+': | |
374 state -> lexer_token = token_op_plus; | |
375 return; | |
376 | |
377 case '-': | |
378 state -> lexer_token = token_op_minus; | |
379 return; | |
380 | |
381 case '/': | |
382 state -> lexer_token = token_op_divide; | |
383 return; | |
384 | |
385 case '*': | |
386 state -> lexer_token = token_op_times; | |
387 return; | |
388 | |
389 case '%': | |
390 state -> lexer_token = token_op_modulus; | |
391 return; | |
392 | |
36 | 393 case '(': |
394 state -> lexer_token = token_op_oparen; | |
395 return; | |
396 | |
397 case ')': | |
398 state -> lexer_token = token_op_cparen; | |
399 return; | |
35 | 400 |
401 } | |
402 } | |
403 else | |
404 { | |
405 if (c == '=') | |
406 { | |
407 state -> lexer_token = token_op_assignment; | |
408 return; | |
409 } | |
410 } | |
25 | 411 |
412 /* return the character if all else fails */ | |
35 | 413 state -> lexer_token = token_char; |
25 | 414 state -> lexer_token_string = lw_realloc(state -> lexer_token_string, 2); |
415 state -> lexer_token_string[0] = c; | |
416 state -> lexer_token_string[1] = 0; | |
417 return; | |
418 } | |
31
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
419 |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
420 char *lexer_return_token(cstate *state) |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
421 { |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
422 static char *buffer = NULL; |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
423 static int buflen = 0; |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
424 int l; |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
425 |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
426 if (buflen == 0) |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
427 { |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
428 buffer = lw_alloc(128); |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
429 buflen = 128; |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
430 } |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
431 |
34 | 432 l = snprintf(buffer, buflen, "%s (%s)", state -> lexer_token_string, lexer_token_name(state -> lexer_token)); |
31
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
433 if (l >= buflen) |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
434 { |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
435 buffer = lw_realloc(buffer, l + 1); |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
436 buflen = l + 1; |
34 | 437 snprintf(buffer, buflen, "%s (%s)", state -> lexer_token_string, lexer_token_name(state -> lexer_token)); |
31
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
438 } |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
439 return buffer; |
574931d87abd
Created a function to prettyprint the current lexer token
lost@l-w.ca
parents:
26
diff
changeset
|
440 } |