comparison lwbasic/attic/lexer.c @ 185:cca933d32298

Clean up some mess in lwbasic directory
author lost@l-w.ca
date Thu, 22 Dec 2011 18:03:38 -0700
parents lwbasic/lexer.c@5325b640424d
children
comparison
equal deleted inserted replaced
184:6433cb024174 185:cca933d32298
1 /*
2 lexer.c
3
4 Copyright © 2011 William Astle
5
6 This file is part of LWTOOLS.
7
8 LWTOOLS is free software: you can redistribute it and/or modify it under the
9 terms of the GNU General Public License as published by the Free Software
10 Foundation, either version 3 of the License, or (at your option) any later
11 version.
12
13 This program is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 more details.
17
18 You should have received a copy of the GNU General Public License along with
19 this program. If not, see <http://www.gnu.org/licenses/>.
20 */
21
22 /*
23 This handles the gritty details of parsing tokens
24 */
25
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29
30 #include <lw_alloc.h>
31 #include <lw_string.h>
32
33 #define __lexer_c_seen__
34 #include "lwbasic.h"
35
36 /*
37 A token identifier is stored in state->lexer_token by lexer(). The actual
38 string value is found in state->lexer_token_string; if the token has an
39 integer value, it will be found in state->lexer_token_number in the
40 appropriate "value" slot.
41 */
42
/*
One entry of a keyword lookup table: the keyword's spelling and the token
value reported for it. Tables built from this struct are terminated by an
entry whose string is NULL.
*/
struct token_list
{
	const char *string;	/* keyword spelling; only ever read, never modified */
	int token;		/* token value to report when the spelling matches */
};
48
49 /* keywords that appear as part of normal expressions */
50 static struct token_list lexer_global_tokens[] =
51 {
52 { "function", token_kw_function },
53 { "sub", token_kw_sub },
54 { "public", token_kw_public },
55 { "private", token_kw_private },
56 { "as", token_kw_as },
57 { "params", token_kw_params },
58 { "returns", token_kw_returns },
59 { "integer", token_kw_integer },
60 { "endsub", token_kw_endsub },
61 { "endfunction", token_kw_endfunction },
62 { "dim", token_kw_dim },
63 { NULL }
64 };
65
66 /* contains "built in" function names */
67 static struct token_list lexer_expr_tokens[] =
68 {
69 { "and", token_op_and },
70 { "or", token_op_or },
71 { "band", token_op_band },
72 { "bor", token_op_bor },
73 { "bxor", token_op_bxor },
74 { "xor", token_op_xor },
75 { "not", token_op_not },
76 { "bnot", token_op_bnot },
77 { NULL }
78 };
79
/*
Printable names for tokens, indexed directly by token value.

NOTE(review): this assumes the token enumeration in lwbasic.h starts at 0
and lists its members in exactly this order - confirm when either changes.
*/
static char *lexer_token_names[] =
{
	"SUB",
	"FUNCTION",
	"AS",
	"PUBLIC",
	"PRIVATE",
	"PARAMS",
	"RETURNS",
	"INTEGER",
	"ENDSUB",
	"ENDFUNCTION",
	"DIM",
	"<assignment>",
	"<equality>",
	"<greater>",
	"<less>",
	"<greaterequal>",
	"<lessequal>",
	"<notequal>",
	"<and>",
	"<or>",
	"<xor>",
	"<bitwiseand>",
	"<bitwiseor>",
	"<bitwisexor>",
	"<plus>",
	"<minus>",
	"<times>",
	"<divide>",
	"<modulus>",
	"<openparen>",
	"<closeparen>",
	"<not>",
	"<bitwisenot>",
	"<identifier>",
	"<char>",
	"<uint>",
	"<int>",
	"<eol>",
	"<eof>"
};

/*
Return a printable name for a token value.

Any value that does not index an entry of lexer_token_names, including a
negative one, yields "???". The bound is taken from the table itself so
that every entry (the final "<eof>" included) is reachable and no value
can index outside the array.

The returned string is static storage and must not be modified or freed.
*/
char *lexer_token_name(int token)
{
	int count = (int)(sizeof(lexer_token_names) / sizeof(lexer_token_names[0]));

	if (token < 0 || token >= count)
		return "???";
	return lexer_token_names[token];
}
129
130 static int lexer_getchar(cstate *state)
131 {
132 int c;
133 c = input_getchar(state);
134 if (c == -2)
135 {
136 lwb_error("Error reading input stream.");
137 }
138 return c;
139 }
140
141 static void lexer_nextchar(cstate *state)
142 {
143 state -> lexer_curchar = lexer_getchar(state);
144 if (state -> lexer_curchar == state -> lexer_ignorechar)
145 state -> lexer_curchar = lexer_getchar(state);
146 state -> lexer_ignorechar = 0;
147 }
148
149 static int lexer_curchar(cstate *state)
150 {
151 if (state -> lexer_curchar == -1)
152 {
153 lexer_nextchar(state);
154 }
155
156 return state -> lexer_curchar;
157 }
158
159 static void lexer_skip_white(cstate *state)
160 {
161 int c;
162
163 for (;;)
164 {
165 c = lexer_curchar(state);
166 if (!(c == 0 || c == ' ' || c == '\t'))
167 return;
168 lexer_nextchar(state);
169 }
170 }
171
172 /* must not be called unless the word will be non-zero length */
173 static void lexer_word(cstate *state)
174 {
175 int wordlen = 0;
176 int wordpos = 0;
177 char *word = NULL;
178 int c;
179 struct token_list *tok = NULL;
180
181 for (;;) {
182 c = lexer_curchar(state);
183 if (c == '_' || (c >= '0' && c <= '9' ) || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80)
184 {
185 /* character is part of word */
186 if (wordpos >= wordlen)
187 {
188 word = lw_realloc(word, wordlen + 32);
189 wordlen += 32;
190 }
191 word[wordpos++] = c;
192 }
193 else
194 break;
195
196 lexer_nextchar(state);
197 }
198
199 word[wordpos] = 0;
200 lw_free(state -> lexer_token_string);
201 state -> lexer_token_string = lw_strdup(word);
202
203 switch (state -> parser_state)
204 {
205 default:
206 tok = lexer_global_tokens;
207 }
208
209 if (state -> expression)
210 {
211 tok = lexer_expr_tokens;
212 }
213
214 /* check for tokens if appropriate */
215 /* force uppercase */
216 if (tok)
217 {
218 for (c = 0; word[c]; c++)
219 if (word[c] >= 'A' && word[c] <= 'Z')
220 word[c] = word[c] + 0x20;
221
222 while (tok -> string)
223 {
224 if (strcmp(tok -> string, word) == 0)
225 break;
226 tok++;
227 }
228 }
229
230 lw_free(word);
231 if (tok && tok -> string)
232 state -> lexer_token = tok -> token;
233 else
234 state -> lexer_token = token_identifier;
235 }
236
237 static void lexer_parse_number(cstate *state, int neg)
238 {
239 unsigned long tint = 0;
240 int c;
241
242 for (;;)
243 {
244 c = lexer_curchar(state);
245 if (c >= '0' && c <= '9')
246 {
247 tint *= 10 + (c - '0');
248 }
249 else
250 {
251 /* end of the number here */
252 if (neg)
253 {
254 if (tint > 0x80000000)
255 lwb_error("Integer overflow\n");
256 state -> lexer_token_number.integer = -tint;
257 state -> lexer_token = token_int;
258 }
259 else
260 {
261 state -> lexer_token = token_uint;
262 state -> lexer_token_number.uinteger = tint;
263 }
264 return;
265 }
266 lexer_nextchar(state);
267 }
268 }
269
270 static void lexer_empty_token(cstate *state)
271 {
272 lw_free(state -> lexer_token_string);
273 state -> lexer_token_string = NULL;
274 }
275
276 void lexer(cstate *state)
277 {
278 int c;
279
280 lexer_skip_white(state);
281
282 lexer_empty_token(state);
283
284 c = lexer_curchar(state);
285 if (c == -1)
286 {
287 state -> lexer_token = token_eof;
288 return;
289 }
290
291 if (c == '\n')
292 {
293 /* LF */
294 lexer_nextchar(state);
295 state -> lexer_ignorechar = '\r';
296 state -> lexer_token = token_eol;
297 return;
298 }
299
300 if (c == '\r')
301 {
302 /* CR */
303 lexer_nextchar(state);
304 state -> lexer_ignorechar = '\n';
305 state -> lexer_token = token_eol;
306 return;
307 }
308
309 if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80)
310 {
311 /* we have a word here; identifier, keyword, etc. */
312 lexer_word(state);
313 return;
314 }
315
316 if (state -> expression && c >= '0' && c <= '9')
317 {
318 /* we have a number */
319 lexer_parse_number(state, 0);
320 return;
321 }
322
323 lexer_nextchar(state);
324 if (state -> expression)
325 {
326 if (c == '-' && lexer_curchar(state) >= '0' && lexer_curchar(state) <= '9')
327 {
328 /* we have a negative number here */
329 lexer_parse_number(state, 1);
330 return;
331 }
332 if (c == '=')
333 {
334 state -> lexer_token = token_op_equality;
335 return;
336 }
337 if (c == '<')
338 {
339 if (lexer_curchar(state) == '=')
340 {
341 lexer_nextchar(state);
342 state -> lexer_token = token_op_lessequal;
343 return;
344 }
345 if (lexer_curchar(state) == '>')
346 {
347 lexer_nextchar(state);
348 state -> lexer_token = token_op_notequal;
349 return;
350 }
351 state -> lexer_token = token_op_less;
352 return;
353 }
354 if (c == '>')
355 {
356 if (lexer_curchar(state) == '>')
357 {
358 lexer_nextchar(state);
359 state -> lexer_token = token_op_greaterequal;
360 return;
361 }
362 if (lexer_curchar(state) == '<')
363 {
364 state -> lexer_token = token_op_notequal;
365 lexer_nextchar(state);
366 return;
367 }
368 state -> lexer_token = token_op_greater;
369 return;
370 }
371 switch(c)
372 {
373 case '+':
374 state -> lexer_token = token_op_plus;
375 return;
376
377 case '-':
378 state -> lexer_token = token_op_minus;
379 return;
380
381 case '/':
382 state -> lexer_token = token_op_divide;
383 return;
384
385 case '*':
386 state -> lexer_token = token_op_times;
387 return;
388
389 case '%':
390 state -> lexer_token = token_op_modulus;
391 return;
392
393 case '(':
394 state -> lexer_token = token_op_oparen;
395 return;
396
397 case ')':
398 state -> lexer_token = token_op_cparen;
399 return;
400
401 }
402 }
403 else
404 {
405 if (c == '=')
406 {
407 state -> lexer_token = token_op_assignment;
408 return;
409 }
410 }
411
412 /* return the character if all else fails */
413 state -> lexer_token = token_char;
414 state -> lexer_token_string = lw_realloc(state -> lexer_token_string, 2);
415 state -> lexer_token_string[0] = c;
416 state -> lexer_token_string[1] = 0;
417 return;
418 }
419
420 char *lexer_return_token(cstate *state)
421 {
422 static char *buffer = NULL;
423 static int buflen = 0;
424 int l;
425
426 if (buflen == 0)
427 {
428 buffer = lw_alloc(128);
429 buflen = 128;
430 }
431
432 l = snprintf(buffer, buflen, "%s (%s)", state -> lexer_token_string, lexer_token_name(state -> lexer_token));
433 if (l >= buflen)
434 {
435 buffer = lw_realloc(buffer, l + 1);
436 buflen = l + 1;
437 snprintf(buffer, buflen, "%s (%s)", state -> lexer_token_string, lexer_token_name(state -> lexer_token));
438 }
439 return buffer;
440 }