comparison lwcc/lex.c @ 295:4b17780f2777 ccdev

Checkpoint lwcc development Changed tactics with the preprocessor. Instead of getting clever and trying to do things the "fast" way, instead, just tokenize the whole input and process it that way. Also, set up so the preprocessor and compiler can be integrated instead of having to have a specifically correct output for the preprocessed file. Also removed the subdirectories in the lwcc directory. It made things more complicated than they needed to be.
author William Astle <lost@l-w.ca>
date Thu, 12 Sep 2013 22:06:26 -0600
parents
children 83fcc1ed6ad6
comparison
equal deleted inserted replaced
294:048adfee2933 295:4b17780f2777
1 /*
2 lwcc/lex.c
3
4 Copyright © 2013 William Astle
5
6 This file is part of LWTOOLS.
7
8 LWTOOLS is free software: you can redistribute it and/or modify it under the
9 terms of the GNU General Public License as published by the Free Software
10 Foundation, either version 3 of the License, or (at your option) any later
11 version.
12
13 This program is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 more details.
17
18 You should have received a copy of the GNU General Public License along with
19 this program. If not, see <http://www.gnu.org/licenses/>.
20 */
21
22 #include <ctype.h>
23 #include <stdio.h>
24
25 #include <lw_alloc.h>
26
27 #include "cpp.h"
28 #include "strbuf.h"
29 #include "token.h"
30
31 /* fetch a raw input byte from the current file. Will return CPP_EOF if
32 EOF is encountered and CPP_EOL if an end of line sequence is encountered.
33 End of line is defined as either CR, CRLF, LF, or LFCR. CPP_EOL is
34 returned on the first CR or LF encountered. The complementary CR or LF
35 is munched, if present, when the *next* character is read. This always
36 operates on file_stack.
37
38 This function also accounts for line numbers in input files and also
39 character columns.
40 */
41 static int fetch_byte_ll(struct preproc_info *pp)
42 {
43 int c;
44
45 if (pp -> eolstate != 0)
46 {
47 pp -> lineno++;
48 pp -> column = 0;
49 }
50 c = getc(pp -> fp);
51 pp -> column++;
52 if (pp -> eolstate == 1)
53 {
54 // just saw CR, munch LF
55 if (c == 10)
56 c = getc(pp -> fp);
57 pp -> eolstate = 0;
58 }
59 else if (pp -> eolstate == 2)
60 {
61 // just saw LF, much CR
62 if (c == 13)
63 c = getc(pp -> fp);
64 pp -> eolstate = 0;
65 }
66
67 if (c == 10)
68 {
69 // we have LF - end of line, flag to munch CR
70 pp -> eolstate = 2;
71 c = CPP_EOL;
72 }
73 else if (c == 13)
74 {
75 // we have CR - end of line, flag to munch LF
76 pp -> eolstate = 1;
77 c = CPP_EOL;
78 }
79 else if (c == EOF)
80 {
81 c = CPP_EOF;
82 }
83 return c;
84 }
85
86 /* This function takes a sequence of bytes from the _ll function above
87 and does trigraph interpretation on it, but only if the global
88 trigraphs is nonzero. */
89 static int fetch_byte_tg(struct preproc_info *pp)
90 {
91 int c;
92
93 if (!pp -> trigraphs)
94 {
95 c = fetch_byte_ll(pp);
96 }
97 else
98 {
99 /* we have to do the trigraph shit here */
100 if (pp -> ra != CPP_NOUNG)
101 {
102 if (pp -> qseen > 0)
103 {
104 c = '?';
105 pp -> qseen -= 1;
106 return c;
107 }
108 else
109 {
110 c = pp -> ra;
111 pp -> ra = CPP_NOUNG;
112 return c;
113 }
114 }
115
116 c = fetch_byte_ll(pp);
117 while (c == '?')
118 {
119 pp -> qseen++;
120 c = fetch_byte_ll(pp);
121 }
122
123 if (pp -> qseen >= 2)
124 {
125 // we have a trigraph
126 switch (c)
127 {
128 case '=':
129 c = '#';
130 pp -> qseen -= 2;
131 break;
132
133 case '/':
134 c = '\\';
135 pp -> qseen -= 2;
136 break;
137
138 case '\'':
139 c = '^';
140 pp -> qseen -= 2;
141 break;
142
143 case '(':
144 c = '[';
145 pp -> qseen -= 2;
146 break;
147
148 case ')':
149 c = ']';
150 pp -> qseen -= 2;
151 break;
152
153 case '!':
154 c = '|';
155 pp -> qseen -= 2;
156 break;
157
158 case '<':
159 c = '{';
160 pp -> qseen -= 2;
161 break;
162
163 case '>':
164 c = '}';
165 pp -> qseen -= 2;
166 break;
167
168 case '-':
169 c = '~';
170 pp -> qseen -= 2;
171 break;
172 }
173 if (pp -> qseen > 0)
174 {
175 pp -> ra = c;
176 c = '?';
177 pp -> qseen--;
178 }
179 }
180 else if (pp -> qseen > 0)
181 {
182 pp -> ra = c;
183 c = '?';
184 pp -> qseen--;
185 }
186 }
187 return c;
188 }
189
190 /* This function puts a byte back onto the front of the input stream used
191 by fetch_byte(). Theoretically, an unlimited number of characters can
192 be unfetched. Line and column counting may be incorrect if unfetched
193 characters cross a token boundary. */
194 static void preproc_lex_unfetch_byte(struct preproc_info *pp, int c)
195 {
196 if (pp -> ungetbufl >= pp -> ungetbufs)
197 {
198 pp -> ungetbufs += 100;
199 pp -> ungetbuf = lw_realloc(pp -> ungetbuf, pp -> ungetbufs);
200 }
201 pp -> ungetbuf[pp -> ungetbufl++] = c;
202 }
203
204 /* This function retrieves a byte from the input stream. It performs
205 backslash-newline splicing on the returned bytes. Any character
206 retrieved from the unfetch buffer is presumed to have already passed
207 the backslash-newline filter. */
208 static int fetch_byte(struct preproc_info *pp)
209 {
210 int c;
211
212 if (pp -> ungetbufl > 0)
213 {
214 pp -> ungetbufl--;
215 c = pp -> ungetbuf[pp -> ungetbufl];
216 if (pp -> ungetbufl == 0)
217 {
218 lw_free(pp -> ungetbuf);
219 pp -> ungetbuf = NULL;
220 pp -> ungetbufs = 0;
221 }
222 return c;
223 }
224
225 again:
226 if (pp -> unget != CPP_NOUNG)
227 {
228 c = pp -> unget;
229 pp -> unget = CPP_NOUNG;
230 }
231 else
232 {
233 c = fetch_byte_tg(pp);
234 }
235 if (c == '\\')
236 {
237 int c2;
238 c2 = fetch_byte_tg(pp);
239 if (c2 == CPP_EOL)
240 goto again;
241 else
242 pp -> unget = c2;
243 }
244 return c;
245 }
246
247
248
249 /*
250 Lex a token off the current input file.
251
252 Returned tokens are as follows:
253
254 * all words starting with [a-zA-Z_] are returned as TOK_IDENT
255 * numbers are returned as their appropriate type
256 * all whitespace in a sequence, including comments, is returned as
257 a single instance of TOK_WSPACE
258 * TOK_EOL is returned in the case of the end of a line
259 * TOK_EOF is returned when the end of the file is reached
260 * If no TOK_EOL appears before TOK_EOF, a TOK_EOL will be synthesised
261 * Any symbolic operator, etc., recognized by C will be returned as such
262 a token
263 * TOK_HASH will be returned for a #
264 * trigraphs will be interpreted
265 * backslash-newline will be interpreted
266 * any instance of CR, LF, CRLF, or LFCR will be interpreted as TOK_EOL
267 */
268
269
270 static int preproc_lex_fetch_byte(struct preproc_info *pp)
271 {
272 int c;
273 c = fetch_byte(pp);
274 if (c == CPP_EOF && pp -> eolseen == 0)
275 {
276 preproc_throw_warning(pp, "No newline at end of file");
277 pp -> eolseen = 1;
278 return CPP_EOL;
279 }
280
281 if (c == CPP_EOL)
282 {
283 pp -> eolseen = 1;
284 return c;
285 }
286
287 pp -> eolseen = 0;
288
289 /* convert comments to a single space here */
290 if (c == '/')
291 {
292 int c2;
293 c2 = fetch_byte(pp);
294 if (c2 == '/')
295 {
296 /* single line comment */
297 c = ' ';
298 for (;;)
299 {
300 c2 = fetch_byte(pp);
301 if (c2 == CPP_EOF || c2 == CPP_EOL)
302 break;
303 }
304 preproc_lex_unfetch_byte(pp, c2);
305 }
306 else if (c2 == '*')
307 {
308 /* block comment */
309 c = ' ';
310 for (;;)
311 {
312 c2 = fetch_byte(pp);
313 if (c2 == CPP_EOL || c2 == CPP_EOF)
314 {
315 preproc_lex_unfetch_byte(pp, c);
316 break;
317 }
318 if (c2 == '*')
319 {
320 /* maybe end of comment */
321 c2 = preproc_lex_fetch_byte(pp);
322 if (c2 == '/')
323 break;
324 }
325 }
326 }
327 else
328 {
329 /* not a comment - restore lookahead character */
330 preproc_lex_unfetch_byte(pp, c2);
331 }
332 }
333 return c;
334 }
335
336 struct token *preproc_lex_next_token(struct preproc_info *pp)
337 {
338 int sline = pp -> lineno;
339 int scol = pp -> column;
340 char *strval = NULL;
341 int ttype = TOK_NONE;
342 int c, c2;
343 int cl;
344 struct strbuf *strbuf;
345 struct token *t;
346
347 c = preproc_lex_fetch_byte(pp);
348 if (c == CPP_EOF)
349 {
350 if (pp -> nlseen == 0)
351 {
352 c = CPP_EOL;
353 }
354 }
355
356 if (c == CPP_EOF)
357 {
358 ttype = TOK_EOF;
359 goto out;
360 }
361 if (c == CPP_EOL)
362 {
363 pp -> nlseen = 1;
364 ttype = TOK_EOL;
365 goto out;
366 }
367
368 pp -> nlseen = 0;
369 if (isspace(c))
370 {
371 while (isspace(c))
372 c = preproc_lex_fetch_byte(pp);
373 preproc_lex_unfetch_byte(pp, c);
374 ttype = TOK_WSPACE;
375 goto out;
376 }
377
378 switch (c)
379 {
380 case '?':
381 ttype = TOK_QMARK;
382 goto out;
383
384 case ':':
385 ttype = TOK_COLON;
386 goto out;
387
388 case ',':
389 ttype = TOK_COMMA;
390 goto out;
391
392 case '(':
393 ttype = TOK_OPAREN;
394 goto out;
395
396 case ')':
397 ttype = TOK_CPAREN;
398 goto out;
399
400 case '{':
401 ttype = TOK_OBRACE;
402 goto out;
403
404 case '}':
405 ttype = TOK_CBRACE;
406 goto out;
407
408 case '[':
409 ttype = TOK_OSQUARE;
410 goto out;
411
412 case ']':
413 ttype = TOK_CSQUARE;
414 goto out;
415
416 case '~':
417 ttype = TOK_COM;
418 goto out;
419
420 case ';':
421 ttype = TOK_EOS;
422 goto out;
423
424 /* and now for the possible multi character tokens */
425 case '#':
426 ttype = TOK_HASH;
427 c = preproc_lex_fetch_byte(pp);
428 if (c == '#')
429 ttype = TOK_DBLHASH;
430 else
431 preproc_lex_unfetch_byte(pp, c);
432 goto out;
433
434 case '^':
435 ttype = TOK_XOR;
436 c = preproc_lex_fetch_byte(pp);
437 if (c == '=')
438 ttype = TOK_XORASS;
439 else
440 preproc_lex_unfetch_byte(pp, c);
441 goto out;
442
443 case '!':
444 ttype = TOK_BNOT;
445 c = preproc_lex_fetch_byte(pp);
446 if (c == '=')
447 ttype = TOK_NE;
448 else
449 preproc_lex_unfetch_byte(pp, c);
450 goto out;
451
452 case '*':
453 ttype = TOK_STAR;
454 c = preproc_lex_fetch_byte(pp);
455 if (c == '=')
456 ttype = TOK_MULASS;
457 else
458 preproc_lex_unfetch_byte(pp, c);
459 goto out;
460
461 case '/':
462 ttype = TOK_DIV;
463 c = preproc_lex_fetch_byte(pp);
464 if (c == '=')
465 ttype = TOK_DIVASS;
466 else
467 preproc_lex_unfetch_byte(pp, c);
468 goto out;
469
470 case '=':
471 ttype = TOK_ASS;
472 c = preproc_lex_fetch_byte(pp);
473 if (c == '=')
474 ttype = TOK_EQ;
475 else
476 preproc_lex_unfetch_byte(pp, c);
477 goto out;
478
479 case '%':
480 ttype = TOK_MOD;
481 c = preproc_lex_fetch_byte(pp);
482 if (c == '=')
483 ttype = TOK_MODASS;
484 else
485 preproc_lex_unfetch_byte(pp, c);
486 goto out;
487
488 case '-':
489 ttype = TOK_SUB;
490 c = preproc_lex_fetch_byte(pp);
491 if (c == '=')
492 ttype = TOK_SUBASS;
493 else if (c == '-')
494 ttype = TOK_DBLSUB;
495 else if (c == '>')
496 ttype = TOK_ARROW;
497 else
498 preproc_lex_unfetch_byte(pp, c);
499 goto out;
500
501 case '+':
502 ttype = TOK_ADD;
503 c = preproc_lex_fetch_byte(pp);
504 if (c == '=')
505 ttype = TOK_ADDASS;
506 else if (c == '+')
507 ttype = TOK_DBLADD;
508 else
509 preproc_lex_unfetch_byte(pp, c);
510 goto out;
511
512
513 case '&':
514 ttype = TOK_BWAND;
515 c = preproc_lex_fetch_byte(pp);
516 if (c == '=')
517 ttype = TOK_BWANDASS;
518 else if (c == '&')
519 ttype = TOK_BAND;
520 else
521 preproc_lex_unfetch_byte(pp, c);
522 goto out;
523
524 case '|':
525 ttype = TOK_BWOR;
526 c = preproc_lex_fetch_byte(pp);
527 if (c == '=')
528 ttype = TOK_BWORASS;
529 else if (c == '|')
530 ttype = TOK_BOR;
531 else
532 preproc_lex_unfetch_byte(pp, c);
533 goto out;
534
535 case '<':
536 ttype = TOK_LT;
537 c = preproc_lex_fetch_byte(pp);
538 if (c == '=')
539 ttype = TOK_LE;
540 else if (c == '<')
541 {
542 ttype = TOK_LSH;
543 c = preproc_lex_fetch_byte(pp);
544 if (c == '=')
545 ttype = TOK_LSHASS;
546 else
547 preproc_lex_unfetch_byte(pp, c);
548 }
549 else
550 preproc_lex_unfetch_byte(pp, c);
551 goto out;
552
553
554 case '>':
555 ttype = TOK_GT;
556 c = preproc_lex_fetch_byte(pp);
557 if (c == '=')
558 ttype = TOK_GE;
559 else if (c == '>')
560 {
561 ttype = TOK_RSH;
562 c = preproc_lex_fetch_byte(pp);
563 if (c == '=')
564 ttype = TOK_RSHASS;
565 else
566 preproc_lex_unfetch_byte(pp, c);
567 }
568 else
569 preproc_lex_unfetch_byte(pp, c);
570 goto out;
571
572 case '\'':
573 /* character constant - turns into a uint */
574 chrlit:
575 cl = 0;
576 strbuf = strbuf_new();
577 for (;;)
578 {
579 c = preproc_lex_fetch_byte(pp);
580 if (c == CPP_EOF || c == CPP_EOL || c == '\'')
581 break;
582 cl++;
583 if (c == '\\')
584 {
585 strbuf_add(strbuf, '\\');
586 c = preproc_lex_fetch_byte(pp);
587 if (c == CPP_EOF || c == CPP_EOL)
588 {
589 preproc_throw_error(pp, "Invalid character constant");
590 break;
591 }
592 cl++;
593 strbuf_add(strbuf, c);
594 continue;
595 }
596 strbuf_add(strbuf, c);
597 }
598 if (cl == 0)
599 preproc_throw_error(pp, "Invalid character constant");
600 strval = strbuf_end(strbuf);
601 ttype = TOK_CHR_LIT;
602 goto out;
603
604 case '"':
605 strlit:
606 /* string literal */
607 strbuf = strbuf_new();
608 for (;;)
609 {
610 c = preproc_lex_fetch_byte(pp);
611 if (c == CPP_EOF || c == CPP_EOL || c == '"')
612 break;
613 if (c == '\\')
614 {
615 strbuf_add(strbuf, '\\');
616 c = preproc_lex_fetch_byte(pp);
617 if (c == CPP_EOF || c == CPP_EOL)
618 {
619 preproc_throw_error(pp, "Invalid string constant");
620 break;
621 }
622 cl++;
623 strbuf_add(strbuf, c);
624 continue;
625 }
626 strbuf_add(strbuf, c);
627 }
628 strval = strbuf_end(strbuf);
629 ttype = TOK_STR_LIT;
630 goto out;
631
632 case 'L':
633 /* check for wide string or wide char const */
634 c2 = preproc_lex_fetch_byte(pp);
635 if (c2 == '\'')
636 {
637 goto chrlit;
638 }
639 else if (c2 == '"')
640 {
641 goto strlit;
642 }
643 preproc_lex_unfetch_byte(pp, c2);
644 /* fall through for identifier */
645 case '_':
646 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
647 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
648 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
649 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
650 case 'y': case 'z':
651 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
652 case 'G': case 'H': case 'I': case 'J': case 'K':
653 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
654 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
655 case 'Y': case 'Z':
656 /* we have an identifier here */
657 strbuf = strbuf_new();
658 strbuf_add(strbuf, c);
659 for (;;)
660 {
661 c = preproc_lex_fetch_byte(pp);
662 if ((c == '_') || (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
663 {
664 strbuf_add(strbuf, c);
665 continue;
666 }
667 else
668 {
669 strbuf_add(strbuf, 0);
670 strval = strbuf_end(strbuf);
671 break;
672 }
673 }
674 preproc_lex_unfetch_byte(pp, c);
675 ttype = TOK_IDENT;
676 goto out;
677
678 case '.':
679 c = preproc_lex_fetch_byte(pp);
680 if (c >= '0' && c <= '9')
681 {
682 strbuf = strbuf_new();
683 strbuf_add(strbuf, '.');
684 goto numlit;
685 }
686 else if (c == '.')
687 {
688 c = preproc_lex_fetch_byte(pp);
689 if (c == '.')
690 {
691 ttype = TOK_ELLIPSIS;
692 goto out;
693 }
694 preproc_lex_unfetch_byte(pp, c);
695 }
696 preproc_lex_unfetch_byte(pp, c);
697 ttype = TOK_DOT;
698 goto out;
699
700 case '0': case '1': case '2': case '3': case '4':
701 case '5': case '6': case '7': case '8': case '9':
702 strbuf = strbuf_new();
703 numlit:
704 strbuf_add(strbuf, c);
705 for (;;)
706 {
707 c = preproc_lex_fetch_byte(pp);
708 if (!((c == '_') || (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')))
709 break;
710 strbuf_add(strbuf, c);
711 if (c == 'e' || c == 'E' || c == 'p' || c == 'P')
712 {
713 c = preproc_lex_fetch_byte(pp);
714 if (c == '+' || c == '-')
715 {
716 strbuf_add(strbuf, c);
717 continue;
718 }
719 preproc_lex_unfetch_byte(pp, c);
720 }
721 }
722 strval = strbuf_end(strbuf);
723 preproc_lex_unfetch_byte(pp, c);
724 goto out;
725
726 default:
727 ttype = TOK_CHAR;
728 strval = lw_alloc(2);
729 strval[0] = c;
730 strval[1] = 0;
731 break;
732 }
733 out:
734 t = token_create(ttype, strval, sline, scol, pp -> fn);
735 lw_free(strval);
736 return t;
737 }