Mercurial > hg > index.cgi
comparison lwcc/cpp/file.c @ 292:40ecbd5da481 ccdev
Part one of the C preprocessor
This is part one of the C preprocessor. It finds and then fails to interpret
directives. Also handles line splicing and trigraphs.
author | William Astle <lost@l-w.ca> |
---|---|
date | Sun, 08 Sep 2013 21:58:12 -0600 |
parents | |
children | c419b3b3d43f |
comparison
equal
deleted
inserted
replaced
291:83f682ed4d65 | 292:40ecbd5da481 |
---|---|
1 /* | |
2 lwcc/cpp/file.c | |
3 | |
4 Copyright © 2013 William Astle | |
5 | |
6 This file is part of LWTOOLS. | |
7 | |
8 LWTOOLS is free software: you can redistribute it and/or modify it under the | |
9 terms of the GNU General Public License as published by the Free Software | |
10 Foundation, either version 3 of the License, or (at your option) any later | |
11 version. | |
12 | |
13 This program is distributed in the hope that it will be useful, but WITHOUT | |
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | |
16 more details. | |
17 | |
18 You should have received a copy of the GNU General Public License along with | |
19 this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | |
21 | |
22 NOTES: | |
23 | |
24 The function fetch_byte() grabs a byte from the input file. It returns | |
25 CPP_EOF if end of file has been reached. The resulting byte has passed | |
26 through three filters, in order: | |
27 | |
28 * All CRLF, LFCR, LF, and CR have been converted to CPP_EOL | |
29 * If enabled (--trigraphs), trigraphs have been interpreted | |
30 * \\n (backslash-newline) has been processed (eliminated) | |
31 | |
32 To obtain a byte without processing \\n, call fetch_byte_tg(). | |
33 | |
34 */ | |
35 | |
36 #include <errno.h> | |
37 #include <stdio.h> | |
38 #include <string.h> | |
39 | |
40 #include <lw_alloc.h> | |
41 | |
42 #include "cpp.h" | |
43 | |
/*
Head of the stack of files currently being processed. process_file() links a
new entry in at the head before preprocessing a file and unlinks it again
afterwards. NULL when no file is being processed.
*/
struct file_stack_e *file_stack = NULL;
45 | |
/*
Returns 1 if c is a space, tab, carriage return, or line feed; 0 otherwise.
*/
int is_whitespace(int c)
{
	if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
		return 1;
	return 0;
}
58 | |
/*
Returns 1 if c may start an identifier (an underscore or a letter in the
ranges 'A'-'Z' / 'a'-'z'); 0 otherwise.

The parameter is explicitly declared int; the old-style "is_sidchr(c)" form
relied on implicit int, which is not valid C99 or later.
*/
int is_sidchr(int c)
{
	if (c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
		return 1;
	return 0;
}
65 | |
/*
Returns 1 if c may appear inside an identifier after its first character:
a decimal digit or anything is_sidchr() accepts. Returns 0 otherwise.
*/
int is_idchr(int c)
{
	int digit = ('0' <= c && c <= '9');

	return digit ? 1 : is_sidchr(c);
}
72 | |
/*
Returns 1 if c is an exponent marker ('e', 'E', 'p' or 'P'); 0 otherwise.
*/
int is_ep(int c)
{
	switch (c)
	{
	case 'e':
	case 'E':
	case 'p':
	case 'P':
		return 1;

	default:
		return 0;
	}
}
79 | |
/*
Returns 1 if c is a hexadecimal digit ('0'-'9', 'a'-'f' or 'A'-'F');
0 otherwise.
*/
int is_hex(int c)
{
	int lower = ('a' <= c && c <= 'f');
	int upper = ('A' <= c && c <= 'F');
	int digit = ('0' <= c && c <= '9');

	return (lower || upper || digit) ? 1 : 0;
}
90 | |
/*
Returns 1 if c is a decimal digit ('0'-'9'); 0 otherwise.
*/
int is_dec(int c)
{
	return ('0' <= c && c <= '9') ? 1 : 0;
}
97 | |
98 static void outchr(int c) | |
99 { | |
100 fputc(c, output_fp); | |
101 } | |
102 | |
/*
Write the NUL terminated string s to the output, one character at a time,
via outchr(). The terminating NUL is not written.
*/
static void outstr(char *s)
{
	char *p;

	for (p = s; *p != '\0'; p++)
		outchr(*p);
}
108 | |
109 int fetch_byte_ll(struct file_stack_e *f) | |
110 { | |
111 int c; | |
112 | |
113 if (f -> eolstate != 0) | |
114 { | |
115 f -> line++; | |
116 f -> col = 0; | |
117 } | |
118 c = getc(f -> fp); | |
119 f -> col++; | |
120 if (f -> eolstate == 1) | |
121 { | |
122 // just saw CR, munch LF | |
123 if (c == 10) | |
124 c = getc(f -> fp); | |
125 f -> eolstate = 0; | |
126 } | |
127 else if (f -> eolstate == 2) | |
128 { | |
129 // just saw LF, much CR | |
130 if (c == 13) | |
131 c = getc(f -> fp); | |
132 f -> eolstate = 0; | |
133 } | |
134 | |
135 if (c == 10) | |
136 { | |
137 // we have LF - end of line, flag to munch CR | |
138 f -> eolstate = 2; | |
139 c = CPP_EOL; | |
140 } | |
141 else if (c == 13) | |
142 { | |
143 // we have CR - end of line, flag to munch LF | |
144 f -> eolstate = 1; | |
145 c = CPP_EOL; | |
146 } | |
147 else if (c == EOF) | |
148 { | |
149 c = CPP_EOF; | |
150 } | |
151 return c; | |
152 } | |
153 | |
154 int fetch_byte_tg(struct file_stack_e *f) | |
155 { | |
156 int c; | |
157 | |
158 if (!trigraphs) | |
159 { | |
160 c = fetch_byte_ll(f); | |
161 } | |
162 else | |
163 { | |
164 /* we have to do the trigraph shit here */ | |
165 if (f -> ra != CPP_NOUNG) | |
166 { | |
167 if (f -> qseen > 0) | |
168 { | |
169 c = '?'; | |
170 f -> qseen -= 1; | |
171 return c; | |
172 } | |
173 else | |
174 { | |
175 c = f -> ra; | |
176 f -> ra = CPP_NOUNG; | |
177 return c; | |
178 } | |
179 } | |
180 | |
181 c = fetch_byte_ll(f); | |
182 while (c == '?') | |
183 { | |
184 f -> qseen++; | |
185 c = fetch_byte_ll(f); | |
186 } | |
187 | |
188 if (f -> qseen >= 2) | |
189 { | |
190 // we have a trigraph | |
191 switch (c) | |
192 { | |
193 case '=': | |
194 c = '#'; | |
195 f -> qseen -= 2; | |
196 break; | |
197 | |
198 case '/': | |
199 c = '\\'; | |
200 f -> qseen -= 2; | |
201 break; | |
202 | |
203 case '\'': | |
204 c = '^'; | |
205 f -> qseen -= 2; | |
206 break; | |
207 | |
208 case '(': | |
209 c = '['; | |
210 f -> qseen -= 2; | |
211 break; | |
212 | |
213 case ')': | |
214 c = ']'; | |
215 f -> qseen -= 2; | |
216 break; | |
217 | |
218 case '!': | |
219 c = '|'; | |
220 f -> qseen -= 2; | |
221 break; | |
222 | |
223 case '<': | |
224 c = '{'; | |
225 f -> qseen -= 2; | |
226 break; | |
227 | |
228 case '>': | |
229 c = '}'; | |
230 f -> qseen -= 2; | |
231 break; | |
232 | |
233 case '~': | |
234 c = '~'; | |
235 f -> qseen -= 2; | |
236 break; | |
237 } | |
238 if (f -> qseen > 0) | |
239 { | |
240 f -> ra = c; | |
241 c = '?'; | |
242 f -> qseen--; | |
243 } | |
244 } | |
245 else if (f -> qseen > 0) | |
246 { | |
247 f -> ra = c; | |
248 c = '?'; | |
249 f -> qseen--; | |
250 } | |
251 } | |
252 return c; | |
253 } | |
254 | |
255 int fetch_byte(struct file_stack_e *f) | |
256 { | |
257 int c; | |
258 | |
259 again: | |
260 if (f -> unget != CPP_NOUNG) | |
261 { | |
262 c = f -> unget; | |
263 f -> unget = CPP_NOUNG; | |
264 } | |
265 else | |
266 { | |
267 c = fetch_byte_tg(f); | |
268 } | |
269 if (c == '\\') | |
270 { | |
271 int c2; | |
272 c2 = fetch_byte_tg(f); | |
273 if (c2 == CPP_EOL) | |
274 goto again; | |
275 else | |
276 f -> unget = c2; | |
277 } | |
278 f -> curc = c; | |
279 return c; | |
280 } | |
281 | |
282 static void skip_line(struct file_stack_e *f) | |
283 { | |
284 int c; | |
285 while ((c = fetch_byte(f)) != CPP_EOL && c != CPP_EOF) | |
286 /* do nothing */ ; | |
287 } | |
288 | |
289 | |
/*
Table of known preprocessing directives, searched by handle_directive().
Each entry pairs a directive keyword with the function that handles it; the
handler receives the file the directive was read from. The search stops at
the first entry whose name is NULL. No directives are registered yet, so
the table holds only NULL entries.
*/
struct
{
	char *name;							// directive keyword
	void (*fn)(struct file_stack_e *);	// handler for the directive
} directives[] =
{
	{ NULL, NULL },
	{ NULL, NULL }
};
299 | |
300 /* | |
301 This handles a preprocessing directive. Such a directive goes from the | |
302 next character to be retrieved from f until the first instance of CPP_EOL | |
303 or CPP_EOF. | |
304 */ | |
305 void handle_directive(struct file_stack_e *f) | |
306 { | |
307 int c, i; | |
308 char kw[20]; | |
309 | |
310 again: | |
311 while ((c = fetch_byte(f)) == ' ' || c == '\t') | |
312 /* do nothing */ ; | |
313 if (c == '/') | |
314 { | |
315 // maybe a comment // | |
316 c = fetch_byte(f); | |
317 if (c == '/') | |
318 { | |
319 // line comment | |
320 skip_line(f); | |
321 return; | |
322 } | |
323 if (c == '*') | |
324 { | |
325 // block comment | |
326 while (1) | |
327 { | |
328 c = fetch_byte(f); | |
329 if (c == CPP_EOF) | |
330 return; | |
331 if (c == '*') | |
332 { | |
333 c = fetch_byte(f); | |
334 if (c == '/') | |
335 { | |
336 // end of comment - try again for directive | |
337 goto again; | |
338 } | |
339 if (c == CPP_EOF) | |
340 return; | |
341 } | |
342 } | |
343 } | |
344 } | |
345 | |
346 // empty directive - do nothing | |
347 if (c == CPP_EOL) | |
348 return; | |
349 | |
350 if (c < 'a' || c > 'z') | |
351 goto out; | |
352 | |
353 i = 0; | |
354 do | |
355 { | |
356 kw[i++] = c; | |
357 if (i == sizeof(kw) - 1) | |
358 goto out; // keyword too long | |
359 c = fetch_byte(f); | |
360 } while ((c >= 'a' && c <= 'z') || (c == '_')); | |
361 kw[i++] = '\0'; | |
362 | |
363 /* we have a keyword here */ | |
364 for (i = 0; directives[i].name; i++) | |
365 { | |
366 if (strcmp(directives[i].name, kw) == 0) | |
367 { | |
368 (*directives[i].fn)(f); | |
369 return; | |
370 } | |
371 } | |
372 | |
373 /* if we fall through here, we have an unknown directive */ | |
374 out: | |
375 do_error("invalid preprocessor directive"); | |
376 skip_line(f); | |
377 } | |
378 | |
379 /* | |
380 Notes: | |
381 | |
382 Rather than tokenize the entire file, we run through it interpreting | |
383 things only as much as we need to in order to identify the following: | |
384 | |
385 preprocessing directives (#...) | |
386 identifiers which might need to be replaced with macros | |
387 | |
388 We have to interpret strings, character constants, and numbers to prevent | |
389 false positives in those situations. | |
390 | |
391 When we find a preprocessing directive, it is handled with a more | |
392 aggressive tokenization process and then intepreted accordingly. | |
393 | |
394 nlws is used to record the fact that only whitespace has occurred at the | |
395 start of a line. Whitespace is defined as comments or isspace(c). It gets | |
396 reset to 1 after each EOL character. If a non-whitespace character is | |
397 encountered, it is set to -1. If the character processing decides it really | |
398 is a whitespace character, it will set nlws back to 1 (block comment). | |
399 Elsewise, it will get set to 0 if it is still -1 when the loop starts again. | |
400 | |
401 This is needed so we can identify whitespace interposed before a | |
402 preprocessor directive. This is the only case where it matters for | |
403 the preprocessor. | |
404 | |
405 */ | |
406 void preprocess_file(struct file_stack_e *f) | |
407 { | |
408 int c; | |
409 int nlws = 1; | |
410 | |
411 while (1) | |
412 { | |
413 c = fetch_byte(f); | |
414 again: | |
415 if (nlws == -1) | |
416 nlws = 0; | |
417 if (c == CPP_EOF) | |
418 { | |
419 outchr('\n'); | |
420 return; | |
421 } | |
422 if (c == CPP_EOL) | |
423 { | |
424 nlws = 1; | |
425 outchr('\n'); | |
426 continue; | |
427 } | |
428 | |
429 if (!is_whitespace(c)) | |
430 nlws = -1; | |
431 | |
432 if (is_sidchr(c)) | |
433 { | |
434 // have identifier here - parse it off | |
435 char *ident = NULL; | |
436 int idlen = 0; | |
437 | |
438 do | |
439 { | |
440 ident = lw_realloc(ident, idlen + 1); | |
441 ident[idlen++] = c; | |
442 ident[idlen] = '\0'; | |
443 c = fetch_byte(f); | |
444 } while (is_idchr(c)); | |
445 | |
446 /* do something with the identifier here - macros, etc. */ | |
447 outstr(ident); | |
448 lw_free(ident); | |
449 | |
450 goto again; | |
451 } | |
452 | |
453 switch (c) | |
454 { | |
455 default: | |
456 outchr(c); | |
457 break; | |
458 | |
459 case '.': // a number - to prevent seeing an identifier in middle of number | |
460 outchr(c); | |
461 c = fetch_byte(f); | |
462 if (!is_dec(c)) | |
463 goto again; | |
464 /* fall through */ | |
465 case '0': | |
466 case '1': | |
467 case '2': | |
468 case '3': | |
469 case '4': | |
470 case '5': | |
471 case '6': | |
472 case '7': | |
473 case '8': | |
474 case '9': | |
475 do | |
476 { | |
477 outchr(c); | |
478 c = fetch_byte(f); | |
479 if (c == CPP_EOF) | |
480 return; | |
481 if (is_ep(c)) | |
482 { | |
483 outchr(c); | |
484 c = fetch_byte(f); | |
485 if (c == '-' || c == '+') | |
486 { | |
487 outchr(c); | |
488 c = fetch_byte(f); | |
489 } | |
490 } | |
491 } while ((is_idchr(c)) || (c == '.')); | |
492 goto again; | |
493 | |
494 case '#': | |
495 if (nlws) | |
496 { | |
497 handle_directive(f); | |
498 /* note: no need to reset nlws */ | |
499 } | |
500 else | |
501 outchr('#'); | |
502 break; | |
503 | |
504 case '\'': // character constant | |
505 outchr('\''); | |
506 while ((c = fetch_byte(f)) != '\'') | |
507 { | |
508 if (c == '\\') | |
509 { | |
510 outchr('\\'); | |
511 c = fetch_byte(f); | |
512 } | |
513 if (c == CPP_EOL) | |
514 { | |
515 do_warning("Unterminated character constant"); | |
516 goto again; | |
517 } | |
518 if (c == CPP_EOF) | |
519 return; | |
520 outchr(c); | |
521 } | |
522 outchr(c); | |
523 break; | |
524 | |
525 case '"': // strings | |
526 outchr(c); | |
527 while ((c = fetch_byte(f)) != '"') | |
528 { | |
529 if (c == '\\') | |
530 { | |
531 outchr('\\'); | |
532 c = fetch_byte(f); | |
533 } | |
534 if (c == CPP_EOL) | |
535 { | |
536 do_warning("unterminated string literal"); | |
537 goto again; | |
538 } | |
539 if (c == CPP_EOF) | |
540 return; | |
541 outchr(c); | |
542 } | |
543 outchr(c); | |
544 break; | |
545 | |
546 case '/': // comments | |
547 c = fetch_byte(f); | |
548 if (c == '/') | |
549 { | |
550 // line comment | |
551 outchr(' '); | |
552 do | |
553 { | |
554 c = fetch_byte(f); | |
555 } while (c != CPP_EOF && c != CPP_EOL); | |
556 } | |
557 else if (c == '*') | |
558 { | |
559 // block comment | |
560 for (;;) | |
561 { | |
562 c = fetch_byte(f); | |
563 if (c == CPP_EOF) | |
564 { | |
565 break; | |
566 } | |
567 if (c == CPP_EOL) | |
568 { | |
569 continue; | |
570 } | |
571 if (c == '*') | |
572 { | |
573 // maybe end of comment | |
574 c = fetch_byte(f); | |
575 if (c == '/') | |
576 { | |
577 // end of comment | |
578 break; | |
579 } | |
580 } | |
581 } | |
582 // replace comment with a single space | |
583 outchr(' '); | |
584 if (nlws == -1) | |
585 nlws = 1; | |
586 continue; | |
587 } | |
588 else | |
589 { | |
590 // restore eaten '/' | |
591 outchr('/'); | |
592 // process the character we just fetched | |
593 goto again; | |
594 } | |
595 } // switch | |
596 } // processing loop | |
597 } | |
598 | |
599 int process_file(const char *f) | |
600 { | |
601 struct file_stack_e *nf; | |
602 FILE *fp; | |
603 | |
604 fprintf(stderr, "Processing %s\n", f); | |
605 | |
606 if (strcmp(f, "-") == 0) | |
607 fp = stdin; | |
608 else | |
609 fp = fopen(f, "rb"); | |
610 if (fp == NULL) | |
611 { | |
612 do_warning("Cannot open %s: %s", f, strerror(errno)); | |
613 return -1; | |
614 } | |
615 | |
616 /* push the file onto the file stack */ | |
617 nf = lw_alloc(sizeof(struct file_stack_e)); | |
618 nf -> fn = f; | |
619 nf -> fp = fp; | |
620 nf -> next = file_stack; | |
621 nf -> line = 1; | |
622 nf -> col = 0; | |
623 nf -> qseen = 0; | |
624 nf -> ra = CPP_NOUNG; | |
625 nf -> unget = CPP_NOUNG; | |
626 file_stack = nf; | |
627 | |
628 /* go preprocess the file */ | |
629 preprocess_file(nf); | |
630 | |
631 if (nf -> fp != stdin) | |
632 fclose(nf -> fp); | |
633 file_stack = nf -> next; | |
634 lw_free(nf); | |
635 return 0; | |
636 } |