comparison lwcc/cpp/file.c @ 292:40ecbd5da481 ccdev

Part one of the C preprocessor. This is part one of the C preprocessor. It finds and then fails to interpret directives. It also handles line splicing and trigraphs.
author William Astle <lost@l-w.ca>
date Sun, 08 Sep 2013 21:58:12 -0600
parents
children c419b3b3d43f
comparison
equal deleted inserted replaced
291:83f682ed4d65 292:40ecbd5da481
1 /*
2 lwcc/cpp/file.c
3
4 Copyright © 2013 William Astle
5
6 This file is part of LWTOOLS.
7
8 LWTOOLS is free software: you can redistribute it and/or modify it under the
9 terms of the GNU General Public License as published by the Free Software
10 Foundation, either version 3 of the License, or (at your option) any later
11 version.
12
13 This program is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 more details.
17
18 You should have received a copy of the GNU General Public License along with
19 this program. If not, see <http://www.gnu.org/licenses/>.
20
21
22 NOTES:
23
24 The function fetch_byte() grabs a byte from the input file. It returns
25 CPP_EOF if end of file has been reached. The resulting byte has passed
26 through three filters, in order:
27
28 * All CRLF, LFCR, LF, and CR have been converted to CPP_EOL
29 * If enabled (--trigraphs), trigraphs have been interpreted
30 * \\n (backslash-newline) has been processed (eliminated)
31
32 To obtain a byte without processing \\n, call fetch_byte_tg().
33
34 */
35
36 #include <errno.h>
37 #include <stdio.h>
38 #include <string.h>
39
40 #include <lw_alloc.h>
41
42 #include "cpp.h"
43
/* Head of the linked stack of files being processed: process_file() pushes an entry before preprocessing a file and pops it again afterwards. NULL when the stack is empty. */
44 struct file_stack_e *file_stack = NULL;
45
/*
Return 1 if c is a white-space character, 0 otherwise.

The accepted set is the one isspace() uses in the "C" locale: space,
horizontal tab, vertical tab, form feed, carriage return and newline.
Vertical tab and form feed are included so that a '#' preceded only by
such characters on its line is still recognised as starting a directive.
*/
int is_whitespace(int c)
{
	switch (c)
	{
	case ' ':
	case '\t':
	case '\v':
	case '\f':
	case '\r':
	case '\n':
		return 1;

	default:
		return 0;
	}
}
58
/*
Return 1 if c may start an identifier (an underscore or an ASCII letter),
0 otherwise.
*/
int is_sidchr(int c)
{
	if (c == '_')
		return 1;
	if (c >= 'A' && c <= 'Z')
		return 1;
	if (c >= 'a' && c <= 'z')
		return 1;
	return 0;
}
65
/*
Return 1 if c may appear inside an identifier: a decimal digit or anything
is_sidchr() accepts. Returns 0 otherwise.
*/
int is_idchr(int c)
{
	int digit = (c >= '0' && c <= '9');

	return digit ? 1 : is_sidchr(c);
}
72
/*
Return 1 if c is an exponent marker in a number ('e', 'E', 'p' or 'P'),
0 otherwise.
*/
int is_ep(int c)
{
	switch (c)
	{
	case 'e':
	case 'E':
	case 'p':
	case 'P':
		return 1;

	default:
		return 0;
	}
}
79
/*
Return 1 if c is an ASCII hexadecimal digit (0-9, a-f, A-F), 0 otherwise.
*/
int is_hex(int c)
{
	int digit = (c >= '0' && c <= '9');
	int lower = (c >= 'a' && c <= 'f');
	int upper = (c >= 'A' && c <= 'F');

	return (digit || lower || upper) ? 1 : 0;
}
90
/*
Return 1 if c is an ASCII decimal digit, 0 otherwise.
*/
int is_dec(int c)
{
	return (c < '0' || c > '9') ? 0 : 1;
}
97
98 static void outchr(int c)
99 {
100 fputc(c, output_fp);
101 }
102
/*
Write the NUL terminated string s to the output, one byte at a time via
outchr(). The string is only read, so it is taken as a pointer to const;
existing callers passing a plain char pointer are unaffected.
*/
static void outstr(const char *s)
{
	for (; *s != '\0'; s++)
		outchr(*s);
}
108
109 int fetch_byte_ll(struct file_stack_e *f)
110 {
111 int c;
112
113 if (f -> eolstate != 0)
114 {
115 f -> line++;
116 f -> col = 0;
117 }
118 c = getc(f -> fp);
119 f -> col++;
120 if (f -> eolstate == 1)
121 {
122 // just saw CR, munch LF
123 if (c == 10)
124 c = getc(f -> fp);
125 f -> eolstate = 0;
126 }
127 else if (f -> eolstate == 2)
128 {
129 // just saw LF, much CR
130 if (c == 13)
131 c = getc(f -> fp);
132 f -> eolstate = 0;
133 }
134
135 if (c == 10)
136 {
137 // we have LF - end of line, flag to munch CR
138 f -> eolstate = 2;
139 c = CPP_EOL;
140 }
141 else if (c == 13)
142 {
143 // we have CR - end of line, flag to munch LF
144 f -> eolstate = 1;
145 c = CPP_EOL;
146 }
147 else if (c == EOF)
148 {
149 c = CPP_EOF;
150 }
151 return c;
152 }
153
154 int fetch_byte_tg(struct file_stack_e *f)
155 {
156 int c;
157
158 if (!trigraphs)
159 {
160 c = fetch_byte_ll(f);
161 }
162 else
163 {
164 /* we have to do the trigraph shit here */
165 if (f -> ra != CPP_NOUNG)
166 {
167 if (f -> qseen > 0)
168 {
169 c = '?';
170 f -> qseen -= 1;
171 return c;
172 }
173 else
174 {
175 c = f -> ra;
176 f -> ra = CPP_NOUNG;
177 return c;
178 }
179 }
180
181 c = fetch_byte_ll(f);
182 while (c == '?')
183 {
184 f -> qseen++;
185 c = fetch_byte_ll(f);
186 }
187
188 if (f -> qseen >= 2)
189 {
190 // we have a trigraph
191 switch (c)
192 {
193 case '=':
194 c = '#';
195 f -> qseen -= 2;
196 break;
197
198 case '/':
199 c = '\\';
200 f -> qseen -= 2;
201 break;
202
203 case '\'':
204 c = '^';
205 f -> qseen -= 2;
206 break;
207
208 case '(':
209 c = '[';
210 f -> qseen -= 2;
211 break;
212
213 case ')':
214 c = ']';
215 f -> qseen -= 2;
216 break;
217
218 case '!':
219 c = '|';
220 f -> qseen -= 2;
221 break;
222
223 case '<':
224 c = '{';
225 f -> qseen -= 2;
226 break;
227
228 case '>':
229 c = '}';
230 f -> qseen -= 2;
231 break;
232
233 case '~':
234 c = '~';
235 f -> qseen -= 2;
236 break;
237 }
238 if (f -> qseen > 0)
239 {
240 f -> ra = c;
241 c = '?';
242 f -> qseen--;
243 }
244 }
245 else if (f -> qseen > 0)
246 {
247 f -> ra = c;
248 c = '?';
249 f -> qseen--;
250 }
251 }
252 return c;
253 }
254
255 int fetch_byte(struct file_stack_e *f)
256 {
257 int c;
258
259 again:
260 if (f -> unget != CPP_NOUNG)
261 {
262 c = f -> unget;
263 f -> unget = CPP_NOUNG;
264 }
265 else
266 {
267 c = fetch_byte_tg(f);
268 }
269 if (c == '\\')
270 {
271 int c2;
272 c2 = fetch_byte_tg(f);
273 if (c2 == CPP_EOL)
274 goto again;
275 else
276 f -> unget = c2;
277 }
278 f -> curc = c;
279 return c;
280 }
281
282 static void skip_line(struct file_stack_e *f)
283 {
284 int c;
285 while ((c = fetch_byte(f)) != CPP_EOL && c != CPP_EOF)
286 /* do nothing */ ;
287 }
288
289
/*
Table of known directives, searched by handle_directive(). The search
stops at the first entry whose name is NULL, so as it stands the table
holds no directives.
*/
struct
{
	char *name;
	void (*fn)(struct file_stack_e *);
} directives[] =
{
	{ .name = NULL, .fn = NULL },
	{ .name = NULL, .fn = NULL }
};
299
300 /*
301 This handles a preprocessing directive. Such a directive goes from the
302 next character to be retrieved from f until the first instance of CPP_EOL
303 or CPP_EOF.
304 */
305 void handle_directive(struct file_stack_e *f)
306 {
307 int c, i;
308 char kw[20];
309
310 again:
311 while ((c = fetch_byte(f)) == ' ' || c == '\t')
312 /* do nothing */ ;
313 if (c == '/')
314 {
315 // maybe a comment //
316 c = fetch_byte(f);
317 if (c == '/')
318 {
319 // line comment
320 skip_line(f);
321 return;
322 }
323 if (c == '*')
324 {
325 // block comment
326 while (1)
327 {
328 c = fetch_byte(f);
329 if (c == CPP_EOF)
330 return;
331 if (c == '*')
332 {
333 c = fetch_byte(f);
334 if (c == '/')
335 {
336 // end of comment - try again for directive
337 goto again;
338 }
339 if (c == CPP_EOF)
340 return;
341 }
342 }
343 }
344 }
345
346 // empty directive - do nothing
347 if (c == CPP_EOL)
348 return;
349
350 if (c < 'a' || c > 'z')
351 goto out;
352
353 i = 0;
354 do
355 {
356 kw[i++] = c;
357 if (i == sizeof(kw) - 1)
358 goto out; // keyword too long
359 c = fetch_byte(f);
360 } while ((c >= 'a' && c <= 'z') || (c == '_'));
361 kw[i++] = '\0';
362
363 /* we have a keyword here */
364 for (i = 0; directives[i].name; i++)
365 {
366 if (strcmp(directives[i].name, kw) == 0)
367 {
368 (*directives[i].fn)(f);
369 return;
370 }
371 }
372
373 /* if we fall through here, we have an unknown directive */
374 out:
375 do_error("invalid preprocessor directive");
376 skip_line(f);
377 }
378
379 /*
380 Notes:
381
382 Rather than tokenize the entire file, we run through it interpreting
383 things only as much as we need to in order to identify the following:
384
385 preprocessing directives (#...)
386 identifiers which might need to be replaced with macros
387
388 We have to interpret strings, character constants, and numbers to prevent
389 false positives in those situations.
390
391 When we find a preprocessing directive, it is handled with a more
392 aggressive tokenization process and then intepreted accordingly.
393
394 nlws is used to record the fact that only whitespace has occurred at the
395 start of a line. Whitespace is defined as comments or isspace(c). It gets
396 reset to 1 after each EOL character. If a non-whitespace character is
397 encountered, it is set to -1. If the character processing decides it really
398 is a whitespace character, it will set nlws back to 1 (block comment).
399 Elsewise, it will get set to 0 if it is still -1 when the loop starts again.
400
401 This is needed so we can identify whitespace interposed before a
402 preprocessor directive. This is the only case where it matters for
403 the preprocessor.
404
405 */
406 void preprocess_file(struct file_stack_e *f)
407 {
408 int c;
409 int nlws = 1;
410
411 while (1)
412 {
413 c = fetch_byte(f);
414 again:
415 if (nlws == -1)
416 nlws = 0;
417 if (c == CPP_EOF)
418 {
419 outchr('\n');
420 return;
421 }
422 if (c == CPP_EOL)
423 {
424 nlws = 1;
425 outchr('\n');
426 continue;
427 }
428
429 if (!is_whitespace(c))
430 nlws = -1;
431
432 if (is_sidchr(c))
433 {
434 // have identifier here - parse it off
435 char *ident = NULL;
436 int idlen = 0;
437
438 do
439 {
440 ident = lw_realloc(ident, idlen + 1);
441 ident[idlen++] = c;
442 ident[idlen] = '\0';
443 c = fetch_byte(f);
444 } while (is_idchr(c));
445
446 /* do something with the identifier here - macros, etc. */
447 outstr(ident);
448 lw_free(ident);
449
450 goto again;
451 }
452
453 switch (c)
454 {
455 default:
456 outchr(c);
457 break;
458
459 case '.': // a number - to prevent seeing an identifier in middle of number
460 outchr(c);
461 c = fetch_byte(f);
462 if (!is_dec(c))
463 goto again;
464 /* fall through */
465 case '0':
466 case '1':
467 case '2':
468 case '3':
469 case '4':
470 case '5':
471 case '6':
472 case '7':
473 case '8':
474 case '9':
475 do
476 {
477 outchr(c);
478 c = fetch_byte(f);
479 if (c == CPP_EOF)
480 return;
481 if (is_ep(c))
482 {
483 outchr(c);
484 c = fetch_byte(f);
485 if (c == '-' || c == '+')
486 {
487 outchr(c);
488 c = fetch_byte(f);
489 }
490 }
491 } while ((is_idchr(c)) || (c == '.'));
492 goto again;
493
494 case '#':
495 if (nlws)
496 {
497 handle_directive(f);
498 /* note: no need to reset nlws */
499 }
500 else
501 outchr('#');
502 break;
503
504 case '\'': // character constant
505 outchr('\'');
506 while ((c = fetch_byte(f)) != '\'')
507 {
508 if (c == '\\')
509 {
510 outchr('\\');
511 c = fetch_byte(f);
512 }
513 if (c == CPP_EOL)
514 {
515 do_warning("Unterminated character constant");
516 goto again;
517 }
518 if (c == CPP_EOF)
519 return;
520 outchr(c);
521 }
522 outchr(c);
523 break;
524
525 case '"': // strings
526 outchr(c);
527 while ((c = fetch_byte(f)) != '"')
528 {
529 if (c == '\\')
530 {
531 outchr('\\');
532 c = fetch_byte(f);
533 }
534 if (c == CPP_EOL)
535 {
536 do_warning("unterminated string literal");
537 goto again;
538 }
539 if (c == CPP_EOF)
540 return;
541 outchr(c);
542 }
543 outchr(c);
544 break;
545
546 case '/': // comments
547 c = fetch_byte(f);
548 if (c == '/')
549 {
550 // line comment
551 outchr(' ');
552 do
553 {
554 c = fetch_byte(f);
555 } while (c != CPP_EOF && c != CPP_EOL);
556 }
557 else if (c == '*')
558 {
559 // block comment
560 for (;;)
561 {
562 c = fetch_byte(f);
563 if (c == CPP_EOF)
564 {
565 break;
566 }
567 if (c == CPP_EOL)
568 {
569 continue;
570 }
571 if (c == '*')
572 {
573 // maybe end of comment
574 c = fetch_byte(f);
575 if (c == '/')
576 {
577 // end of comment
578 break;
579 }
580 }
581 }
582 // replace comment with a single space
583 outchr(' ');
584 if (nlws == -1)
585 nlws = 1;
586 continue;
587 }
588 else
589 {
590 // restore eaten '/'
591 outchr('/');
592 // process the character we just fetched
593 goto again;
594 }
595 } // switch
596 } // processing loop
597 }
598
599 int process_file(const char *f)
600 {
601 struct file_stack_e *nf;
602 FILE *fp;
603
604 fprintf(stderr, "Processing %s\n", f);
605
606 if (strcmp(f, "-") == 0)
607 fp = stdin;
608 else
609 fp = fopen(f, "rb");
610 if (fp == NULL)
611 {
612 do_warning("Cannot open %s: %s", f, strerror(errno));
613 return -1;
614 }
615
616 /* push the file onto the file stack */
617 nf = lw_alloc(sizeof(struct file_stack_e));
618 nf -> fn = f;
619 nf -> fp = fp;
620 nf -> next = file_stack;
621 nf -> line = 1;
622 nf -> col = 0;
623 nf -> qseen = 0;
624 nf -> ra = CPP_NOUNG;
625 nf -> unget = CPP_NOUNG;
626 file_stack = nf;
627
628 /* go preprocess the file */
629 preprocess_file(nf);
630
631 if (nf -> fp != stdin)
632 fclose(nf -> fp);
633 file_stack = nf -> next;
634 lw_free(nf);
635 return 0;
636 }