comparison lwcc/cpp/file.c @ 293:c419b3b3d43f ccdev

Checkpoint on lwcc-cpp development. This is a checkpoint with some substantial code cleanups on what is implemented so far. This should avoid substantial code duplication later.
author William Astle <lost@l-w.ca>
date Mon, 09 Sep 2013 23:07:19 -0600
parents 40ecbd5da481
children 048adfee2933
comparison
equal deleted inserted replaced
292:40ecbd5da481 293:c419b3b3d43f
16 more details. 16 more details.
17 17
18 You should have received a copy of the GNU General Public License along with 18 You should have received a copy of the GNU General Public License along with
19 this program. If not, see <http://www.gnu.org/licenses/>. 19 this program. If not, see <http://www.gnu.org/licenses/>.
20 20
21
22 NOTES:
23
24 The function fetch_byte() grabs a byte from the input file. It returns
25 CPP_EOF if end of file has been reached. The resulting byte has passed
26 through three filters, in order:
27
28 * All CRLF, LFCR, LF, and CR have been converted to CPP_EOL
29 * If enabled (--trigraphs), trigraphs have been interpreted
30 * \\n (backslash-newline) has been processed (eliminated)
31
32 To obtain a byte without processing \\n, call fetch_byte_tg().
33
34 */ 21 */
35 22
36 #include <errno.h> 23 #include <errno.h>
37 #include <stdio.h> 24 #include <stdio.h>
38 #include <string.h> 25 #include <string.h>
41 28
42 #include "cpp.h" 29 #include "cpp.h"
43 30
44 struct file_stack_e *file_stack = NULL; 31 struct file_stack_e *file_stack = NULL;
45 32
46 int is_whitespace(int c) 33 /* output a byte to the current output stream as long as we aren't in the
47 { 34 middle of a false conditional. CPP_EOL will be converted to '\n'
48 switch (c) 35 on output. */
49 { 36 void outchr(int c)
50 case ' ': 37 {
51 case '\t': 38 if (skip_level)
52 case '\r': 39 return;
53 case '\n': 40 if (c == CPP_EOL)
54 return 1; 41 c = '\n';
55 }
56 return 0;
57 }
58
59 int is_sidchr(c)
60 {
61 if (c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
62 return 1;
63 return 0;
64 }
65
66 int is_idchr(int c)
67 {
68 if (c >= '0' && c <= '9')
69 return 1;
70 return is_sidchr(c);
71 }
72
73 int is_ep(int c)
74 {
75 if (c == 'e' || c == 'E' || c == 'p' || c == 'P')
76 return 1;
77 return 0;
78 }
79
80 int is_hex(int c)
81 {
82 if (c >= 'a' && c <= 'f')
83 return 1;
84 if (c >= 'A' && c <= 'F')
85 return 1;
86 if (c >= '0' && c <= '9')
87 return 1;
88 return 0;
89 }
90
91 int is_dec(int c)
92 {
93 if (c >= '0' && c <= '9')
94 return 1;
95 return 0;
96 }
97
98 static void outchr(int c)
99 {
100 fputc(c, output_fp); 42 fputc(c, output_fp);
101 } 43 }
102 44
103 static void outstr(char *s) 45 /* output a string to the current output stream as long as we aren't in the
104 { 46 middle of a false conditional */
47 void outstr(char *s)
48 {
49 if (skip_level)
50 return;
105 while (*s) 51 while (*s)
106 outchr(*s++); 52 outchr(*s++);
107 } 53 }
108 54
109 int fetch_byte_ll(struct file_stack_e *f) 55 /* fetch a raw input byte from the current file. Will return CPP_EOF if
56 EOF is encountered and CPP_EOL if an end of line sequence is encountered.
57 End of line is defined as either CR, CRLF, LF, or LFCR. CPP_EOL is
58 returned on the first CR or LF encountered. The complementary CR or LF
59 is munched, if present, when the *next* character is read. This always
60 operates on file_stack.
61
62 This function also accounts for line numbers in input files and also
63 character columns.
64 */
65 int fetch_byte_ll(void)
110 { 66 {
111 int c; 67 int c;
112 68
113 if (f -> eolstate != 0) 69 if (file_stack -> eolstate != 0)
114 { 70 {
115 f -> line++; 71 file_stack -> line++;
116 f -> col = 0; 72 file_stack -> col = 0;
117 } 73 }
118 c = getc(f -> fp); 74 c = getc(file_stack -> fp);
119 f -> col++; 75 file_stack -> col++;
120 if (f -> eolstate == 1) 76 if (file_stack -> eolstate == 1)
121 { 77 {
122 // just saw CR, munch LF 78 // just saw CR, munch LF
123 if (c == 10) 79 if (c == 10)
124 c = getc(f -> fp); 80 c = getc(file_stack -> fp);
125 f -> eolstate = 0; 81 file_stack -> eolstate = 0;
126 } 82 }
127 else if (f -> eolstate == 2) 83 else if (file_stack -> eolstate == 2)
128 { 84 {
129 // just saw LF, munch CR 85 // just saw LF, munch CR
130 if (c == 13) 86 if (c == 13)
131 c = getc(f -> fp); 87 c = getc(file_stack -> fp);
132 f -> eolstate = 0; 88 file_stack -> eolstate = 0;
133 } 89 }
134 90
135 if (c == 10) 91 if (c == 10)
136 { 92 {
137 // we have LF - end of line, flag to munch CR 93 // we have LF - end of line, flag to munch CR
138 f -> eolstate = 2; 94 file_stack -> eolstate = 2;
139 c = CPP_EOL; 95 c = CPP_EOL;
140 } 96 }
141 else if (c == 13) 97 else if (c == 13)
142 { 98 {
143 // we have CR - end of line, flag to munch LF 99 // we have CR - end of line, flag to munch LF
144 f -> eolstate = 1; 100 file_stack -> eolstate = 1;
145 c = CPP_EOL; 101 c = CPP_EOL;
146 } 102 }
147 else if (c == EOF) 103 else if (c == EOF)
148 { 104 {
149 c = CPP_EOF; 105 c = CPP_EOF;
150 } 106 }
151 return c; 107 return c;
152 } 108 }
153 109
154 int fetch_byte_tg(struct file_stack_e *f) 110 /* This function takes a sequence of bytes from the _ll function above
111 and does trigraph interpretation on it, but only if the global
112 trigraphs is nonzero. */
113 int fetch_byte_tg(void)
155 { 114 {
156 int c; 115 int c;
157 116
158 if (!trigraphs) 117 if (!trigraphs)
159 { 118 {
160 c = fetch_byte_ll(f); 119 c = fetch_byte_ll();
161 } 120 }
162 else 121 else
163 { 122 {
164 /* we have to do the trigraph shit here */ 123 /* we have to do the trigraph shit here */
165 if (f -> ra != CPP_NOUNG) 124 if (file_stack -> ra != CPP_NOUNG)
166 { 125 {
167 if (f -> qseen > 0) 126 if (file_stack -> qseen > 0)
168 { 127 {
169 c = '?'; 128 c = '?';
170 f -> qseen -= 1; 129 file_stack -> qseen -= 1;
171 return c; 130 return c;
172 } 131 }
173 else 132 else
174 { 133 {
175 c = f -> ra; 134 c = file_stack -> ra;
176 f -> ra = CPP_NOUNG; 135 file_stack -> ra = CPP_NOUNG;
177 return c; 136 return c;
178 } 137 }
179 } 138 }
180 139
181 c = fetch_byte_ll(f); 140 c = fetch_byte_ll();
182 while (c == '?') 141 while (c == '?')
183 { 142 {
184 f -> qseen++; 143 file_stack -> qseen++;
185 c = fetch_byte_ll(f); 144 c = fetch_byte_ll();
186 } 145 }
187 146
188 if (f -> qseen >= 2) 147 if (file_stack -> qseen >= 2)
189 { 148 {
190 // we have a trigraph 149 // we have a trigraph
191 switch (c) 150 switch (c)
192 { 151 {
193 case '=': 152 case '=':
194 c = '#'; 153 c = '#';
195 f -> qseen -= 2; 154 file_stack -> qseen -= 2;
196 break; 155 break;
197 156
198 case '/': 157 case '/':
199 c = '\\'; 158 c = '\\';
200 f -> qseen -= 2; 159 file_stack -> qseen -= 2;
201 break; 160 break;
202 161
203 case '\'': 162 case '\'':
204 c = '^'; 163 c = '^';
205 f -> qseen -= 2; 164 file_stack -> qseen -= 2;
206 break; 165 break;
207 166
208 case '(': 167 case '(':
209 c = '['; 168 c = '[';
210 f -> qseen -= 2; 169 file_stack -> qseen -= 2;
211 break; 170 break;
212 171
213 case ')': 172 case ')':
214 c = ']'; 173 c = ']';
215 f -> qseen -= 2; 174 file_stack -> qseen -= 2;
216 break; 175 break;
217 176
218 case '!': 177 case '!':
219 c = '|'; 178 c = '|';
220 f -> qseen -= 2; 179 file_stack -> qseen -= 2;
221 break; 180 break;
222 181
223 case '<': 182 case '<':
224 c = '{'; 183 c = '{';
225 f -> qseen -= 2; 184 file_stack -> qseen -= 2;
226 break; 185 break;
227 186
228 case '>': 187 case '>':
229 c = '}'; 188 c = '}';
230 f -> qseen -= 2; 189 file_stack -> qseen -= 2;
231 break; 190 break;
232 191
233 case '~': 192 case '~':
234 c = '~'; 193 c = '~';
235 f -> qseen -= 2; 194 file_stack -> qseen -= 2;
236 break; 195 break;
237 } 196 }
238 if (f -> qseen > 0) 197 if (file_stack -> qseen > 0)
239 { 198 {
240 f -> ra = c; 199 file_stack -> ra = c;
241 c = '?'; 200 c = '?';
242 f -> qseen--; 201 file_stack -> qseen--;
243 } 202 }
244 } 203 }
245 else if (f -> qseen > 0) 204 else if (file_stack -> qseen > 0)
246 { 205 {
247 f -> ra = c; 206 file_stack -> ra = c;
248 c = '?'; 207 c = '?';
249 f -> qseen--; 208 file_stack -> qseen--;
250 } 209 }
251 } 210 }
252 return c; 211 return c;
253 } 212 }
254 213
255 int fetch_byte(struct file_stack_e *f) 214 /* This function puts a byte back onto the front of the input stream used
215 by fetch_byte(). Theoretically, an unlimited number of characters can
216 be unfetched. Line and column counting may be incorrect if unfetched
217 characters cross a token boundary. */
218 void unfetch_byte(int c)
219 {
220 if (file_stack -> ungetbufl >= file_stack -> ungetbufs)
221 {
222 file_stack -> ungetbufs += 100;
223 file_stack -> ungetbuf = lw_realloc(file_stack -> ungetbuf, file_stack -> ungetbufs);
224 }
225 file_stack -> ungetbuf[file_stack -> ungetbufl++] = c;
226 }
227
228 /* This function retrieves a byte from the input stream. It performs
229 backslash-newline splicing on the returned bytes. Any character
230 retrieved from the unfetch buffer is presumed to have already passed
231 the backslash-newline filter. */
232 int fetch_byte(void)
256 { 233 {
257 int c; 234 int c;
235
236 if (file_stack -> ungetbufl > 0)
237 {
238 file_stack -> ungetbufl--;
239 c = file_stack -> ungetbuf[file_stack -> ungetbufl];
240 if (file_stack -> ungetbufl == 0)
241 {
242 lw_free(file_stack -> ungetbuf);
243 file_stack -> ungetbuf = NULL;
244 file_stack -> ungetbufs = 0;
245 }
246 return c;
247 }
258 248
259 again: 249 again:
260 if (f -> unget != CPP_NOUNG) 250 if (file_stack -> unget != CPP_NOUNG)
261 { 251 {
262 c = f -> unget; 252 c = file_stack -> unget;
263 f -> unget = CPP_NOUNG; 253 file_stack -> unget = CPP_NOUNG;
264 } 254 }
265 else 255 else
266 { 256 {
267 c = fetch_byte_tg(f); 257 c = fetch_byte_tg();
268 } 258 }
269 if (c == '\\') 259 if (c == '\\')
270 { 260 {
271 int c2; 261 int c2;
272 c2 = fetch_byte_tg(f); 262 c2 = fetch_byte_tg();
273 if (c2 == CPP_EOL) 263 if (c2 == CPP_EOL)
274 goto again; 264 goto again;
275 else 265 else
276 f -> unget = c2; 266 file_stack -> unget = c2;
277 } 267 }
278 f -> curc = c; 268 file_stack -> curc = c;
279 return c; 269 return c;
280 } 270 }
281 271
282 static void skip_line(struct file_stack_e *f) 272 /* This function opens (if not stdin) the file f and pushes it onto the
283 { 273 top of the input file stack. It then proceeds to process the file
284 int c; 274 and return. Nonzero return means the file could not be opened. */
285 while ((c = fetch_byte(f)) != CPP_EOL && c != CPP_EOF)
286 /* do nothing */ ;
287 }
288
289
290 struct
291 {
292 char *name;
293 void (*fn)(struct file_stack_e *);
294 } directives[] =
295 {
296 { NULL, NULL },
297 { NULL, NULL }
298 };
299
300 /*
301 This handles a preprocessing directive. Such a directive goes from the
302 next character to be retrieved from f until the first instance of CPP_EOL
303 or CPP_EOF.
304 */
305 void handle_directive(struct file_stack_e *f)
306 {
307 int c, i;
308 char kw[20];
309
310 again:
311 while ((c = fetch_byte(f)) == ' ' || c == '\t')
312 /* do nothing */ ;
313 if (c == '/')
314 {
315 // maybe a comment //
316 c = fetch_byte(f);
317 if (c == '/')
318 {
319 // line comment
320 skip_line(f);
321 return;
322 }
323 if (c == '*')
324 {
325 // block comment
326 while (1)
327 {
328 c = fetch_byte(f);
329 if (c == CPP_EOF)
330 return;
331 if (c == '*')
332 {
333 c = fetch_byte(f);
334 if (c == '/')
335 {
336 // end of comment - try again for directive
337 goto again;
338 }
339 if (c == CPP_EOF)
340 return;
341 }
342 }
343 }
344 }
345
346 // empty directive - do nothing
347 if (c == CPP_EOL)
348 return;
349
350 if (c < 'a' || c > 'z')
351 goto out;
352
353 i = 0;
354 do
355 {
356 kw[i++] = c;
357 if (i == sizeof(kw) - 1)
358 goto out; // keyword too long
359 c = fetch_byte(f);
360 } while ((c >= 'a' && c <= 'z') || (c == '_'));
361 kw[i++] = '\0';
362
363 /* we have a keyword here */
364 for (i = 0; directives[i].name; i++)
365 {
366 if (strcmp(directives[i].name, kw) == 0)
367 {
368 (*directives[i].fn)(f);
369 return;
370 }
371 }
372
373 /* if we fall through here, we have an unknown directive */
374 out:
375 do_error("invalid preprocessor directive");
376 skip_line(f);
377 }
378
379 /*
380 Notes:
381
382 Rather than tokenize the entire file, we run through it interpreting
383 things only as much as we need to in order to identify the following:
384
385 preprocessing directives (#...)
386 identifiers which might need to be replaced with macros
387
388 We have to interpret strings, character constants, and numbers to prevent
389 false positives in those situations.
390
391 When we find a preprocessing directive, it is handled with a more
392 aggressive tokenization process and then interpreted accordingly.
393
394 nlws is used to record the fact that only whitespace has occurred at the
395 start of a line. Whitespace is defined as comments or isspace(c). It gets
396 reset to 1 after each EOL character. If a non-whitespace character is
397 encountered, it is set to -1. If the character processing decides it really
398 is a whitespace character, it will set nlws back to 1 (block comment).
399 Elsewise, it will get set to 0 if it is still -1 when the loop starts again.
400
401 This is needed so we can identify whitespace interposed before a
402 preprocessor directive. This is the only case where it matters for
403 the preprocessor.
404
405 */
406 void preprocess_file(struct file_stack_e *f)
407 {
408 int c;
409 int nlws = 1;
410
411 while (1)
412 {
413 c = fetch_byte(f);
414 again:
415 if (nlws == -1)
416 nlws = 0;
417 if (c == CPP_EOF)
418 {
419 outchr('\n');
420 return;
421 }
422 if (c == CPP_EOL)
423 {
424 nlws = 1;
425 outchr('\n');
426 continue;
427 }
428
429 if (!is_whitespace(c))
430 nlws = -1;
431
432 if (is_sidchr(c))
433 {
434 // have identifier here - parse it off
435 char *ident = NULL;
436 int idlen = 0;
437
438 do
439 {
440 ident = lw_realloc(ident, idlen + 1);
441 ident[idlen++] = c;
442 ident[idlen] = '\0';
443 c = fetch_byte(f);
444 } while (is_idchr(c));
445
446 /* do something with the identifier here - macros, etc. */
447 outstr(ident);
448 lw_free(ident);
449
450 goto again;
451 }
452
453 switch (c)
454 {
455 default:
456 outchr(c);
457 break;
458
459 case '.': // a number - to prevent seeing an identifier in middle of number
460 outchr(c);
461 c = fetch_byte(f);
462 if (!is_dec(c))
463 goto again;
464 /* fall through */
465 case '0':
466 case '1':
467 case '2':
468 case '3':
469 case '4':
470 case '5':
471 case '6':
472 case '7':
473 case '8':
474 case '9':
475 do
476 {
477 outchr(c);
478 c = fetch_byte(f);
479 if (c == CPP_EOF)
480 return;
481 if (is_ep(c))
482 {
483 outchr(c);
484 c = fetch_byte(f);
485 if (c == '-' || c == '+')
486 {
487 outchr(c);
488 c = fetch_byte(f);
489 }
490 }
491 } while ((is_idchr(c)) || (c == '.'));
492 goto again;
493
494 case '#':
495 if (nlws)
496 {
497 handle_directive(f);
498 /* note: no need to reset nlws */
499 }
500 else
501 outchr('#');
502 break;
503
504 case '\'': // character constant
505 outchr('\'');
506 while ((c = fetch_byte(f)) != '\'')
507 {
508 if (c == '\\')
509 {
510 outchr('\\');
511 c = fetch_byte(f);
512 }
513 if (c == CPP_EOL)
514 {
515 do_warning("Unterminated character constant");
516 goto again;
517 }
518 if (c == CPP_EOF)
519 return;
520 outchr(c);
521 }
522 outchr(c);
523 break;
524
525 case '"': // strings
526 outchr(c);
527 while ((c = fetch_byte(f)) != '"')
528 {
529 if (c == '\\')
530 {
531 outchr('\\');
532 c = fetch_byte(f);
533 }
534 if (c == CPP_EOL)
535 {
536 do_warning("unterminated string literal");
537 goto again;
538 }
539 if (c == CPP_EOF)
540 return;
541 outchr(c);
542 }
543 outchr(c);
544 break;
545
546 case '/': // comments
547 c = fetch_byte(f);
548 if (c == '/')
549 {
550 // line comment
551 outchr(' ');
552 do
553 {
554 c = fetch_byte(f);
555 } while (c != CPP_EOF && c != CPP_EOL);
556 }
557 else if (c == '*')
558 {
559 // block comment
560 for (;;)
561 {
562 c = fetch_byte(f);
563 if (c == CPP_EOF)
564 {
565 break;
566 }
567 if (c == CPP_EOL)
568 {
569 continue;
570 }
571 if (c == '*')
572 {
573 // maybe end of comment
574 c = fetch_byte(f);
575 if (c == '/')
576 {
577 // end of comment
578 break;
579 }
580 }
581 }
582 // replace comment with a single space
583 outchr(' ');
584 if (nlws == -1)
585 nlws = 1;
586 continue;
587 }
588 else
589 {
590 // restore eaten '/'
591 outchr('/');
592 // process the character we just fetched
593 goto again;
594 }
595 } // switch
596 } // processing loop
597 }
598
599 int process_file(const char *f) 275 int process_file(const char *f)
600 { 276 {
601 struct file_stack_e *nf; 277 struct file_stack_e nf;
602 FILE *fp; 278 FILE *fp;
603 279
604 fprintf(stderr, "Processing %s\n", f); 280 fprintf(stderr, "Processing %s\n", f);
605 281
606 if (strcmp(f, "-") == 0) 282 if (strcmp(f, "-") == 0)
612 do_warning("Cannot open %s: %s", f, strerror(errno)); 288 do_warning("Cannot open %s: %s", f, strerror(errno));
613 return -1; 289 return -1;
614 } 290 }
615 291
616 /* push the file onto the file stack */ 292 /* push the file onto the file stack */
617 nf = lw_alloc(sizeof(struct file_stack_e)); 293 nf.fn = f;
618 nf -> fn = f; 294 nf.fp = fp;
619 nf -> fp = fp; 295 nf.next = file_stack;
620 nf -> next = file_stack; 296 nf.line = 1;
621 nf -> line = 1; 297 nf.col = 0;
622 nf -> col = 0; 298 nf.qseen = 0;
623 nf -> qseen = 0; 299 nf.ra = CPP_NOUNG;
624 nf -> ra = CPP_NOUNG; 300 nf.unget = CPP_NOUNG;
625 nf -> unget = CPP_NOUNG; 301 file_stack = &nf;
626 file_stack = nf; 302 nf.ungetbuf = NULL;
627 303 nf.ungetbufs = 0;
304 nf.ungetbufl = 0;
305
628 /* go preprocess the file */ 306 /* go preprocess the file */
629 preprocess_file(nf); 307 preprocess_file();
630 308
631 if (nf -> fp != stdin) 309 if (nf.fp != stdin)
632 fclose(nf -> fp); 310 fclose(nf.fp);
633 file_stack = nf -> next; 311 file_stack = nf.next;
634 lw_free(nf);
635 return 0; 312 return 0;
636 } 313 }