Mercurial > hg > index.cgi
annotate lwbasic/lexer.c @ 30:bcd532a90e53
Renamed "compiler" to "parser" for more consistent terminology
author | lost@l-w.ca |
---|---|
date | Thu, 03 Feb 2011 21:19:11 -0700 |
parents | 26aa76da75ad |
children | 574931d87abd |
rev | line source |
---|---|
25 | 1 /* |
2 lexer.c | |
3 | |
4 Copyright © 2011 William Astle | |
5 | |
6 This file is part of LWTOOLS. | |
7 | |
8 LWTOOLS is free software: you can redistribute it and/or modify it under the | |
9 terms of the GNU General Public License as published by the Free Software | |
10 Foundation, either version 3 of the License, or (at your option) any later | |
11 version. | |
12 | |
13 This program is distributed in the hope that it will be useful, but WITHOUT | |
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | |
16 more details. | |
17 | |
18 You should have received a copy of the GNU General Public License along with | |
19 this program. If not, see <http://www.gnu.org/licenses/>. | |
20 */ | |
21 | |
22 /* | |
23 This handles the gritty details of parsing tokens | |
24 */ | |
25 | |
26 #include <stdlib.h> | |
27 #include <stdio.h> | |
28 #include <string.h> | |
29 | |
30 #include <lw_alloc.h> | |
31 #include <lw_string.h> | |
32 | |
33 #define __lexer_c_seen__ | |
34 #include "lwbasic.h" | |
35 | |
36 /* | |
37 A token identifier is stored in state->lexer_token by lexer(). The actual | |
38 string value is found in state->lexer_token_string; if the token has an | |
39 integer value, it will be found in state->lexer_token_number in the | |
40 appropriate "value" slot. | |
41 */ | |
42 | |
/*
One entry of a keyword lookup table: the keyword spelling and the token
code that the lexer reports when a word matches it. A table is terminated
by an entry whose string is NULL.
*/
struct token_list
{
	const char *string;	/* keyword text; points at a string literal, never written */
	int token;		/* token code reported for this keyword */
};
48 | |
49 static struct token_list lexer_global_tokens[] = | |
50 { | |
51 { "function", token_kw_function }, | |
52 { "sub", token_kw_sub }, | |
53 { "public", token_kw_public }, | |
54 { "private", token_kw_private }, | |
55 { "as", token_kw_as }, | |
56 { "params", token_kw_params }, | |
57 { "returns", token_kw_returns }, | |
26
26aa76da75ad
Additional parsing in function/sub; emission of prolog/epilog code
lost@l-w.ca
parents:
25
diff
changeset
|
58 { "integer", token_kw_integer }, |
26aa76da75ad
Additional parsing in function/sub; emission of prolog/epilog code
lost@l-w.ca
parents:
25
diff
changeset
|
59 { "endsub", token_kw_endsub }, |
26aa76da75ad
Additional parsing in function/sub; emission of prolog/epilog code
lost@l-w.ca
parents:
25
diff
changeset
|
60 { "endfunction", token_kw_endfunction }, |
25 | 61 { NULL } |
62 }; | |
63 | |
64 static int lexer_getchar(cstate *state) | |
65 { | |
66 int c; | |
67 c = input_getchar(state); | |
68 if (c == -2) | |
69 { | |
70 lwb_error("Error reading input stream."); | |
71 } | |
72 return c; | |
73 } | |
74 | |
75 static void lexer_nextchar(cstate *state) | |
76 { | |
77 state -> lexer_curchar = lexer_getchar(state); | |
78 if (state -> lexer_curchar == state -> lexer_ignorechar) | |
79 state -> lexer_curchar = lexer_getchar(state); | |
80 state -> lexer_ignorechar = 0; | |
81 } | |
82 | |
83 static int lexer_curchar(cstate *state) | |
84 { | |
85 if (state -> lexer_curchar == -1) | |
86 { | |
87 lexer_nextchar(state); | |
88 } | |
89 | |
90 return state -> lexer_curchar; | |
91 } | |
92 | |
93 static void lexer_skip_white(cstate *state) | |
94 { | |
95 int c; | |
96 | |
97 for (;;) | |
98 { | |
99 c = lexer_curchar(state); | |
100 if (!(c == 0 || c == ' ' || c == '\t')) | |
101 return; | |
102 lexer_nextchar(state); | |
103 } | |
104 } | |
105 | |
106 /* must not be called unless the word will be non-zero length */ | |
107 static void lexer_word(cstate *state) | |
108 { | |
109 int wordlen = 0; | |
110 int wordpos = 0; | |
111 char *word = NULL; | |
112 int c; | |
113 struct token_list *tok = NULL; | |
114 | |
115 for (;;) { | |
116 c = lexer_curchar(state); | |
26
26aa76da75ad
Additional parsing in function/sub; emission of prolog/epilog code
lost@l-w.ca
parents:
25
diff
changeset
|
117 if (c == '_' || (c >= '0' && c <= '9' ) || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80) |
25 | 118 { |
119 /* character is part of word */ | |
120 if (wordpos >= wordlen) | |
121 { | |
122 word = lw_realloc(word, wordlen + 32); | |
123 wordlen += 32; | |
124 } | |
125 word[wordpos++] = c; | |
126 } | |
127 else | |
128 break; | |
129 | |
130 lexer_nextchar(state); | |
131 } | |
132 | |
133 word[wordpos] = 0; | |
134 lw_free(state -> lexer_token_string); | |
135 state -> lexer_token_string = lw_strdup(word); | |
136 | |
137 switch (state -> parser_state) | |
138 { | |
139 default: | |
140 tok = lexer_global_tokens; | |
141 } | |
142 | |
143 /* check for tokens if appropriate */ | |
144 /* force uppercase */ | |
145 if (tok) | |
146 { | |
147 for (c = 0; word[c]; c++) | |
148 if (word[c] >= 'A' && word[c] <= 'Z') | |
149 word[c] = word[c] + 0x20; | |
150 | |
151 while (tok -> string) | |
152 { | |
153 if (strcmp(tok -> string, word) == 0) | |
154 break; | |
155 tok++; | |
156 } | |
157 } | |
158 | |
159 lw_free(word); | |
160 if (tok && tok -> string) | |
161 state -> lexer_token = tok -> token; | |
162 else | |
163 state -> lexer_token = token_identifier; | |
164 } | |
165 | |
166 static void lexer_empty_token(cstate *state) | |
167 { | |
168 lw_free(state -> lexer_token_string); | |
169 state -> lexer_token_string = NULL; | |
170 } | |
171 | |
172 void lexer(cstate *state) | |
173 { | |
174 int c; | |
175 | |
176 lexer_skip_white(state); | |
177 | |
178 lexer_empty_token(state); | |
179 | |
180 c = lexer_curchar(state); | |
181 if (c == -1) | |
182 { | |
183 state -> lexer_token = token_eof; | |
184 return; | |
185 } | |
186 | |
187 if (c == '\n') | |
188 { | |
189 /* LF */ | |
190 lexer_nextchar(state); | |
191 state -> lexer_ignorechar = '\r'; | |
192 state -> lexer_token = token_eol; | |
193 return; | |
194 } | |
195 | |
196 if (c == '\r') | |
197 { | |
198 /* CR */ | |
199 lexer_nextchar(state); | |
200 state -> lexer_ignorechar = '\n'; | |
201 state -> lexer_token = token_eol; | |
202 return; | |
203 } | |
204 | |
205 if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80) | |
206 { | |
207 /* we have a word here; identifier, keyword, etc. */ | |
208 lexer_word(state); | |
209 return; | |
210 } | |
211 | |
212 /* return the character if all else fails */ | |
213 state -> lexer_token_string = lw_realloc(state -> lexer_token_string, 2); | |
214 state -> lexer_token_string[0] = c; | |
215 state -> lexer_token_string[1] = 0; | |
216 lexer_nextchar(state); | |
217 state -> lexer_token = token_char; | |
218 return; | |
219 } |