25
|
1 /*
|
|
2 lexer.c
|
|
3
|
|
4 Copyright © 2011 William Astle
|
|
5
|
|
6 This file is part of LWTOOLS.
|
|
7
|
|
8 LWTOOLS is free software: you can redistribute it and/or modify it under the
|
|
9 terms of the GNU General Public License as published by the Free Software
|
|
10 Foundation, either version 3 of the License, or (at your option) any later
|
|
11 version.
|
|
12
|
|
13 This program is distributed in the hope that it will be useful, but WITHOUT
|
|
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
|
16 more details.
|
|
17
|
|
18 You should have received a copy of the GNU General Public License along with
|
|
19 this program. If not, see <http://www.gnu.org/licenses/>.
|
|
20 */
|
|
21
|
|
22 /*
|
|
23 This handles the gritty details of parsing tokens
|
|
24 */
|
|
25
|
|
26 #include <stdlib.h>
|
|
27 #include <stdio.h>
|
|
28 #include <string.h>
|
|
29
|
|
30 #include <lw_alloc.h>
|
|
31 #include <lw_string.h>
|
|
32
|
|
33 #define __lexer_c_seen__
|
|
34 #include "lwbasic.h"
|
|
35
|
|
/*
A token identifier is stored in state->lexer_token by lexer(). The actual
string value is found in state->lexer_token_string; if the token has an
integer value, it will be found in state->lexer_token_number in the
appropriate "value" slot.
*/
|
|
42
|
|
/*
One entry of a keyword lookup table: the keyword spelling and the token
code that is reported when a word matches it. A table is terminated by an
entry whose string is NULL.
*/
struct token_list
{
	const char *string;	/* keyword text; points at a string literal, never written through */
	int token;		/* token code stored in state->lexer_token on a match */
};
|
|
48
|
|
49 static struct token_list lexer_global_tokens[] =
|
|
50 {
|
|
51 { "function", token_kw_function },
|
|
52 { "sub", token_kw_sub },
|
|
53 { "public", token_kw_public },
|
|
54 { "private", token_kw_private },
|
|
55 { "as", token_kw_as },
|
|
56 { "params", token_kw_params },
|
|
57 { "returns", token_kw_returns },
|
|
58 { NULL }
|
|
59 };
|
|
60
|
|
61 static int lexer_getchar(cstate *state)
|
|
62 {
|
|
63 int c;
|
|
64 c = input_getchar(state);
|
|
65 if (c == -2)
|
|
66 {
|
|
67 lwb_error("Error reading input stream.");
|
|
68 }
|
|
69 return c;
|
|
70 }
|
|
71
|
|
72 static void lexer_nextchar(cstate *state)
|
|
73 {
|
|
74 state -> lexer_curchar = lexer_getchar(state);
|
|
75 if (state -> lexer_curchar == state -> lexer_ignorechar)
|
|
76 state -> lexer_curchar = lexer_getchar(state);
|
|
77 state -> lexer_ignorechar = 0;
|
|
78 }
|
|
79
|
|
80 static int lexer_curchar(cstate *state)
|
|
81 {
|
|
82 if (state -> lexer_curchar == -1)
|
|
83 {
|
|
84 lexer_nextchar(state);
|
|
85 }
|
|
86
|
|
87 return state -> lexer_curchar;
|
|
88 }
|
|
89
|
|
90 static void lexer_skip_white(cstate *state)
|
|
91 {
|
|
92 int c;
|
|
93
|
|
94 for (;;)
|
|
95 {
|
|
96 c = lexer_curchar(state);
|
|
97 if (!(c == 0 || c == ' ' || c == '\t'))
|
|
98 return;
|
|
99 lexer_nextchar(state);
|
|
100 }
|
|
101 }
|
|
102
|
|
103 /* must not be called unless the word will be non-zero length */
|
|
104 static void lexer_word(cstate *state)
|
|
105 {
|
|
106 int wordlen = 0;
|
|
107 int wordpos = 0;
|
|
108 char *word = NULL;
|
|
109 int c;
|
|
110 struct token_list *tok = NULL;
|
|
111
|
|
112 for (;;) {
|
|
113 c = lexer_curchar(state);
|
|
114 if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80)
|
|
115 {
|
|
116 /* character is part of word */
|
|
117 if (wordpos >= wordlen)
|
|
118 {
|
|
119 word = lw_realloc(word, wordlen + 32);
|
|
120 wordlen += 32;
|
|
121 }
|
|
122 word[wordpos++] = c;
|
|
123 }
|
|
124 else
|
|
125 break;
|
|
126
|
|
127 lexer_nextchar(state);
|
|
128 }
|
|
129
|
|
130 word[wordpos] = 0;
|
|
131 lw_free(state -> lexer_token_string);
|
|
132 state -> lexer_token_string = lw_strdup(word);
|
|
133
|
|
134 switch (state -> parser_state)
|
|
135 {
|
|
136 default:
|
|
137 tok = lexer_global_tokens;
|
|
138 }
|
|
139
|
|
140 /* check for tokens if appropriate */
|
|
141 /* force uppercase */
|
|
142 if (tok)
|
|
143 {
|
|
144 for (c = 0; word[c]; c++)
|
|
145 if (word[c] >= 'A' && word[c] <= 'Z')
|
|
146 word[c] = word[c] + 0x20;
|
|
147
|
|
148 while (tok -> string)
|
|
149 {
|
|
150 if (strcmp(tok -> string, word) == 0)
|
|
151 break;
|
|
152 tok++;
|
|
153 }
|
|
154 }
|
|
155
|
|
156 lw_free(word);
|
|
157 if (tok && tok -> string)
|
|
158 state -> lexer_token = tok -> token;
|
|
159 else
|
|
160 state -> lexer_token = token_identifier;
|
|
161 }
|
|
162
|
|
163 static void lexer_empty_token(cstate *state)
|
|
164 {
|
|
165 lw_free(state -> lexer_token_string);
|
|
166 state -> lexer_token_string = NULL;
|
|
167 }
|
|
168
|
|
169 void lexer(cstate *state)
|
|
170 {
|
|
171 int c;
|
|
172
|
|
173 lexer_skip_white(state);
|
|
174
|
|
175 lexer_empty_token(state);
|
|
176
|
|
177 c = lexer_curchar(state);
|
|
178 if (c == -1)
|
|
179 {
|
|
180 state -> lexer_token = token_eof;
|
|
181 return;
|
|
182 }
|
|
183
|
|
184 if (c == '\n')
|
|
185 {
|
|
186 /* LF */
|
|
187 lexer_nextchar(state);
|
|
188 state -> lexer_ignorechar = '\r';
|
|
189 state -> lexer_token = token_eol;
|
|
190 return;
|
|
191 }
|
|
192
|
|
193 if (c == '\r')
|
|
194 {
|
|
195 /* CR */
|
|
196 lexer_nextchar(state);
|
|
197 state -> lexer_ignorechar = '\n';
|
|
198 state -> lexer_token = token_eol;
|
|
199 return;
|
|
200 }
|
|
201
|
|
202 if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80)
|
|
203 {
|
|
204 /* we have a word here; identifier, keyword, etc. */
|
|
205 lexer_word(state);
|
|
206 return;
|
|
207 }
|
|
208
|
|
209 /* return the character if all else fails */
|
|
210 state -> lexer_token_string = lw_realloc(state -> lexer_token_string, 2);
|
|
211 state -> lexer_token_string[0] = c;
|
|
212 state -> lexer_token_string[1] = 0;
|
|
213 lexer_nextchar(state);
|
|
214 state -> lexer_token = token_char;
|
|
215 return;
|
|
216 }
|