Mercurial > hg > index.cgi
comparison lwbasic/lexer.c @ 25:87590f43e76d
Started lwbasic parser; checkpoint
author | lost@l-w.ca |
---|---|
date | Mon, 24 Jan 2011 20:08:09 -0700 |
parents | |
children | 26aa76da75ad |
comparison
equal
deleted
inserted
replaced
24:421d7ceb4d86 | 25:87590f43e76d |
---|---|
1 /* | |
2 lexer.c | |
3 | |
4 Copyright © 2011 William Astle | |
5 | |
6 This file is part of LWTOOLS. | |
7 | |
8 LWTOOLS is free software: you can redistribute it and/or modify it under the | |
9 terms of the GNU General Public License as published by the Free Software | |
10 Foundation, either version 3 of the License, or (at your option) any later | |
11 version. | |
12 | |
13 This program is distributed in the hope that it will be useful, but WITHOUT | |
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | |
16 more details. | |
17 | |
18 You should have received a copy of the GNU General Public License along with | |
19 this program. If not, see <http://www.gnu.org/licenses/>. | |
20 */ | |
21 | |
22 /* | |
23 This handles the gritty details of parsing tokens | |
24 */ | |
25 | |
26 #include <stdlib.h> | |
27 #include <stdio.h> | |
28 #include <string.h> | |
29 | |
30 #include <lw_alloc.h> | |
31 #include <lw_string.h> | |
32 | |
33 #define __lexer_c_seen__ | |
34 #include "lwbasic.h" | |
35 | |
36 /* | |
37 A token identifier is stored in state->lexer_token by lexer(). The actual string value | |
38 is found in state->lexer_token_string; if the token has an integer value, | |
39 it will be found in state->lexer_token_number in the appropriate "value" | |
40 slot. | |
41 */ | |
42 | |
/*
One entry of a keyword table: the keyword's spelling and the token value
that stands for it. The spelling is const because the tables in this file
are filled with string literals, which must never be written through.
*/
struct token_list
{
	const char *string;
	int token;
};
48 | |
49 static struct token_list lexer_global_tokens[] = | |
50 { | |
51 { "function", token_kw_function }, | |
52 { "sub", token_kw_sub }, | |
53 { "public", token_kw_public }, | |
54 { "private", token_kw_private }, | |
55 { "as", token_kw_as }, | |
56 { "params", token_kw_params }, | |
57 { "returns", token_kw_returns }, | |
58 { NULL } | |
59 }; | |
60 | |
61 static int lexer_getchar(cstate *state) | |
62 { | |
63 int c; | |
64 c = input_getchar(state); | |
65 if (c == -2) | |
66 { | |
67 lwb_error("Error reading input stream."); | |
68 } | |
69 return c; | |
70 } | |
71 | |
72 static void lexer_nextchar(cstate *state) | |
73 { | |
74 state -> lexer_curchar = lexer_getchar(state); | |
75 if (state -> lexer_curchar == state -> lexer_ignorechar) | |
76 state -> lexer_curchar = lexer_getchar(state); | |
77 state -> lexer_ignorechar = 0; | |
78 } | |
79 | |
80 static int lexer_curchar(cstate *state) | |
81 { | |
82 if (state -> lexer_curchar == -1) | |
83 { | |
84 lexer_nextchar(state); | |
85 } | |
86 | |
87 return state -> lexer_curchar; | |
88 } | |
89 | |
90 static void lexer_skip_white(cstate *state) | |
91 { | |
92 int c; | |
93 | |
94 for (;;) | |
95 { | |
96 c = lexer_curchar(state); | |
97 if (!(c == 0 || c == ' ' || c == '\t')) | |
98 return; | |
99 lexer_nextchar(state); | |
100 } | |
101 } | |
102 | |
103 /* must not be called unless the word will be non-zero length */ | |
104 static void lexer_word(cstate *state) | |
105 { | |
106 int wordlen = 0; | |
107 int wordpos = 0; | |
108 char *word = NULL; | |
109 int c; | |
110 struct token_list *tok = NULL; | |
111 | |
112 for (;;) { | |
113 c = lexer_curchar(state); | |
114 if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80) | |
115 { | |
116 /* character is part of word */ | |
117 if (wordpos >= wordlen) | |
118 { | |
119 word = lw_realloc(word, wordlen + 32); | |
120 wordlen += 32; | |
121 } | |
122 word[wordpos++] = c; | |
123 } | |
124 else | |
125 break; | |
126 | |
127 lexer_nextchar(state); | |
128 } | |
129 | |
130 word[wordpos] = 0; | |
131 lw_free(state -> lexer_token_string); | |
132 state -> lexer_token_string = lw_strdup(word); | |
133 | |
134 switch (state -> parser_state) | |
135 { | |
136 default: | |
137 tok = lexer_global_tokens; | |
138 } | |
139 | |
140 /* check for tokens if appropriate */ | |
141 /* force uppercase */ | |
142 if (tok) | |
143 { | |
144 for (c = 0; word[c]; c++) | |
145 if (word[c] >= 'A' && word[c] <= 'Z') | |
146 word[c] = word[c] + 0x20; | |
147 | |
148 while (tok -> string) | |
149 { | |
150 if (strcmp(tok -> string, word) == 0) | |
151 break; | |
152 tok++; | |
153 } | |
154 } | |
155 | |
156 lw_free(word); | |
157 if (tok && tok -> string) | |
158 state -> lexer_token = tok -> token; | |
159 else | |
160 state -> lexer_token = token_identifier; | |
161 } | |
162 | |
163 static void lexer_empty_token(cstate *state) | |
164 { | |
165 lw_free(state -> lexer_token_string); | |
166 state -> lexer_token_string = NULL; | |
167 } | |
168 | |
169 void lexer(cstate *state) | |
170 { | |
171 int c; | |
172 | |
173 lexer_skip_white(state); | |
174 | |
175 lexer_empty_token(state); | |
176 | |
177 c = lexer_curchar(state); | |
178 if (c == -1) | |
179 { | |
180 state -> lexer_token = token_eof; | |
181 return; | |
182 } | |
183 | |
184 if (c == '\n') | |
185 { | |
186 /* LF */ | |
187 lexer_nextchar(state); | |
188 state -> lexer_ignorechar = '\r'; | |
189 state -> lexer_token = token_eol; | |
190 return; | |
191 } | |
192 | |
193 if (c == '\r') | |
194 { | |
195 /* CR */ | |
196 lexer_nextchar(state); | |
197 state -> lexer_ignorechar = '\n'; | |
198 state -> lexer_token = token_eol; | |
199 return; | |
200 } | |
201 | |
202 if (c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c >= 0x80) | |
203 { | |
204 /* we have a word here; identifier, keyword, etc. */ | |
205 lexer_word(state); | |
206 return; | |
207 } | |
208 | |
209 /* return the character if all else fails */ | |
210 state -> lexer_token_string = lw_realloc(state -> lexer_token_string, 2); | |
211 state -> lexer_token_string[0] = c; | |
212 state -> lexer_token_string[1] = 0; | |
213 lexer_nextchar(state); | |
214 state -> lexer_token = token_char; | |
215 return; | |
216 } |