lexer.spl (8491B)
// Copyright 2023, Brian Swetland <swetland@frotz.net>
// Licensed under the Apache License, Version 2.0.

// Write the "<filename>:<line>: error: " prefix to fd 2, using
// ctx.filename and ctx.linenumber.
// Returns 2, the descriptor the prefix was written to (presumably so the
// caller can send the message body to the same fd -- confirm against
// however error(...) is expanded, which is not visible in this file).
fn error_begin() i32 {
	writes(2, "\n");
	writes(2, ctx.filename);
	writes(2, ":");
	writei(2, ctx.linenumber);
	writes(2, ": error: ");
	return 2;
}

// Finish an error message with a newline on fd 2 and exit with status 1.
fn error_end() {
	writes(2, "\n");
	os_exit(1);
}

// ================================================================
// lexical scanner

// Character classification table, indexed by byte value (0..255).
// Each entry is the token class the scanner starts from when it sees
// that byte: tSPC/tEOL for whitespace, tNUM for digits, tIDN for
// identifier characters, a specific token for punctuation, tMSC for
// printable characters that have no token yet, and tINV for everything
// else (control characters, DEL and all bytes >= 0x80).
//
// currently unrecognized: # $ ? \ `
var lextab [256]u8 = {
	tEOF, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tSPC, tEOL, tSPC, tINV, tSPC, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
//	      !      "     #     $     %         &     '
	tSPC, tBANG, tDQT, tMSC, tMSC, tPERCENT, tAMP, tSQT,
//	(        )        *      +      ,       -       .     /
	tOPAREN, tCPAREN, tSTAR, tPLUS, tCOMMA, tMINUS, tDOT, tSLASH,
//	0     1     2     3     4     5     6     7
	tNUM, tNUM, tNUM, tNUM, tNUM, tNUM, tNUM, tNUM,
//	8     9     :       ;      <    =        >    ?
	tNUM, tNUM, tCOLON, tSEMI, tLT, tASSIGN, tGT, tMSC,
//	@    A     B     C     D     E     F     G
	tAT, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
//	H     I     J     K     L     M     N     O
	tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
//	P     Q     R     S     T     U     V     W
	tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
//	X     Y     Z     [        \     ]        ^       _
	tIDN, tIDN, tIDN, tOBRACK, tMSC, tCBRACK, tCARET, tIDN,
//	`     a     b     c     d     e     f     g
	tMSC, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
//	h     i     j     k     l     m     n     o
	tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
//	p     q     r     s     t     u     v     w
	tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
//	x     y     z     {        |      }        ~
	tIDN, tIDN, tIDN, tOBRACE, tPIPE, tCBRACE, tNOT, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
};

// Convert one hexadecimal digit character to its value.
// Returns 0..15 for '0'..'9', 'a'..'f' and 'A'..'F', and -1 for any
// other character.
fn unhex(ch u32) i32 {
	if (ch >= '0') && (ch <= '9') {
		return ch - '0';
	}
	if (ch >= 'a') && (ch <= 'f') {
		return ch - 'a' + 10;
	}
	if (ch >= 'A') && (ch <= 'F') {
		return ch - 'A' + 10;
	}
	return -1;
}

// Read the next input character from ctx.fd_in into ctx.cc and return it.
// A negative result from readc() is stored as 0, which is the value the
// rest of the scanner tests for as end of input.
// ctx.byteoffset is advanced on every call, including calls where readc()
// returned a negative value.
fn scan() Token {
	ctx.byteoffset++;
	var ch i32 = readc(ctx.fd_in);
	if ch < 0 {
		ctx.cc = 0;
	} else {
		ctx.cc = ch;
	}
	return ctx.cc;
}

// Translate the character following a backslash into the value it
// denotes. Recognized escapes: \n \r \t \" \' \\ and \xHH (exactly two
// hex digits, which are read from the input with scan()).
// Anything else is reported with error().
fn unescape(n u32) u32 {
	if n == 'n' {
		return 10;
	} else if n == 'r' {
		return 13;
	} else if n == 't' {
		return 9;
	} else if n == '"' {
		return '"';
	} else if n == '\'' {
		return '\'';
	} else if n == '\\' {
		return '\\';
	} else if n == 'x' {
		// These must be signed: unhex() reports a bad digit as -1, and
		// an unsigned variable can never compare less than zero, which
		// would let invalid escapes such as "\xZZ" through unnoticed.
		var x0 i32 = unhex(scan());
		var x1 i32 = unhex(scan());
		if (x0 < 0) || (x1 < 0) {
			error("invalid hex escape");
		}
		return (x0 << 4) | x1;
	} else {
		error("invalid escape ", n);
		return 0;
	}
}

// Scan the body of a double-quoted string constant.
// nc is the first character after the opening quote; cc is not used.
// The decoded bytes are stored in ctx.tmp, followed by a 0 terminator.
// Reports an error on end of input before the closing quote, and when the
// decoded string reaches 255 bytes.
// Returns tSTR.
fn scan_string(cc u32, nc u32) Token {
	var n u32 = 0;
	while true {
		if nc == '"' {
			// step past the closing quote; scan() leaves the following
			// character in ctx.cc, where _next() picks it up
			nc = scan();
			break;
		} else if nc == 0 {
			error("unterminated string");
		} else if nc == '\\' {
			ctx.tmp[n] = unescape(scan());
		} else {
			ctx.tmp[n] = nc;
		}
		nc = scan();
		n++;
		if n == 255 {
			error("constant string too large");
		}
	}
	ctx.tmp[n] = 0;
	return tSTR;
}

// Classify the len-byte identifier held in ctx.tmp.
// Terminates ctx.tmp with 0, builds a String from it with string_make()
// and stores that in ctx.ident. The String is then compared against the
// keyword Strings in ctx.idn_*; only keywords of the same length are
// tried.
// Returns the keyword's token, or tIDN when no keyword matches.
fn scan_keyword(len u32) Token {
	ctx.tmp[len] = 0;
	var idn String = string_make(ctx.tmp, len);
	ctx.ident = idn;

	if len == 2 {
		if idn == ctx.idn_if { return tIF; }
		if idn == ctx.idn_fn { return tFN; }
	} else if len == 3 {
		if idn == ctx.idn_for { return tFOR; }
		if idn == ctx.idn_var { return tVAR; }
		if idn == ctx.idn_nil { return tNIL; }
		if idn == ctx.idn_new { return tNEW; }
	} else if len == 4 {
		if idn == ctx.idn_case { return tCASE; }
		if idn == ctx.idn_else { return tELSE; }
		if idn == ctx.idn_enum { return tENUM; }
		if idn == ctx.idn_true { return tTRUE; }
	} else if len == 5 {
		if idn == ctx.idn_break { return tBREAK; }
		if idn == ctx.idn_while { return tWHILE; }
		if idn == ctx.idn_false { return tFALSE; }
	} else if len == 6 {
		if idn == ctx.idn_switch { return tSWITCH; }
		if idn == ctx.idn_struct { return tSTRUCT; }
		if idn == ctx.idn_return { return tRETURN; }
	} else if len == 8 {
		if idn == ctx.idn_continue { return tCONTINUE; }
	}
	return tIDN;
}

// Scan a numeric constant whose first character is the digit cc, with nc
// as the character after it.
// Accepts binary ("0b..."), hexadecimal ("0x...") and decimal forms.
// The value is stored in ctx.num. A constant that does not fit in 32 bits
// is reported with error().
// Returns tNUM.
fn scan_number(cc u32, nc u32) Token {
	// n counts characters consumed so far; the binary and hex loops use
	// it to bound the number of digits
	var n u32 = 1;
	var val u32 = cc - '0';

	if (cc == '0') && (nc == 'b') { // binary
		nc = scan();
		while (nc == '0') || (nc == '1') {
			val = (val << 1) | (nc - '0');
			nc = scan();
			n++;
			// n started at 1 for the leading '0', so 34 means a 33rd
			// binary digit was consumed
			if (n == 34) {
				error("binary constant too large");
			}
		}
	} else if (cc == '0') && (nc == 'x') { // hex
		nc = scan();
		while true {
			var tmp i32 = unhex(nc);
			if tmp == -1 {
				break;
			}
			val = (val << 4) | tmp;
			nc = scan();
			n++;
			// likewise, 10 means a 9th hex digit was consumed
			if n == 10 {
				error("hex constant too large");
			}
		}
	} else { // decimal
		while lextab[nc] == tNUM {
			var digit u32 = nc - '0';
			// Check before multiplying: once (val * 10) + digit has
			// wrapped past 32 bits the result can still be larger than
			// val, so comparing afterwards misses most overflows.
			// 4294967295 = 429496729 * 10 + 5 is the largest u32 value.
			// Checking this way also lets leading zeros ("007") through
			// instead of reporting them as too large.
			if (val > 429496729) || ((val == 429496729) && (digit > 5)) {
				error("decimal constant too large");
			}
			val = (val * 10) + digit;
			nc = scan();
		}
	}
	ctx.num = val;
	return tNUM;
}

// Scan an identifier or keyword whose first character is cc, with nc as
// the character after it.
// Characters classified as tIDN or tNUM by lextab are collected into
// ctx.tmp; an identifier that reaches 32 characters is reported with
// error(). The collected text is then classified by scan_keyword().
fn scan_ident(cc u32, nc u32) Token {
	ctx.tmp[0] = cc;
	var n u32 = 1;

	while true {
		var tok Token = lextab[nc];
		if (tok == tIDN) || (tok == tNUM) {
			ctx.tmp[n] = nc;
			n++;
			if (n == 32) { error("identifier too large"); }
			nc = scan();
		} else {
			break;
		}
	}
	return scan_keyword(n);
}

// Scan and return the next token from the input.
// On entry ctx.cc holds the first unconsumed character. Every scan() call
// below refreshes ctx.cc, so on return it holds the first character after
// the token. Whitespace, newlines and "//" comments are skipped; each
// newline bumps ctx.linenumber and records ctx.lineoffset.
// Token payloads are left in ctx.num (numbers and character constants),
// ctx.tmp (strings and identifiers) and ctx.ident (identifiers).
fn _next() Token {
	var nc u8 = ctx.cc;
	while true {
		// cc is the character being classified, nc is one character
		// of lookahead
		var cc u8 = nc;
		nc = scan();
		var tok Token = lextab[cc];
		if tok == tNUM { // 0..9
			return scan_number(cc, nc);
		} else if tok == tIDN { // _ A..Z a..z
			return scan_ident(cc, nc);
		} else if tok == tDQT { // "
			return scan_string(cc, nc);
		} else if tok == tSQT { // '
			// character constant: yields tNUM with the character's
			// value in ctx.num
			ctx.num = nc;
			if nc == '\\' {
				ctx.num = unescape(scan());
			}
			nc = scan();
			if nc != '\'' {
				error("unterminated character constant");
			}
			nc = scan();
			return tNUM;
		} else if tok == tPLUS {
			if nc == '+' { tok = tINC; nc = scan(); }
		} else if tok == tMINUS {
			if nc == '-' { tok = tDEC; nc = scan(); }
		} else if tok == tAMP {
			if nc == '&' { tok = tAND; nc = scan(); }
		} else if tok == tPIPE {
			if nc == '|' { tok = tOR; nc = scan(); }
		} else if tok == tGT {
			if nc == '=' { tok = tGE; nc = scan(); }
			else if nc == '>' { tok = tRIGHT; nc = scan(); }
		} else if tok == tLT {
			if nc == '=' { tok = tLE; nc = scan(); }
			else if nc == '<' { tok = tLEFT; nc = scan(); }
		} else if tok == tASSIGN {
			if nc == '=' { tok = tEQ; nc = scan(); }
		} else if tok == tBANG {
			if nc == '=' { tok = tNE; nc = scan(); }
		} else if tok == tSLASH {
			if nc == '/' {
				// comment -- consume until EOL or EOF
				// (the EOL or EOF itself is left in nc, so the next
				// trip around the loop handles it normally)
				while (nc != '\n') && (nc != 0) {
					nc = scan();
				}
				continue;
			}
		} else if tok == tEOL {
			ctx.linenumber++;
			ctx.lineoffset = ctx.byteoffset;
			//if ctx.flags & cfVisibleEOL {
			//	return tEOL;
			//}
			continue;
		} else if tok == tSPC {
			continue;
		} else if (tok == tMSC) || (tok == tINV) {
			error("unknown character ", @u32 cc);
		}

		// if we're an AddOp or MulOp, followed by an '='
		// (this relies on the numeric layout of the token values, which
		// are defined outside this file)
		if ((tok & 0xF0) == 0x10) && (nc == '=') {
			nc = scan();
			// transform us to a XEQ operation
			tok = tok + 0x10;
		}

		return tok;
	}
}

// Write the 0-terminated string s to fd, wrapped in double quotes.
// Bytes below ' ' or above '~' are written with writex(); '"' and '\'
// are preceded by a backslash; all other bytes are written unchanged.
fn printstr(fd i32, s str) {
	var n u32 = 0;
	writec(fd, '"');
	while true {
		var ch u32 = s[n];
		if ch == 0 {
			break;
		} else if (ch < ' ') || (ch > '~') {
			writex(fd, ch);
		} else if (ch == '"') || (ch == '\\') {
			writec(fd, '\\');
			writec(fd, ch);
		} else {
			writec(fd, ch);
		}
		n++;
	}
	writec(fd, '"');
}

// Write a readable form of the current token (ctx.tok) to fd, followed
// by a space: '#' plus ctx.num via writex() for numbers, '@' plus the
// text in ctx.tmp for identifiers, a newline for tEOL, the quoted
// contents of ctx.tmp for strings, and tnames[ctx.tok] for anything else.
fn token_print(fd i32) {
	if ctx.tok == tNUM {
		writec(fd, '#');
		writex(fd, ctx.num);
	} else if ctx.tok == tIDN {
		writec(fd, '@');
		writes(fd, ctx.tmp);
	} else if ctx.tok == tEOL {
		writec(fd, '\n');
	} else if ctx.tok == tSTR {
		printstr(fd, ctx.tmp);
	} else {
		writes(fd, tnames[ctx.tok]);
	}
	writec(fd, ' ');
}

// Advance to the next token: store it in ctx.tok and return it.
fn next() Token {
	ctx.tok = _next();
	return ctx.tok;
}