spl

systems programming language
git clone http://frotz.net/git/spl.git
Log | Files | Refs | README | LICENSE

lexer.spl (8491B)


      1 // Copyright 2023, Brian Swetland <swetland@frotz.net>
      2 // Licensed under the Apache License, Version 2.0.
      3 
      4 fn error_begin() i32 {
      5 	writes(2, "\n");
      6 	writes(2, ctx.filename);
      7 	writes(2, ":");
      8 	writei(2, ctx.linenumber);
      9 	writes(2, ": error: ");
     10 	return 2;
     11 }
     12 
     13 fn error_end() {
     14 	writes(2, "\n");
     15 	os_exit(1);
     16 }
     17 
     18 // ================================================================
     19 // lexical scanner
     20 
// Character classification table, indexed by byte value (0..255).
// Each entry is the token class the scanner assigns to a token that
// starts with that byte:
//   tEOF  byte 0 (scan() stores 0 in ctx.cc when readc() returns < 0)
//   tSPC  whitespace that _next() skips (tab, VT, CR, space)
//   tEOL  line feed; _next() bumps ctx.linenumber on it
//   tNUM  decimal digits, tIDN letters and '_'
//   tMSC  printable but currently unrecognized: # $ ? \ `
//   tINV  remaining control bytes, DEL, and all bytes >= 128
// _next() reports both tMSC and tINV as "unknown character".
var lextab [256]u8 = {
	tEOF, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tSPC, tEOL, tSPC, tINV, tSPC, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
//	space !     "     #     $     %     &     '
	tSPC, tBANG, tDQT, tMSC, tMSC, tPERCENT, tAMP, tSQT,
//	(     )     *     +     ,     -     .     /
	tOPAREN, tCPAREN, tSTAR, tPLUS, tCOMMA, tMINUS, tDOT, tSLASH,
//	0     1     2     3     4     5     6     7
	tNUM, tNUM, tNUM, tNUM, tNUM, tNUM, tNUM, tNUM,
//	8     9     :     ;     <     =     >     ?
	tNUM, tNUM, tCOLON, tSEMI, tLT, tASSIGN, tGT, tMSC,
//	@     A     B     C     D     E     F     G
	tAT,  tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
//	H     I     J     K     L     M     N     O
	tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
//	P     Q     R     S     T     U     V     W
	tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
//	X     Y     Z     [     \     ]     ^     _
	tIDN, tIDN, tIDN, tOBRACK, tMSC, tCBRACK, tCARET, tIDN,
//	`     a     b     c     d     e     f     g
	tMSC, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
//	h     i     j     k     l     m     n     o
	tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
//	p     q     r     s     t     u     v     w
	tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
//	x     y     z     {     |     }     ~     DEL
	tIDN, tIDN, tIDN, tOBRACE, tPIPE, tCBRACE, tNOT, tINV,
//	bytes 128..255: never valid as the start of a token
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
};
     68 
     69 fn unhex(ch u32) i32 {
     70 	if (ch >= '0') && (ch <= '9') {
     71 		return ch - '0';
     72 	}
     73 	if (ch >= 'a') && (ch <= 'f') {
     74 		return ch - 'a' + 10;
     75 	}
     76 	if (ch >= 'A') && (ch <= 'F') {
     77 		return ch - 'A' + 10;
     78 	}
     79 	return -1;
     80 }
     81 
     82 fn scan() Token {
     83 	ctx.byteoffset++;
     84 	var ch i32 = readc(ctx.fd_in);
     85 	if ch < 0 {
     86 		ctx.cc = 0;
     87 	} else {
     88 		ctx.cc = ch;
     89 	}
     90 	return ctx.cc;
     91 }
     92 
     93 fn unescape(n u32) u32 {
     94 	if n == 'n' {
     95 		return 10;
     96 	} else if n == 'r' {
     97 		return 13;
     98 	} else if n == 't' {
     99 		return 9;
    100 	} else if n == '"' {
    101 		return '"';
    102 	} else if n == '\'' {
    103 		return '\'';
    104 	} else if n == '\\' {
    105 		return '\\';
    106 	} else if n == 'x' {
    107 		var x0 u32 = unhex(scan());
    108 		var x1 u32 = unhex(scan());
    109 		if (x0 < 0) || (x1 < 0) {
    110 			error("invalid hex escape");
    111 		}
    112 		return (x0 << 4) | x1;
    113 	} else {
    114 		error("invalid escape ", n);
    115 		return 0;
    116 	}
    117 }
    118 
    119 fn scan_string(cc u32, nc u32) Token {
    120 	var n u32 = 0;
    121 	while true {
    122 		if nc == '"' {
    123 			nc = scan();
    124 			break;
    125 		} else if nc == 0 {
    126 			error("unterminated string");
    127 		} else if nc == '\\' {
    128 			ctx.tmp[n] = unescape(scan());
    129 		} else {
    130 			ctx.tmp[n] = nc;
    131 		}
    132 		nc = scan();
    133 		n++;
    134 		if n == 255 {
    135 			error("constant string too large");
    136 		}
    137 	}
    138 	ctx.tmp[n] = 0;
    139 	return tSTR;
    140 }
    141 
    142 fn scan_keyword(len u32) Token {
    143 	ctx.tmp[len] = 0;
    144 	var idn String = string_make(ctx.tmp, len);
    145 	ctx.ident = idn;
    146 
    147 	if len == 2 {
    148 		if idn == ctx.idn_if { return tIF; };
    149 		if idn == ctx.idn_fn { return tFN; }
    150 	} else if len == 3 {
    151 		if idn == ctx.idn_for { return tFOR; }
    152 		if idn == ctx.idn_var { return tVAR; }
    153 		if idn == ctx.idn_nil { return tNIL; }
    154 		if idn == ctx.idn_new { return tNEW; }
    155 	} else if len == 4 {
    156 		if idn == ctx.idn_case { return tCASE; }
    157 		if idn == ctx.idn_else { return tELSE; }
    158 		if idn == ctx.idn_enum { return tENUM; }
    159 		if idn == ctx.idn_true { return tTRUE; }
    160 	} else if len == 5 {
    161 		if idn == ctx.idn_break { return tBREAK; }
    162 		if idn == ctx.idn_while { return tWHILE; }
    163 		if idn == ctx.idn_false { return tFALSE; }
    164 	} else if len == 6 {
    165 		if idn == ctx.idn_switch { return tSWITCH; }
    166 		if idn == ctx.idn_struct { return tSTRUCT; }
    167 		if idn == ctx.idn_return { return tRETURN; }
    168 	} else if len == 8 {
    169 		if idn == ctx.idn_continue { return tCONTINUE; }
    170 	}
    171 	return tIDN;
    172 }
    173 
    174 fn scan_number(cc u32, nc u32) Token {
    175 	var n u32 = 1;
    176 	var val u32 = cc - '0';
    177 
    178 	if (cc == '0') && (nc == 'b') { // binary
    179 		nc = scan();
    180 		while (nc == '0') || (nc == '1') {
    181 			val = (val << 1) | (nc - '0');
    182 			nc = scan();
    183 			n++;
    184 			if (n == 34) {
    185 				error("binary constant too large");
    186 			}
    187 		}
    188 	} else if (cc == '0') && (nc == 'x') { // hex
    189 		nc = scan();
    190 		while true {
    191 			var tmp i32 = unhex(nc);
    192 			if tmp == -1 {
    193 				break;
    194 			}
    195 			val = (val << 4) | tmp;
    196 			nc = scan();
    197 			n++;
    198 			if n == 10 {
    199 				error("hex constant too large");
    200 			}
    201 		}
    202 	} else { // decimal
    203 		while lextab[nc] == tNUM {
    204 			var tmp u32 = (val * 10) + (nc - '0');
    205 			if tmp <= val {
    206 				error("decimal constant too large");
    207 			}
    208 			val = tmp;
    209 			nc = scan();
    210 			n++;
    211 		}
    212 	}
    213 	ctx.num = val;
    214 	return tNUM;
    215 }
    216 
    217 fn scan_ident(cc u32, nc u32) Token {
    218 	ctx.tmp[0] = cc;
    219 	var n u32 = 1;
    220 
    221 	while true {
    222 		var tok Token = lextab[nc];
    223 		if (tok == tIDN) || (tok == tNUM) {
    224 			ctx.tmp[n] = nc;
    225 			n++;
    226 			if (n == 32) { error("identifier too large"); }
    227 			nc = scan();
    228 		} else {
    229 			break;
    230 		}
    231 	}
    232 	return scan_keyword(n);
    233 }
    234 
    235 fn _next() Token {
    236 	var nc u8 = ctx.cc;
    237 	while true {
    238 		var cc u8 = nc;
    239 		nc = scan();
    240 		var tok Token = lextab[cc];
    241 		if tok == tNUM { // 0..9
    242 			return scan_number(cc, nc);
    243 		} else if tok == tIDN { // _ A..Z a..z
    244 			return scan_ident(cc, nc);
    245 		} else if tok == tDQT { // "
    246 			return scan_string(cc, nc);
    247 		} else if tok == tSQT { // '
    248 			ctx.num = nc;
    249 			if nc == '\\' {
    250 				ctx.num = unescape(scan());
    251 			}
    252 			nc = scan();
    253 			if nc != '\'' {
    254 				error("unterminated character constant");
    255 			}
    256 			nc = scan();
    257 			return tNUM;
    258 		} else if tok == tPLUS {
    259 			if nc == '+' { tok = tINC; nc = scan(); }
    260 		} else if tok == tMINUS {
    261 			if nc == '-' { tok = tDEC; nc = scan(); }
    262 		} else if tok == tAMP {
    263 			if nc == '&' { tok = tAND; nc = scan(); }
    264 		} else if tok == tPIPE {
    265 			if nc == '|' { tok = tOR; nc = scan(); }
    266 		} else if tok == tGT {
    267 			if nc == '=' { tok = tGE; nc = scan(); }
    268 			else if nc == '>' { tok = tRIGHT; nc = scan(); }
    269 		} else if tok == tLT {
    270 			if nc == '=' { tok = tLE; nc = scan(); }
    271 			else if nc == '<' { tok = tLEFT; nc = scan(); }
    272 		} else if tok == tASSIGN {
    273 			if nc == '=' { tok = tEQ; nc = scan(); }
    274 		} else if tok == tBANG {
    275 			if nc == '=' { tok = tNE; nc = scan(); }
    276 		} else if tok == tSLASH {
    277 			if nc == '/' {
    278 				// comment -- consume until EOL or EOF
    279 				while (nc != '\n') && (nc != 0) {
    280 					nc = scan();
    281 				}
    282 				continue;
    283 			}
    284 		} else if tok == tEOL {
    285 			ctx.linenumber++;
    286 			ctx.lineoffset = ctx.byteoffset;
    287 			//if ctx.flags & cfVisibleEOL {
    288 			//	return tEOL;
    289 			//}
    290 			continue;
    291 		} else if tok == tSPC {
    292 			continue;
    293 		} else if (tok == tMSC) || (tok == tINV) {
    294 			error("unknown character ", @u32 cc);
    295 		}
    296 
    297 		// if we're an AddOp or MulOp, followed by an '='
    298 		if ((tok & 0xF0) == 0x10) && (nc == '=') {
    299 			nc = scan();
    300 			// transform us to a XEQ operation
    301 			tok = tok + 0x10;
    302 		}
    303 
    304 		return tok;
    305 	}
    306 }
    307 
    308 fn printstr(fd i32, s str) {
    309 	var n u32 = 0;
    310 	writec(fd, '"');
    311 	while true {
    312 		var ch u32 = s[n];
    313 		if ch == 0 {
    314 			break;
    315 		} else if (ch < ' ') || (ch > '~') {
    316 			writex(fd, ch);
    317 		} else if (ch == '"') || (ch == '\\') {
    318 			writec(fd, '\\');
    319 			writec(fd, ch);
    320 		} else {
    321 			writec(fd, ch);
    322 		}
    323 		n++;
    324 	}
    325 	writec(fd, '"');
    326 }
    327 
    328 fn token_print(fd i32) {
    329 	if ctx.tok == tNUM {
    330 		writec(fd, '#');
    331 		writex(fd, ctx.num);
    332 	} else if ctx.tok == tIDN {
    333 		writec(fd, '@');
    334 		writes(fd, ctx.tmp);
    335 	} else if ctx.tok == tEOL {
    336 		writec(fd, '\n');
    337 	} else if ctx.tok == tSTR {
    338 		printstr(fd, ctx.tmp);
    339 	} else {
    340 		writes(fd, tnames[ctx.tok]);
    341 	}
    342 	writec(fd, ' ');
    343 }
    344 
    345 fn next() Token {
    346 	ctx.tok = _next();
    347 	return ctx.tok;
    348 }
    349