compiler: first chunk of the lexer ported - spl - Unnamed repository; edit this file 'description' to name the repository.

commit eb7745b26c8c173293aab76056acdc329a842cc8
parent c91b7a75c525c32f9c44e03313a39e40d42ee6a8
Author: Brian Swetland <swetland@frotz.net>
Date:   Sat, 14 Oct 2023 13:36:54 -0700

compiler: first chunk of the lexer ported

Diffstat:
A compiler.spl  | 384 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

1 file changed, 384 insertions(+), 0 deletions(-)
diff --git a/compiler.spl b/compiler.spl
@@ -0,0 +1,384 @@
+// Copyright 2023, Brian Swetland <swetland@frotz.net>
+// Licensed under the Apache License, Version 2.0.
+
+// Writes the "error: " prefix to fd 2 (presumably stderr) and returns
+// that same descriptor number, 2.
+fn error_begin() i32 {
+	var fd i32 = 2;
+	writes(fd, "error: ");
+	return fd;
+}
+
+// Writes a single newline to fd 2 (presumably stderr); by its name this is
+// the counterpart that finishes a message begun with error_begin().
+fn error_end() {
+	var fd i32 = 2;
+	writes(fd, "\n");
+}
+
+struct String {	// no fields declared yet; Context.ident is a pointer to one of these
+};
+
+struct Context {	// all scanner/lexer state; the single global instance is ctx
+	linenumber u32,		// line number of most recent line (incremented by _next() on each tEOL)
+	lineoffset u32,		// position of start of most recent line (copied from byteoffset on each tEOL)
+	byteoffset u32,		// position of the most recent character (NOTE(review): never updated in the code visible here)
+	flags u32,		// NOTE(review): only referenced from commented-out code in _next()
+	cc u32,			// scanner: next character (set by scan(); 0 once input is exhausted)
+
+	tok u32,		// most recent token (set by next())
+	num u32,		// for tNUM: value of a numeric or character constant
+	tmp [256]u8,		// for tIDN, tSTR: NUL-terminated text of the token
+	ident *String,		// for tSTR (NOTE(review): not assigned anywhere in the code visible here)
+};
+
+var ctx Context;	// global lexer state; assigned from new(Context) in start()
+
+// ================================================================
+// lexical scanner
+
+// token classes (tok & tcMASK): each class is a group of eight consecutive token values, named by the value of its first member
+enum {
+	tcRELOP = 0x08, tcADDOP = 0x10, tcMULOP = 0x18,	// groups starting at tEQ, tPLUS, tSTAR
+	tcAEQOP = 0x20, tcMEQOP = 0x28, tcMASK = 0xF8,	// groups starting at tADDEQ, tMULEQ; the mask clears the low 3 bits
+};
+
+enum {	// token values; operators are laid out in rows of eight so that (tok & tcMASK) yields the token class
+	// EndMarks, Braces, Brackets Parens (0x00..0x07)
+	tEOF, tEOL, tOBRACE, tCBRACE, tOBRACK, tCBRACK, tOPAREN, tCPAREN,
+	// RelOps (do not reorder) (0x08..0x0F; tx0E and tx0F are unused fillers)
+	tEQ, tNE, tLT, tLE, tGT, tGE, tx0E, tx0F,
+	// AddOps (do not reorder) (0x10..0x17; tx14..tx17 are unused fillers)
+	tPLUS, tMINUS, tPIPE, tCARET, tx14, tx15, tx16, tx17,
+	// MulOps (do not reorder) (0x18..0x1F; tx1E and tx1F are unused fillers)
+	tSTAR, tSLASH, tPERCENT, tAMP, tLEFT, tRIGHT, tx1E, tx1F,
+	// AsnOps (do not reorder): each one is the matching AddOp/MulOp value + 0x10, which _next() relies on
+	tADDEQ, tSUBEQ, tOREQ, tXOREQ, tx24, tx25, tx26, tx27,
+	tMULEQ, tDIVEQ, tMODEQ, tANDEQ, tLSEQ, tRSEQ, t2E, t2F,
+	// Various, UnaryNot, LogicalOps (0x30 onward; no longer grouped in eights)
+	tSEMI, tCOLON, tDOT, tCOMMA, tNOT, tAND, tOR, tBANG,
+	tASSIGN, tINC, tDEC,
+	// Keywords
+	tNEW, tFN, tSTRUCT, tVAR, tENUM,
+	tIF, tELSE, tWHILE,
+	tBREAK, tCONTINUE, tRETURN,
+	tFOR, tSWITCH, tCASE,
+	tTRUE, tFALSE, tNIL,
+	tIDN, tNUM, tSTR,
+	// used internally by the lexer (as lextab character classes); not meant to be returned as tokens
+	tSPC, tINV, tDQT, tSQT, tMSC,
+};
+
+var tnames []str = {	// printable text for each token, indexed by token value; must stay in the same order as the token enum
+	"<EOF>", "<EOL>", "{",  "}",  "[",   "]",   "(",   ")",
+	"==",    "!=",    "<",  "<=", ">",   ">=",  "",    "",	// "" entries stand in for the unused filler tokens
+	"+",     "-",     "|",  "^",  "",    "",    "",    "",
+	"*",     "/",     "%",  "&",  "<<",  ">>",  "",    "",
+	"+=",    "-=",    "|=", "^=", "",    "",    "",    "",
+	"*=",    "/=",    "%=", "&=", "<<=", ">>=", "",    "",
+	";",     ":",     ".",  ",",  "~",   "&&",  "||",  "!",
+	"=",     "++",    "--",
+	"new", "fn", "struct", "var", "enum",
+	"if", "else", "while",
+	"break", "continue", "return",
+	"for", "switch", "case",
+	"true", "false", "nil",
+	"<ID>", "<NUM>", "<STR>",
+	"<SPC>", "<INV>", "<DQT>", "<SQT>", "<MSC>",	// lexer-internal classes
+};
+
+var lextab [256]u8 = {	// maps each input byte to a token or lexer-internal class; _next() dispatches on this value
+	tEOF, tINV, tINV, tINV, tINV, tINV, tINV, tINV,	// 0x00..0x07: byte 0 marks end of input
+	tINV, tSPC, tEOL, tSPC, tINV, tSPC, tINV, tINV,	// 0x08..0x0F: tab, VT and CR are spaces; LF is end of line
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,	// 0x10..0x17
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,	// 0x18..0x1F
+	tSPC, tBANG, tDQT, tMSC, tMSC, tPERCENT, tAMP, tSQT,	// 0x20..0x27: space ! " # $ % & '
+	tOPAREN, tCPAREN, tSTAR, tPLUS, tCOMMA, tMINUS, tDOT, tSLASH,	// 0x28..0x2F: ( ) * + , - . /
+	tNUM, tNUM, tNUM, tNUM, tNUM, tNUM, tNUM, tNUM,	// 0x30..0x37: 0..7
+	tNUM, tNUM, tCOLON, tSEMI, tLT, tASSIGN, tGT, tMSC,	// 0x38..0x3F: 8 9 : ; < = > ?
+	tMSC, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,	// 0x40..0x47: @ A..G
+	tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,	// 0x48..0x4F: H..O
+	tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,	// 0x50..0x57: P..W
+	tIDN, tIDN, tIDN, tOBRACK, tMSC, tCBRACK, tCARET, tIDN,	// 0x58..0x5F: X Y Z [ backslash ] ^ _
+	tMSC, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,	// 0x60..0x67: ` a..g
+	tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,	// 0x68..0x6F: h..o
+	tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,	// 0x70..0x77: p..w
+	tIDN, tIDN, tIDN, tOBRACE, tPIPE, tCBRACE, tNOT, tINV,	// 0x78..0x7F: x y z { | } ~ DEL
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,	// 0x80..0xFF: every byte with the high bit set is invalid
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
+	tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
+};
+
+// Converts one hexadecimal digit character to its numeric value (0..15).
+// Returns -1 when ch is not one of 0-9, a-f or A-F.
+fn unhex(ch u32) i32 {
+	if ((ch >= 'a') && (ch <= 'f')) {
+		return 10 + (ch - 'a');
+	} else if ((ch >= 'A') && (ch <= 'F')) {
+		return 10 + (ch - 'A');
+	} else if ((ch >= '0') && (ch <= '9')) {
+		return ch - '0';
+	}
+	return -1;
+}
+
+// Reads the next input character from fd 0 with readc(), stores it in
+// ctx.cc and returns it. A negative readc() result is stored as 0, the
+// byte that lextab maps to tEOF.
+fn scan() u32 {
+	var ch i32 = readc(0);
+	ctx.cc = 0;
+	if (ch >= 0) {
+		ctx.cc = ch;
+	}
+	return ctx.cc;
+}
+
+// Translates the character that follows a backslash into the byte it denotes.
+//   n: the character immediately after the backslash
+// Handles \n \r \t \" \' \\ and \xHH; for \xHH the two hex digits are
+// pulled from the input with scan(). Unknown escapes and malformed \xHH
+// sequences are reported through error() and yield 0.
+fn unescape(n u32) u32 {
+	if (n == 'n') {
+		return 10;
+	} else if (n == 'r') {
+		return 13;
+	} else if (n == 't') {
+		return 9;
+	} else if (n == '"') {
+		return '"';
+	} else if (n == '\'') {
+		return '\'';
+	} else if (n == '\\') {
+		return '\\';
+	} else if (n == 'x') {
+		// unhex() signals a non-hex character with -1, so the digits must be
+		// held in signed variables; as u32 the "< 0" test could never be true
+		var x0 i32 = unhex(scan());
+		var x1 i32 = unhex(scan());
+		if ((x0 < 0) || (x1 < 0)) {
+			error("invalid hex escape");
+			return 0;
+		}
+		return (x0 << 4) | x1;
+	} else {
+		error("invalid escape ", n);
+		return 0;
+	}
+}
+
+// Scans the remainder of a double-quoted string literal into ctx.tmp.
+//   cc: the opening quote (not used here)
+//   nc: the first character after the opening quote
+// Backslash escapes are decoded with unescape(). The closing quote is
+// consumed and the text in ctx.tmp is NUL-terminated. Reports an error on
+// end of input and when the length reaches 255. Always returns tSTR.
+fn scan_string(cc u32, nc u32) u32 {
+	var len u32 = 0;
+	while (true) {
+		if (nc == '"') {
+			// step past the closing quote
+			scan();
+			break;
+		}
+		if (nc == 0) {
+			error("unterminated string");
+		} else if (nc == '\\') {
+			ctx.tmp[len] = unescape(scan());
+		} else {
+			ctx.tmp[len] = nc;
+		}
+		len++;
+		nc = scan();
+		if (len == 255) {
+			error("constant string too large");
+		}
+	}
+	ctx.tmp[len] = 0;
+	return tSTR;
+}
+
+// NUL-terminates the identifier text held in ctx.tmp at index len.
+// Keyword recognition is not implemented yet, so every identifier is
+// currently reported as tIDN.
+fn scan_keyword(len u32) u32 {
+	// TODO
+	ctx.tmp[len] = 0;
+	return tIDN;
+}
+
+// Scans the rest of a numeric literal and stores its value in ctx.num.
+//   cc: the first digit, already consumed
+//   nc: the character following cc
+// Accepts 0b... (binary, up to 32 digits), 0x... (hex, up to 8 digits) and
+// plain decimal. Values that do not fit in 32 bits are reported through
+// error(). Always returns tNUM.
+fn scan_number(cc u32, nc u32) u32 {
+	// n counts characters consumed so far (the leading digit is the first)
+	var n u32 = 1;
+	var val u32 = cc - '0';
+
+	if ((cc == '0') && (nc == 'b')) { // binary
+		nc = scan();
+		while ((nc == '0') || (nc == '1')) {
+			val = (val << 1) | (nc - '0');
+			nc = scan();
+			n++;
+			if (n == 34) {
+				error("binary constant too large");
+			}
+		}
+	} else if ((cc == '0') && (nc == 'x')) { // hex
+		nc = scan();
+		while (true) {
+			var tmp i32 = unhex(nc);
+			if (tmp == -1) {
+				break;
+			}
+			val = (val << 4) | tmp;
+			nc = scan();
+			n++;
+			if (n == 10) {
+				error("hex constant too large");
+			}
+		}
+	} else { // decimal
+		while (lextab[nc] == tNUM) {
+			var digit u32 = nc - '0';
+			// Check against the u32 maximum before multiplying: comparing the
+			// wrapped result with the old value misses most overflows and
+			// wrongly rejects a repeated leading zero.
+			// 4294967295 == (429496729 * 10) + 5
+			if ((val > 429496729) || ((val == 429496729) && (digit > 5))) {
+				error("decimal constant too large");
+			}
+			val = (val * 10) + digit;
+			nc = scan();
+		}
+	}
+	ctx.num = val;
+	return tNUM;
+}
+
+// Collects an identifier into ctx.tmp and hands it to scan_keyword().
+//   cc: the first character of the identifier, already consumed
+//   nc: the character following cc
+// Characters are appended while lextab classifies them as tIDN or tNUM.
+// Reports an error when the length reaches 32. Returns whatever
+// scan_keyword() returns for the collected text.
+fn scan_ident(cc u32, nc u32) u32 {
+	var len u32 = 1;
+	ctx.tmp[0] = cc;
+
+	while ((lextab[nc] == tIDN) || (lextab[nc] == tNUM)) {
+		ctx.tmp[len] = nc;
+		len++;
+		if (len == 32) {
+			error("identifier too large");
+		}
+		nc = scan();
+	}
+	return scan_keyword(len);
+}
+
+// Scans and returns the next token from the input.
+// On entry ctx.cc holds the first unconsumed character; on return it holds
+// the character following the token. Whitespace, newlines and // comments
+// are skipped. Side results: ctx.num for tNUM (including character
+// constants) and ctx.tmp for tIDN / tSTR, filled in by the scan_* helpers.
+fn _next() u32 {
+	var nc u8 = ctx.cc;
+	while (true) {
+		// cc is the character being classified, nc is one character of lookahead
+		var cc u8 = nc;
+		nc = scan();
+		var tok u32 = lextab[cc];
+		if (tok == tNUM) { // 0..9
+			return scan_number(cc, nc);
+		} else if (tok == tIDN) { // _ A..Z a..z
+			return scan_ident(cc, nc);
+		} else if (tok == tDQT) { // "
+			return scan_string(cc, nc);
+		} else if (tok == tSQT) { // '
+			// character constant: reported as tNUM with the character's value
+			ctx.num = nc;
+			if (nc == '\\') {
+				ctx.num = unescape(scan());
+			}
+			nc = scan();
+			if (nc != '\'') {
+				error("unterminated character constant");
+			}
+			nc = scan();
+			return tNUM;
+		} else if (tok == tPLUS) {
+			if (nc == '+') { tok = tINC; nc = scan(); }
+		} else if (tok == tMINUS) {
+			if (nc == '-') { tok = tDEC; nc = scan(); }
+		} else if (tok == tAMP) {
+			if (nc == '&') { tok = tAND; nc = scan(); }
+		} else if (tok == tPIPE) {
+			if (nc == '|') { tok = tOR; nc = scan(); }
+		} else if (tok == tGT) {
+			if (nc == '=') { tok = tGE; nc = scan(); }
+			else if (nc == '>') { tok = tRIGHT; nc = scan(); }
+		} else if (tok == tLT) {
+			if (nc == '=') { tok = tLE; nc = scan(); }
+			else if (nc == '<') { tok = tLEFT; nc = scan(); }
+		} else if (tok == tASSIGN) {
+			if (nc == '=') { tok = tEQ; nc = scan(); }
+		} else if (tok == tBANG) {
+			if (nc == '=') { tok = tNE; nc = scan(); }
+		} else if (tok == tSLASH) {
+			if (nc == '/') {
+				// comment -- consume until EOL or EOF
+				while ((nc != '\n') && (nc != 0)) {
+					nc = scan();
+				}
+				continue;
+			}
+		} else if (tok == tEOL) {
+			ctx.linenumber++;
+			ctx.lineoffset = ctx.byteoffset;
+			//if (ctx.flags & cfVisibleEOL) {
+			//	return tEOL;
+			//}
+			continue;
+		} else if (tok == tSPC) {
+			continue;
+		} else if ((tok == tMSC) || (tok == tINV)) {
+			error("unknown character 0x%02x", cc);
+		}
+
+		// if we're an AddOp or MulOp, followed by an '='
+		// (AddOps are 0x10..0x17 and MulOps are 0x18..0x1F, so both have a
+		// high nibble of 1; 0x20 is the range of the AsnOps themselves)
+		if (((tok & 0xF0) == 0x10) && (nc == '=')) {
+			nc = scan();
+			// transform us to a XEQ operation
+			tok = tok + 0x10;
+		}
+
+		return tok;
+	}
+}
+
+
+// Writes the text in ctx.tmp to fd, wrapped in double quotes.
+// Output stops at the first 0 byte or after 256 bytes. Bytes outside the
+// range ' '..'~' are passed to writex() (presumably a hex printer); '"' and
+// '\' are preceded by a backslash; everything else is written as-is.
+fn token_printstr(fd i32) {
+	var i u32 = 0;
+	writec(fd, '"');
+	while (i < 256) {
+		var ch u32 = ctx.tmp[i];
+		if (ch == 0) {
+			break;
+		}
+		if ((ch < ' ') || (ch > '~')) {
+			writex(fd, ch);
+		} else {
+			if ((ch == '"') || (ch == '\\')) {
+				writec(fd, '\\');
+			}
+			writec(fd, ch);
+		}
+		i++;
+	}
+	writec(fd, '"');
+}
+
+// Writes a readable form of the most recent token (ctx.tok) to fd,
+// followed by a space:
+//   tNUM  -> '#' then ctx.num via writex()
+//   tIDN  -> '@' then the text in ctx.tmp
+//   tEOL  -> a newline
+//   tSTR  -> the quoted text, via token_printstr()
+//   other -> the token's entry in tnames
+fn token_print(fd i32) {
+	var tok u32 = ctx.tok;
+	if (tok == tSTR) {
+		token_printstr(fd);
+	} else if (tok == tIDN) {
+		writec(fd, '@');
+		writes(fd, ctx.tmp);
+	} else if (tok == tNUM) {
+		writec(fd, '#');
+		writex(fd, ctx.num);
+	} else if (tok == tEOL) {
+		writec(fd, '\n');
+	} else {
+		writes(fd, tnames[tok]);
+	}
+	writec(fd, ' ');
+}
+
+// Scans the next token with _next(), records it in ctx.tok and returns it.
+fn next() u32 {
+	var tok u32 = _next();
+	ctx.tok = tok;
+	return tok;
+}
+
+// Entry point: tokenizes the input and writes each token to fd 1 with
+// token_print() until tEOF is reached, then writes a final newline.
+// Always returns 0.
+fn start() i32 {
+	ctx = new(Context);
+	// prime ctx.cc with the first input character
+	scan();
+
+	while (true) {
+		if (next() == tEOF) {
+			break;
+		}
+		token_print(1);
+	}
+	writec(1, '\n');
+	return 0;
+}
+

	spl Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| README \| LICENSE