spl

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit eb7745b26c8c173293aab76056acdc329a842cc8
parent c91b7a75c525c32f9c44e03313a39e40d42ee6a8
Author: Brian Swetland <swetland@frotz.net>
Date:   Sat, 14 Oct 2023 13:36:54 -0700

compiler: first chunk of the lexer ported

Diffstat:
A compiler.spl | 384 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 384 insertions(+), 0 deletions(-)

// Copyright 2023, Brian Swetland <swetland@frotz.net>
// Licensed under the Apache License, Version 2.0.

// Write the "error: " prefix to descriptor 2 and return that descriptor
// so the caller can keep writing the message body to the same place.
fn error_begin() i32 {
    writes(2, "error: ");
    return 2;
}

// Finish an error message on descriptor 2 by writing a newline.
fn error_end() {
    writes(2, "\n");
}

struct String {
};

// All lexer state lives in a single global instance of this struct (ctx).
struct Context {
    linenumber u32,  // line number of most recent line
    lineoffset u32,  // position of start of most recent line
    byteoffset u32,  // position of the most recent character
    flags u32,
    cc u32,          // scanner: next character

    tok u32,         // most recent token
    num u32,         // for tNUM
    tmp [256]u8,     // for tIDN, tSTR
    ident *String,   // for tSTR
};

var ctx Context;

// ================================================================
// lexical scanner

// token classes (tok & tcMASK)
enum {
    tcRELOP = 0x08, tcADDOP = 0x10, tcMULOP = 0x18,
    tcAEQOP = 0x20, tcMEQOP = 0x28, tcMASK = 0xF8,
};

// Token values.  The operator groups are laid out in rows of eight so
// that (tok & tcMASK) yields the token class, and so that each
// compound-assignment token is exactly 0x10 above its plain operator
// (tPLUS 0x10 -> tADDEQ 0x20, tSTAR 0x18 -> tMULEQ 0x28, ...).
// The tx.. / t2E / t2F names are unused padding that keeps the rows aligned.
enum {
    // EndMarks, Braces, Brackets Parens
    tEOF, tEOL, tOBRACE, tCBRACE, tOBRACK, tCBRACK, tOPAREN, tCPAREN,
    // RelOps (do not reorder)
    tEQ, tNE, tLT, tLE, tGT, tGE, tx0E, tx0F,
    // AddOps (do not reorder)
    tPLUS, tMINUS, tPIPE, tCARET, tx14, tx15, tx16, tx17,
    // MulOps (do not reorder)
    tSTAR, tSLASH, tPERCENT, tAMP, tLEFT, tRIGHT, tx1E, tx1F,
    // AsnOps (do not reorder)
    tADDEQ, tSUBEQ, tOREQ, tXOREQ, tx24, tx25, tx26, tx27,
    tMULEQ, tDIVEQ, tMODEQ, tANDEQ, tLSEQ, tRSEQ, t2E, t2F,
    // Various, UnaryNot, LogicalOps,
    tSEMI, tCOLON, tDOT, tCOMMA, tNOT, tAND, tOR, tBANG,
    tASSIGN, tINC, tDEC,
    // Keywords
    tNEW, tFN, tSTRUCT, tVAR, tENUM,
    tIF, tELSE, tWHILE,
    tBREAK, tCONTINUE, tRETURN,
    tFOR, tSWITCH, tCASE,
    tTRUE, tFALSE, tNIL,
    tIDN, tNUM, tSTR,
    // used internal to the lexer but never returned
    tSPC, tINV, tDQT, tSQT, tMSC,
};

// Printable text for each token, indexed by token value.  The entries
// must stay in the same order as the token enum above; padding tokens
// map to the empty string.
var tnames []str = {
    "<EOF>", "<EOL>", "{", "}", "[", "]", "(", ")",
    "==", "!=", "<", "<=", ">", ">=", "", "",
    "+", "-", "|", "^", "", "", "", "",
    "*", "/", "%", "&", "<<", ">>", "", "",
    "+=", "-=", "|=", "^=", "", "", "", "",
    "*=", "/=", "%=", "&=", "<<=", ">>=", "", "",
    ";", ":", ".", ",", "~", "&&", "||", "!",
    "=", "++", "--",
    "new", "fn", "struct", "var", "enum",
    "if", "else", "while",
    "break", "continue", "return",
    "for", "switch", "case",
    "true", "false", "nil",
    "<ID>", "<NUM>", "<STR>",
    "<SPC>", "<INV>", "<DQT>", "<SQT>", "<MSC>",
};

// Classification of every possible input byte, indexed by byte value.
// Each entry is either the single-character token the byte starts, or
// one of the lexer-internal classes (tSPC, tINV, tDQT, tSQT, tMSC).
// Digits map to tNUM; letters and '_' map to tIDN.
var lextab [256]u8 = {
    // 0x00 - 0x1F
    tEOF, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    tINV, tSPC, tEOL, tSPC, tINV, tSPC, tINV, tINV,
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    // 0x20 - 0x3F
    tSPC, tBANG, tDQT, tMSC, tMSC, tPERCENT, tAMP, tSQT,
    tOPAREN, tCPAREN, tSTAR, tPLUS, tCOMMA, tMINUS, tDOT, tSLASH,
    tNUM, tNUM, tNUM, tNUM, tNUM, tNUM, tNUM, tNUM,
    tNUM, tNUM, tCOLON, tSEMI, tLT, tASSIGN, tGT, tMSC,
    // 0x40 - 0x5F
    tMSC, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
    tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
    tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
    tIDN, tIDN, tIDN, tOBRACK, tMSC, tCBRACK, tCARET, tIDN,
    // 0x60 - 0x7F
    tMSC, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
    tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
    tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN, tIDN,
    tIDN, tIDN, tIDN, tOBRACE, tPIPE, tCBRACE, tNOT, tINV,
    // 0x80 - 0xFF
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
    tINV, tINV, tINV, tINV, tINV, tINV, tINV, tINV,
};

// Convert one hexadecimal digit character to its value (0..15).
// Returns -1 when ch is not a hex digit.
fn unhex(ch u32) i32 {
    if ((ch >= '0') && (ch <= '9')) {
        return ch - '0';
    }
    if ((ch >= 'a') && (ch <= 'f')) {
        return ch - 'a' + 10;
    }
    if ((ch >= 'A') && (ch <= 'F')) {
        return ch - 'A' + 10;
    }
    return -1;
}

// Fetch the next input character with readc(0), store it in ctx.cc and
// return it.  A negative result from readc() is stored as 0, which
// lextab classifies as tEOF.
// NOTE(review): ctx.byteoffset is never advanced here, so the
// lineoffset recorded in _next() stays at its initial value -- confirm
// whether position tracking is meant to be added in a later chunk.
fn scan() u32 {
    var ch i32 = readc(0);
    if (ch < 0) {
        ctx.cc = 0;
    } else {
        ctx.cc = ch;
    }
    return ctx.cc;
}

// Translate the character following a backslash into the character it
// denotes.  n is the character after the backslash; for 'x' two more
// characters are consumed with scan() and read as hex digits.
// Reports an error for an unrecognised escape or bad hex digits.
fn unescape(n u32) u32 {
    if (n == 'n') {
        return 10;
    } else if (n == 'r') {
        return 13;
    } else if (n == 't') {
        return 9;
    } else if (n == '"') {
        return '"';
    } else if (n == '\'') {
        return '\'';
    } else if (n == '\\') {
        return '\\';
    } else if (n == 'x') {
        // Signed on purpose: unhex() signals failure with -1, and an
        // unsigned variable could never compare less than zero.
        var x0 i32 = unhex(scan());
        var x1 i32 = unhex(scan());
        if ((x0 < 0) || (x1 < 0)) {
            error("invalid hex escape");
        }
        return (x0 << 4) | x1;
    } else {
        error("invalid escape ", n);
        return 0;
    }
}

// Scan the body of a double-quoted string into ctx.tmp as a
// 0-terminated byte sequence and return tSTR.  cc is the opening quote
// (unused); nc is the first character after it.  On return the
// character after the closing quote has been read into ctx.cc.
fn scan_string(cc u32, nc u32) u32 {
    var n u32 = 0;
    while (true) {
        if (nc == '"') {
            // closing quote: step past it and stop
            nc = scan();
            break;
        } else if (nc == 0) {
            error("unterminated string");
        } else if (nc == '\\') {
            ctx.tmp[n] = unescape(scan());
        } else {
            ctx.tmp[n] = nc;
        }
        nc = scan();
        n++;
        // keep room for the terminator in the 256-byte buffer
        if (n == 255) {
            error("constant string too large");
        }
    }
    ctx.tmp[n] = 0;
    return tSTR;
}

// Terminate the identifier of length len held in ctx.tmp and classify
// it.  Keyword recognition is not implemented yet, so every identifier
// is returned as tIDN.
fn scan_keyword(len u32) u32 {
    ctx.tmp[len] = 0;
    // TODO
    return tIDN;
}

// Scan a numeric constant, store its value in ctx.num and return tNUM.
// cc is the first digit and nc the character after it.  A leading "0b"
// selects binary and "0x" selects hex; anything else is decimal.
// Reports an error when the constant has too many digits (binary, hex)
// or does not fit in 32 bits (decimal).
fn scan_number(cc u32, nc u32) u32 {
    var n u32 = 1;
    var val u32 = cc - '0';

    if ((cc == '0') && (nc == 'b')) { // binary
        nc = scan();
        while ((nc == '0') || (nc == '1')) {
            val = (val << 1) | (nc - '0');
            nc = scan();
            n++;
            if (n == 34) {
                error("binary constant too large");
            }
        }
    } else if ((cc == '0') && (nc == 'x')) { // hex
        nc = scan();
        while (true) {
            var tmp i32 = unhex(nc);
            if (tmp == -1) {
                break;
            }
            val = (val << 4) | tmp;
            nc = scan();
            n++;
            if (n == 10) {
                error("hex constant too large");
            }
        }
    } else { // decimal
        while (lextab[nc] == tNUM) {
            var digit u32 = nc - '0';
            // Check the range before multiplying: the largest 32-bit
            // value is 4294967295, so val may be at most 429496729,
            // and at that value the next digit may be at most 5.
            // Comparing the wrapped result against val instead would
            // miss some overflows and reject valid input such as "00".
            if ((val > 429496729) || ((val == 429496729) && (digit > 5))) {
                error("decimal constant too large");
            }
            val = (val * 10) + digit;
            nc = scan();
            n++;
        }
    }
    ctx.num = val;
    return tNUM;
}

// Scan an identifier into ctx.tmp and classify it with scan_keyword().
// cc is the first character and nc the one after it; subsequent
// characters are accepted while lextab classifies them as tIDN or tNUM.
// Reports an error once the identifier reaches 32 characters.
fn scan_ident(cc u32, nc u32) u32 {
    ctx.tmp[0] = cc;
    var n u32 = 1;

    while (true) {
        var tok u32 = lextab[nc];
        if ((tok == tIDN) || (tok == tNUM)) {
            ctx.tmp[n] = nc;
            n++;
            if (n == 32) { error("identifier too large"); }
            nc = scan();
        } else {
            break;
        }
    }
    return scan_keyword(n);
}

// Produce the next token from the input.  Whitespace, newlines and
// "//" comments are skipped.  cc is the character being classified and
// nc is the one-character lookahead (mirrored in ctx.cc by scan()).
// Numbers and character constants leave their value in ctx.num;
// identifiers and strings leave their text in ctx.tmp.
fn _next() u32 {
    var nc u8 = ctx.cc;
    while (true) {
        var cc u8 = nc;
        nc = scan();
        var tok u32 = lextab[cc];
        if (tok == tNUM) { // 0..9
            return scan_number(cc, nc);
        } else if (tok == tIDN) { // _ A..Z a..z
            return scan_ident(cc, nc);
        } else if (tok == tDQT) { // "
            return scan_string(cc, nc);
        } else if (tok == tSQT) { // '
            // character constant: returned as a tNUM holding its value
            ctx.num = nc;
            if (nc == '\\') {
                ctx.num = unescape(scan());
            }
            nc = scan();
            if (nc != '\'') {
                error("unterminated character constant");
            }
            nc = scan();
            return tNUM;
        } else if (tok == tPLUS) {
            if (nc == '+') { tok = tINC; nc = scan(); }
        } else if (tok == tMINUS) {
            if (nc == '-') { tok = tDEC; nc = scan(); }
        } else if (tok == tAMP) {
            if (nc == '&') { tok = tAND; nc = scan(); }
        } else if (tok == tPIPE) {
            if (nc == '|') { tok = tOR; nc = scan(); }
        } else if (tok == tGT) {
            if (nc == '=') { tok = tGE; nc = scan(); }
            else if (nc == '>') { tok = tRIGHT; nc = scan(); }
        } else if (tok == tLT) {
            if (nc == '=') { tok = tLE; nc = scan(); }
            else if (nc == '<') { tok = tLEFT; nc = scan(); }
        } else if (tok == tASSIGN) {
            if (nc == '=') { tok = tEQ; nc = scan(); }
        } else if (tok == tBANG) {
            if (nc == '=') { tok = tNE; nc = scan(); }
        } else if (tok == tSLASH) {
            if (nc == '/') {
                // comment -- consume until EOL or EOF
                while ((nc != '\n') && (nc != 0)) {
                    nc = scan();
                }
                continue;
            }
        } else if (tok == tEOL) {
            ctx.linenumber++;
            ctx.lineoffset = ctx.byteoffset;
            //if (ctx.flags & cfVisibleEOL) {
            //    return tEOL;
            //}
            continue;
        } else if (tok == tSPC) {
            continue;
        } else if ((tok == tMSC) || (tok == tINV)) {
            error("unknown character 0x%02x", cc);
        }

        // if we're an AddOp or MulOp, followed by an '='
        // (AddOps and MulOps are the tokens 0x10..0x1F)
        if (((tok & 0xF0) == 0x10) && (nc == '=')) {
            nc = scan();
            // transform us to a XEQ operation, which sits 0x10 above
            tok = tok + 0x10;
        }

        return tok;
    }
}


// Write the string held in ctx.tmp to fd, surrounded by double quotes.
// Bytes outside ' '..'~' are written with writex(); '"' and '\\' are
// preceded by a backslash.
fn token_printstr(fd i32) {
    var n u32 = 0;
    writec(fd, '"');
    while (n < 256) {
        var ch u32 = ctx.tmp[n];
        if (ch == 0) {
            break;
        } else if ((ch < ' ') || (ch > '~')) {
            writex(fd, ch);
        } else if ((ch == '"') || (ch == '\\')) {
            writec(fd, '\\');
            writec(fd, ch);
        } else {
            writec(fd, ch);
        }
        n++;
    }
    writec(fd, '"');
}

// Write a readable form of the current token (ctx.tok) to fd, followed
// by a space: '#' plus the value for numbers, '@' plus the name for
// identifiers, a newline for tEOL, the quoted text for strings, and the
// tnames entry for everything else.
fn token_print(fd i32) {
    if (ctx.tok == tNUM) {
        writec(fd, '#');
        writex(fd, ctx.num);
    } else if (ctx.tok == tIDN) {
        writec(fd, '@');
        writes(fd, ctx.tmp);
    } else if (ctx.tok == tEOL) {
        writec(fd, '\n');
    } else if (ctx.tok == tSTR) {
        token_printstr(fd);
    } else {
        writes(fd, tnames[ctx.tok]);
    }
    writec(fd, ' ');
}

// Advance to the next token, record it in ctx.tok and return it.
fn next() u32 {
    ctx.tok = _next();
    return ctx.tok;
}

// Entry point: prime the one-character lookahead, then print every
// token read from the input to descriptor 1 until tEOF is reached.
fn start() i32 {
    ctx = new(Context);
    scan();

    while(next() != tEOF) {
        token_print(1);
    }
    writec(1, '\n');
    return 0;
}