2024-12-04 17:56:08 +00:00
|
|
|
use ascii;
|
2024-12-04 02:41:23 +00:00
|
|
|
use bufio;
|
2024-12-04 16:29:11 +00:00
|
|
|
use encoding::utf8;
|
2024-12-04 02:41:23 +00:00
|
|
|
use io;
|
|
|
|
use memio;
|
2024-12-04 17:56:08 +00:00
|
|
|
use unicode;
|
2024-12-05 00:40:14 +00:00
|
|
|
use strconv;
|
2024-12-04 17:56:08 +00:00
|
|
|
|
|
|
|
use fmt;
|
2024-12-04 02:41:23 +00:00
|
|
|
use strings;
|
|
|
|
|
2024-12-05 00:40:14 +00:00
|
|
|
// Named character literals understood by [[scanchar]]: each entry pairs the
// spelled-out name with the rune it stands for. "newline" and "linefeed" are
// two names for the same rune (U+000A).
def longcharnames: [_](str, rune) = [
	("nul", '\0'),
	("alarm", '\a'),
	("backspace", '\b'),
	("tab", '\t'),
	("newline", '\n'),
	("linefeed", '\n'),
	("vtab", '\v'),
	("page", '\f'),
	("return", '\r'),
	("esc", '\x1b'),
	("space", ' '),
	("delete", '\x7f'),
];
|
|
|
|
|
2024-12-04 02:41:23 +00:00
|
|
|
// State of one lexing session. Create it with [[newlexer]] and release it
// with [[close]].
export type lexer = struct {
	// Source the runes are read from.
	in: io::handle,
	// Scratch buffer in which the scan* functions build their result; it is
	// reset at the start of each scan.
	strbuf: memio::stream,
	// Path given to [[newlexer]]; stored as-is.
	path: str,
	// Current position: .0 is bumped on every '\n' (starting at 1), .1 is
	// reset to 0 on '\n' and bumped on every other rune.
	loc: (uint, uint),
	// Value of loc before the most recently consumed rune; restored by
	// [[unget]].
	prevloc: (uint, uint),
	// One-rune pushback slot; void when nothing has been pushed back.
	unread: (rune | void),
};
|
|
|
|
|
|
|
|
// Builds a lexer that reads from in. The path is only stored in the returned
// value. The position starts at (1, 0) with an empty pushback slot and a
// freshly allocated scratch buffer; pass the result to [[close]] when done.
export fn newlexer(in: io::handle, path: str) lexer = {
	const fresh = lexer {
		in = in,
		strbuf = memio::dynamic(),
		path = path,
		loc = (1, 0),
		// Zero value, spelled out.
		prevloc = (0, 0),
		unread = void,
	};
	return fresh;
};
|
|
|
|
|
|
|
|
// Releases the lexer's internal scratch buffer. The input handle is not
// touched here. Aborts if closing the buffer reports an error.
export fn close(lex: *lexer) void = {
	const scratch = &lex.strbuf;
	io::close(scratch)!;
};
|
|
|
|
|
2024-12-04 17:56:08 +00:00
|
|
|
// Reads the next token from the input, skipping leading whitespace.
//
// Returns io::EOF when the input is exhausted before any token starts. The
// first non-space rune selects the token kind:
//
//	(	comment, up to the matching ')'
//	)	invalid (no comment is open)
//	[ ]	quotstart / quotend
//	{ }	mapstart / mapend
//	\ :	symbol (kw is true only for ':'); an empty name is invalid
//	#	pound literal, see scanpound
//	"	string literal, see scanstr
//
// Anything else is pushed back and scanned as a word. String-valued results
// come straight from the scanners, which build them in the lexer's scratch
// buffer.
export fn lex(lex: *lexer) (token | io::EOF | error) = {
	const first = match (nextrunews(lex)?) {
	case let r: rune =>
		yield r;
	case io::EOF =>
		return io::EOF;
	};

	switch (first) {
	case '(' =>
		return comment{ v = scancomment(lex)? };
	case ')' =>
		return lex.loc: invalid;
	case '[' =>
		return quotstart;
	case ']' =>
		return quotend;
	case '{' =>
		return mapstart;
	case '}' =>
		return mapend;
	case '\\', ':' =>
		// Both prefixes introduce a symbol; only ':' marks a keyword.
		const name = scanword(lex)?;
		if (len(name) == 0) {
			return lex.loc: invalid;
		};
		return symbol{ v = name, kw = (first == ':') };
	case '#' =>
		return scanpound(lex)?;
	case '"' =>
		return scanstr(lex)?;
	case =>
		yield;
	};

	// Not a special prefix: the rune is the first character of a word.
	unget(lex, first);
	return word{ v = scanword(lex)? };
};
|
|
|
|
|
2024-12-04 16:29:11 +00:00
|
|
|
// Consumes one rune, taking it from the pushback slot when one is pending and
// from the input otherwise, then advances the position: prevloc receives the
// old loc, a '\n' moves to the next line at column 0, any other rune bumps
// the column.
//
// Returns io::EOF at end of input, propagates I/O errors, and reports
// undecodable UTF-8 as invalid at the current location. None of these three
// outcomes changes the position.
fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
	const rn = match (lex.unread) {
	case let pending: rune =>
		lex.unread = void;
		yield pending;
	case void =>
		yield match (bufio::read_rune(lex.in)) {
		case let fresh: rune =>
			yield fresh;
		case io::EOF =>
			return io::EOF;
		case let err: io::error =>
			return err;
		case utf8::invalid =>
			return lex.loc: invalid;
		};
	};

	lex.prevloc = lex.loc;
	if (rn == '\n') {
		lex.loc = (lex.loc.0 + 1, 0);
	} else {
		lex.loc.1 += 1;
	};
	return rn;
};
|
|
|
|
|
|
|
|
// Like [[nextrune]], but discards runes for which [[isspace]] is true and
// returns the first one that is not. Returns io::EOF if the input ends first.
fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
	for (true) {
		const candidate = match (nextrune(lex)?) {
		case io::EOF =>
			return io::EOF;
		case let r: rune =>
			yield r;
		};
		if (!isspace(candidate)) {
			return candidate;
		};
	};
};
|
|
|
|
|
|
|
|
// Pushes rn back so the next [[nextrune]] call returns it again, and rewinds
// loc to the position saved before rn was consumed. Only one rune may be
// pending at a time; a second unget without a read in between trips the
// assertion.
fn unget(lex: *lexer, rn: rune) void = {
	assert(lex.unread is void);
	lex.loc = lex.prevloc;
	lex.unread = rn;
};
|
|
|
|
|
2024-12-04 16:29:11 +00:00
|
|
|
// Collects runes into the scratch buffer until whitespace, a delimiter (see
// [[isdelimiter]]) or end of input. The terminating rune is pushed back, not
// consumed. The result may be empty. It is the scratch buffer's contents, and
// the next scan resets that buffer.
fn scanword(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		match (nextrune(lex)?) {
		case io::EOF =>
			break;
		case let r: rune =>
			if (isspace(r) || isdelimiter(r)) {
				unget(lex, r);
				break;
			};
			memio::appendrune(&lex.strbuf, r)!;
		};
	};
	return memio::string(&lex.strbuf)!;
};
|
|
|
|
|
2024-12-04 17:56:08 +00:00
|
|
|
// Scans the body of a comment whose opening '(' has already been consumed and
// returns the text up to, but not including, the closing ')'.
//
// A '(' inside the comment is rejected as invalid, so comments do not nest.
// Reaching end of input first yields an unterminated error. The result is the
// scratch buffer's contents, and the next scan resets that buffer.
fn scancomment(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		match (nextrune(lex)?) {
		case io::EOF =>
			return ("comment", lex.loc.0, lex.loc.1): unterminated;
		case let r: rune =>
			if (r == ')') {
				break;
			};
			if (r == '(') {
				return lex.loc: invalid;
			};
			memio::appendrune(&lex.strbuf, r)!;
		};
	};
	return memio::string(&lex.strbuf)!;
};
|
|
|
|
|
|
|
|
// Scans a string literal whose opening '"' has already been consumed and
// returns its contents with escape sequences decoded by [[scanescape]].
//
// Reaching end of input before the closing '"' yields an unterminated error.
// The result is the scratch buffer's contents, and the next scan resets that
// buffer.
fn scanstr(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		match (nextrune(lex)?) {
		case io::EOF =>
			return ("string literal", lex.loc.0, lex.loc.1): unterminated;
		case let r: rune =>
			if (r == '"') {
				break;
			};
			// A backslash introduces an escape; everything else is
			// taken literally.
			const decoded = if (r == '\\') scanescape(lex)? else r;
			memio::appendrune(&lex.strbuf, decoded)!;
		};
	};
	return memio::string(&lex.strbuf)!;
};
|
|
|
|
|
2024-12-05 15:52:02 +00:00
|
|
|
// Scans what follows a '#' that has already been consumed: 't' gives true,
// 'f' gives false, and '\' starts a character literal handled by
// [[scanchar]]. Any other rune is invalid, and end of input is reported as an
// unterminated pound literal.
fn scanpound(lex: *lexer) (token | error) = {
	match (nextrune(lex)?) {
	case io::EOF =>
		return ("pound literal", lex.loc.0, lex.loc.1): unterminated;
	case let selector: rune =>
		if (selector == 't') {
			return true;
		};
		if (selector == 'f') {
			return false;
		};
		if (selector == '\\') {
			return scanchar(lex)?;
		};
		return lex.loc: invalid;
	};
};
|
|
|
|
|
|
|
|
// Scans a character literal whose `#\` prefix has already been consumed.
//
// Three forms are recognised:
//
// - a single rune followed by whitespace or end of input: that rune;
// - 'x' followed by anything other than whitespace: a hexadecimal escape,
//   handled by [[scanescape2]];
// - a name listed in [[longcharnames]].
//
// Anything else is invalid. End of input right after the prefix yields an
// unterminated error.
//
// Names that do not fit the fixed name buffer are rejected as invalid rather
// than written to it; previously such input overflowed the buffer and aborted
// the program.
//
// NOTE(review): a single rune directly followed by a delimiter (for example
// `#\a]`) is still reported as invalid, as before, while a long name in the
// same position is accepted. Confirm whether that is intended.
fn scanchar(lex: *lexer) (rune | error) = {
	static let storage: [16]u8 = [0...];
	let namebuf = memio::fixed(storage);

	const rn = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return ("character literal", lex.loc.0, lex.loc.1): unterminated;
	};

	// Peek at the following rune to tell a single character from a name.
	const lookahead = match (nextrune(lex)?) {
	case let rnn: rune =>
		yield rnn;
	case io::EOF =>
		return rn;
	};
	unget(lex, lookahead);

	if (isspace(lookahead)) {
		return rn;
	};
	if (rn == 'x') {
		return scanescape2(lex);
	};

	const rest = scanword(lex)?;
	// Check the size before writing: the name comes from untrusted input
	// and the buffer has a fixed capacity.
	if (utf8::runesz(rn) + len(rest) > len(storage)) {
		return lex.loc: invalid;
	};
	memio::appendrune(&namebuf, rn)!;
	memio::concat(&namebuf, rest)!;
	const name = memio::string(&namebuf)!;

	for (let i = 0z; i < len(longcharnames); i += 1) {
		if (name == longcharnames[i].0) {
			return longcharnames[i].1;
		};
	};
	return lex.loc: invalid;
};
|
|
|
|
|
2024-12-04 16:29:11 +00:00
|
|
|
// Decodes one escape sequence inside a string literal. The backslash has
// already been consumed; this reads the rune after it.
//
// Supported escapes: \0 \a \b \e \f \n \r \t \v \\ \" and \x (hexadecimal,
// see [[scanescape2]]). Any other rune is invalid, and end of input yields an
// unterminated error.
fn scanescape(lex: *lexer) (rune | error) = {
	const code = match (nextrune(lex)?) {
	case io::EOF =>
		return ("escape sequence", lex.loc.0, lex.loc.1): unterminated;
	case let code: rune =>
		yield code;
	};

	const decoded: rune = switch (code) {
	case '0' =>
		yield '\0';
	case 'a' =>
		yield '\a';
	case 'b' =>
		yield '\b';
	case 'e' =>
		yield '\x1b';
	case 'f' =>
		yield '\f';
	case 'n' =>
		yield '\n';
	case 'r' =>
		yield '\r';
	case 't' =>
		yield '\t';
	case 'v' =>
		yield '\v';
	case '\\', '"' =>
		// These two escapes stand for themselves.
		yield code;
	case 'x' =>
		yield scanescape2(lex)?;
	case =>
		return lex.loc: invalid;
	};
	return decoded;
};
|
|
|
|
|
|
|
|
// Handles the `\xhh...;` family of escapes: one to six hexadecimal digits
// terminated by ';'. It is a separate function because both [[scanescape]]
// and [[scanchar]] use it. Just as [[scanescape]] assumes the backslash has
// already been consumed, this function assumes the leading 'x' has been
// consumed before it is entered.
//
// Returns the rune with the given code point. The escape is invalid when it
// has no digits, more than six digits, a rune that is not an ASCII
// hexadecimal digit, or a value that is not a Unicode scalar value (above
// U+10FFFF or in the surrogate range U+D800..U+DFFF). End of input before
// the ';' yields an unterminated error.
//
// Digits are validated and counted before they are stored, so malformed
// input can no longer overflow the fixed buffer and abort the program.
fn scanescape2(lex: *lexer) (rune | error) = {
	let storage: [6]u8 = [0...];
	let hexbuf = memio::fixed(storage);
	let ndigits = 0z;

	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			return ("escape sequence", lex.loc.0, lex.loc.1): unterminated;
		};

		if (rn == ';') {
			break;
		};
		if (ndigits >= len(storage) || !ascii::isxdigit(rn)) {
			return lex.loc: invalid;
		};
		// Cannot fail: rn is a single-byte ASCII digit and the capacity
		// was checked just above.
		memio::appendrune(&hexbuf, rn)!;
		ndigits += 1;
	};

	if (ndigits == 0) {
		return lex.loc: invalid;
	};

	const hex = memio::string(&hexbuf)!;
	match (strconv::stou32(hex, strconv::base::HEX)) {
	case let codepoint: u32 =>
		// Only Unicode scalar values may become runes.
		if (codepoint > 0x10FFFF
				|| (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
			return lex.loc: invalid;
		};
		return codepoint: rune;
	case =>
		return lex.loc: invalid;
	};
};
|
2024-12-04 02:41:23 +00:00
|
|
|
|
2024-12-04 17:56:08 +00:00
|
|
|
// Reports whether rn counts as whitespace for the lexer: either ASCII
// whitespace or a rune in Unicode general category Zs (space separator).
fn isspace(rn: rune) bool = {
	return ascii::isspace(rn) || unicode::rune_gc(rn) == unicode::gc::Zs;
};
|
2024-12-04 02:41:23 +00:00
|
|
|
|
2024-12-04 17:56:08 +00:00
|
|
|
// Reports whether rn ends a word: one of ( ) [ ] { } \ : #.
fn isdelimiter(rn: rune) bool = {
	switch (rn) {
	case '(', ')', '[', ']', '{', '}', '\\', ':', '#' =>
		return true;
	case =>
		return false;
	};
};
|