kojote/parse/lex.ha

412 lines
7.8 KiB
Hare
Raw Normal View History

2024-12-04 17:56:08 +00:00
use ascii;
2024-12-04 02:41:23 +00:00
use bufio;
2024-12-04 16:29:11 +00:00
use encoding::utf8;
2024-12-04 02:41:23 +00:00
use io;
use memio;
2024-12-04 17:56:08 +00:00
use unicode;
use strconv;
2024-12-04 17:56:08 +00:00
use fmt;
2024-12-04 02:41:23 +00:00
use strings;
// my cod prob sux :(
// Named character literals: each entry pairs the name written after "#\" in
// source text with the rune it stands for. Lookups in this file walk the table
// from the top, so when two names share a rune ("newline" and "linefeed") the
// earlier entry is the one found when going from rune back to name.
def longcharnames: [_](str, rune) = [
	("nul", '\u0000'),
	("alarm", '\u0007'),
	("backspace", '\u0008'),
	("newline", '\u000a'),
	("tab", '\u0009'),
	("linefeed", '\u000a'),
	("vtab", '\u000b'),
	("page", '\u000c'),
	("return", '\u000d'),
	("esc", '\u001b'),
	("space", '\u0020'),
	("delete", '\u007f'),
];
2024-12-04 02:41:23 +00:00
// State carried while tokenizing one input stream.
export type lexer = struct {
	// Handle the runes are read from.
	in: io::handle,
	// Scratch buffer in which the text of the token being scanned is
	// accumulated; it is reset at the start of each scan.
	strbuf: memio::stream,
	// Name of the input, stored exactly as passed to newlexer.
	path: str,
	// Current position as (line, column). The line is incremented and the
	// column reset to 0 on '\n'; any other rune increments the column.
	loc: (uint, uint),
	// Value loc had before the most recent rune was consumed; unget
	// restores loc from it.
	prevloc: (uint, uint),
	// A single rune that was pushed back with unget, or void if none.
	unread: (rune | void),
};
// Creates a lexer reading from in. The position starts at line 1, column 0,
// no rune is pending, and a fresh dynamic buffer is allocated for token text.
// Pass the result to close when finished so that buffer is closed.
export fn newlexer(in: io::handle, path: str) lexer = {
	return lexer {
		in = in,
		strbuf = memio::dynamic(),
		path = path,
		loc = (1, 0),
		// Nothing has been read yet, so there is no earlier position.
		prevloc = (0, 0),
		unread = void,
	};
};
// Closes the lexer's internal token buffer. The input handle is left alone.
// Aborts if closing the buffer reports an error.
export fn close(lex: *lexer) void =
	io::close(&lex.strbuf)!;
2024-12-04 17:56:08 +00:00
// Reads the next token from the lexer, skipping leading whitespace. Returns
// io::EOF when the input ends before another token starts.
//
// Strings carried by the returned token come from the lexer's internal
// buffer, which is reset by the next scan.
export fn lex(lex: *lexer) (token | io::EOF | error) = {
	const first = match (nextrunews(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return io::EOF;
	};

	switch (first) {
	case '(' =>
		return comment { v = scancomment(lex)? };
	case ')' =>
		// A closing parenthesis with no comment open.
		return lex.loc: invalid;
	case '[' =>
		return quotstart;
	case ']' =>
		return quotend;
	case '{' =>
		return mapstart;
	case '}' =>
		return mapend;
	case '\\', ':' =>
		// Both sigils introduce a symbol; ':' marks it as a keyword.
		const name = scanword(lex)?;
		if (len(name) == 0) {
			return lex.loc: invalid;
		};
		return symbol { v = name, kw = (first == ':') };
	case '#' =>
		return scanhash(lex)?;
	case '"' =>
		return scanstr(lex)?;
	case =>
		// Anything else starts a plain word; push the rune back so that
		// scanword sees it.
		unget(lex, first);
		return word { v = scanword(lex)? };
	};
};
2024-12-04 16:29:11 +00:00
// Returns the next rune, preferring a rune pushed back by unget over reading
// from the input. On success the previous location is saved and the current
// location advanced. Returns io::EOF at end of input; I/O errors are passed
// through and malformed UTF-8 is reported as invalid at the current location.
fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
	const rn = match (lex.unread) {
	case let pending: rune =>
		lex.unread = void;
		yield pending;
	case void =>
		yield match (bufio::read_rune(lex.in)) {
		case let fresh: rune =>
			yield fresh;
		case io::EOF =>
			return io::EOF;
		case let err: io::error =>
			return err;
		case utf8::invalid =>
			return lex.loc: invalid;
		};
	};

	// Location bookkeeping is the same for both sources of runes.
	lex.prevloc = lex.loc;
	if (rn == '\n') {
		lex.loc = (lex.loc.0 + 1, 0);
	} else {
		lex.loc.1 += 1;
	};
	return rn;
};
// Returns the next rune for which isspace is false, discarding any whitespace
// in front of it, or io::EOF if the input ends first.
fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
	for (true) {
		const rn = match (nextrune(lex)?) {
		case io::EOF =>
			return io::EOF;
		case let rn: rune =>
			yield rn;
		};
		if (!isspace(rn)) {
			return rn;
		};
	};
};
// Pushes rn back so that the next call to nextrune returns it, and rewinds the
// location to what it was before rn was read. Only one rune may be pending at
// a time; this is asserted.
fn unget(lex: *lexer, rn: rune) void = {
	assert(lex.unread is void);
	lex.loc = lex.prevloc;
	lex.unread = rn;
};
2024-12-04 16:29:11 +00:00
// Collects runes up to, but not including, the next whitespace or delimiter
// rune (or end of input) and returns them as a string. The terminating rune is
// pushed back. The result may be empty, and it is borrowed from the lexer's
// buffer, which the next scan resets.
fn scanword(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const rn = match (nextrune(lex)?) {
		case io::EOF =>
			break;
		case let rn: rune =>
			yield rn;
		};
		if (!isspace(rn) && !isdelimiter(rn)) {
			memio::appendrune(&lex.strbuf, rn)!;
			continue;
		};
		// Leave the terminator for whoever scans next.
		unget(lex, rn);
		break;
	};
	return memio::string(&lex.strbuf)!;
};
2024-12-04 17:56:08 +00:00
// Scans the body of a comment whose opening '(' has already been consumed and
// returns the text before the closing ')'. A nested '(' is rejected as
// invalid, and reaching end of input first yields an unterminated error. The
// result is borrowed from the lexer's buffer.
fn scancomment(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const rn = match (nextrune(lex)?) {
		case io::EOF =>
			return ("comment", lex.loc.0, lex.loc.1): unterminated;
		case let rn: rune =>
			yield rn;
		};
		if (rn == ')') {
			break;
		};
		if (rn == '(') {
			return lex.loc: invalid;
		};
		memio::appendrune(&lex.strbuf, rn)!;
	};
	return memio::string(&lex.strbuf)!;
};
// Scans a string literal whose opening quote has already been consumed and
// returns its contents with escape sequences decoded. Reaching end of input
// before the closing quote yields an unterminated error. The result is
// borrowed from the lexer's buffer.
fn scanstr(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const rn = match (nextrune(lex)?) {
		case io::EOF =>
			return ("string literal", lex.loc.0, lex.loc.1): unterminated;
		case let rn: rune =>
			yield rn;
		};
		if (rn == '"') {
			break;
		};
		// A backslash is replaced by whatever the escape decodes to.
		const decoded = if (rn == '\\') scanescape(lex)? else rn;
		memio::appendrune(&lex.strbuf, decoded)!;
	};
	return memio::string(&lex.strbuf)!;
};
// Scans the token that follows an already-consumed '#': "t" gives true, "f"
// gives false and a backslash starts a character literal. Any other rune is
// invalid, and end of input yields an unterminated error.
fn scanhash(lex: *lexer) (token | error) = {
	const rn = match (nextrune(lex)?) {
	case io::EOF =>
		return ("hash literal", lex.loc.0, lex.loc.1): unterminated;
	case let rn: rune =>
		yield rn;
	};
	if (rn == 't') {
		return true;
	};
	if (rn == 'f') {
		return false;
	};
	if (rn == '\\') {
		return scanchar(lex)?;
	};
	return lex.loc: invalid;
};
// Scans the rest of a character literal; the leading "#\" has already been
// consumed. A single rune followed by whitespace, a delimiter or end of input
// is returned as itself. Otherwise the rune together with the word that
// follows must spell one of the names in longcharnames, and the rune for that
// name is returned; an unknown name is invalid. End of input directly after
// "#\" yields an unterminated error.
fn scanchar(lex: *lexer) (rune | error) = {
	const first = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return ("character literal", lex.loc.0, lex.loc.1): unterminated;
	};

	// Peek one rune ahead to tell a lone character from a long name.
	// Delimiters end the literal just like whitespace does, so that input
	// such as "[#\a]" lexes as a character followed by ']'.
	match (nextrune(lex)?) {
	case let peek: rune =>
		unget(lex, peek);
		if (isspace(peek) || isdelimiter(peek)) {
			return first;
		};
	case io::EOF =>
		return first;
	};

	// The remainder lives in the lexer's growable buffer, so an overlong
	// name is simply not found instead of overflowing a fixed-size
	// scratch buffer and aborting.
	const rest = scanword(lex)?;
	for (let i = 0z; i < len(longcharnames); i += 1) {
		// Match the candidate as first rune + rest without copying.
		let it = strings::iter(longcharnames[i].0);
		match (strings::next(&it)) {
		case let head: rune =>
			if (head == first && strings::iterstr(&it) == rest) {
				return longcharnames[i].1;
			};
		case =>
			void;
		};
	};
	return lex.loc: invalid;
};
2024-12-04 16:29:11 +00:00
// Decodes the escape sequence that follows an already-consumed backslash in a
// string literal. Supported escapes are \" \\ \n and \t; anything else is
// invalid, and end of input yields an unterminated error.
fn scanescape(lex: *lexer) (rune | error) = {
	const rn = match (nextrune(lex)?) {
	case io::EOF =>
		return ("escape sequence", lex.loc.0, lex.loc.1): unterminated;
	case let rn: rune =>
		yield rn;
	};
	const decoded: rune = switch (rn) {
	case '"', '\\' =>
		// These two stand for themselves.
		yield rn;
	case 'n' =>
		yield '\n';
	case 't' =>
		yield '\t';
	case =>
		return lex.loc: invalid;
	};
	return decoded;
};
2024-12-04 02:41:23 +00:00
2024-12-04 17:56:08 +00:00
// Reports whether rn counts as whitespace for the lexer: either an ASCII
// whitespace character or a rune whose Unicode general category is Zs.
fn isspace(rn: rune) bool = {
	return ascii::isspace(rn)
		|| unicode::rune_gc(rn) == unicode::gc::Zs;
};
2024-12-04 02:41:23 +00:00
2024-12-04 17:56:08 +00:00
// Reports whether rn is one of the runes that terminate a word:
// ( ) [ ] { } \ : #
fn isdelimiter(rn: rune) bool = {
	return strings::index(`()[]{}\:#`, rn) is size;
};
2024-12-04 17:56:08 +00:00
// Table-driven test for lex: each case pairs a source string with the exact
// sequence of tokens it must produce, after which the lexer must report
// io::EOF.
@test fn lex() void = {
	const cases: [_](str, []token) = [
		(`"hello" \greeting def`,
			["hello", mksym("greeting"), mkword("def")]),
		(`[dup *] (a -- a) \square def`,
			[quotstart, mkword("dup"), mkword("*"), quotend,
				mkcomment("a -- a"), mksym("square"),
				mkword("def")]),
		(`#t #f`, [true, false]),
		(`#\a #\space #\nul`, ['a', ' ', '\0']),
		// Map brackets and keyword symbols.
		(`{ :a 1 }`,
			[mapstart, mksym("a", true), mkword("1"), mapend]),
		// Escape sequences inside a string literal.
		(`"tab\there"`, ["tab\there"]),
	];

	for (let i = 0z; i < len(cases); i += 1) {
		const src = strings::toutf8(cases[i].0);
		const src = memio::fixed(src);
		const lexer = newlexer(&src, "<string>");
		defer close(&lexer);

		for (let j = 0z; j < len(cases[i].1); j += 1) {
			const want = cases[i].1[j];
			const have = lex(&lexer)! as token;
			// tokeq is declared as (have, want); pass them in
			// that order.
			if (!tokeq(have, want)) {
				fmt::printfln("Case {}: {}", i, cases[i].0)!;
				fmt::print("\tExpected: ")!;
				tokpprint(want);
				fmt::print("\tGot: ")!;
				tokpprint(have);
				assert(false);
			};
		};

		assert(lex(&lexer) is io::EOF);
	};
};
2024-12-04 02:41:23 +00:00
2024-12-04 17:56:08 +00:00
// Reports whether two tokens are equal: they must hold the same kind of token
// and, where the kind carries a value, the same value. Symbols must also agree
// on whether they are keywords. Tokens of different kinds compare unequal
// rather than aborting, so callers can report the mismatch.
fn tokeq(have: token, want: token) bool = {
	match (want) {
	case quotstart =>
		return have is quotstart;
	case quotend =>
		return have is quotend;
	case mapstart =>
		return have is mapstart;
	case mapend =>
		return have is mapend;
	case let w: word =>
		return (have is word) && (have as word).v == w.v;
	case let s: str =>
		return (have is str) && (have as str) == s;
	case let s: symbol =>
		match (have) {
		case let h: symbol =>
			return h.v == s.v && h.kw == s.kw;
		case =>
			return false;
		};
	case let c: comment =>
		return (have is comment) && (have as comment).v == c.v;
	case let r: rune =>
		return (have is rune) && (have as rune) == r;
	case let b: bool =>
		return (have is bool) && (have as bool) == b;
	};
};
// Prints a token to standard output in source-like notation, followed by a
// newline. Runes that have an entry in longcharnames are printed by name,
// using the first matching entry.
fn tokpprint(tok: token) void = {
	match (tok) {
	case quotstart =>
		fmt::println("[")!;
	case quotend =>
		fmt::println("]")!;
	case mapstart =>
		fmt::println("{")!;
	case mapend =>
		fmt::println("}")!;
	case let w: word =>
		fmt::println(w.v)!;
	case let s: symbol =>
		const sigil = if (s.kw) ":" else "\\";
		fmt::printfln("{}{}", sigil, s.v)!;
	case let s: str =>
		fmt::printfln(`"{}"`, s)!;
	case let c: comment =>
		fmt::printfln("({})", c.v)!;
	case let r: rune =>
		for (let i = 0z; i < len(longcharnames); i += 1) {
			if (longcharnames[i].1 != r) {
				continue;
			};
			fmt::printfln("#\\{}", longcharnames[i].0)!;
			return;
		};
		// No long name for this rune; print it literally.
		fmt::printfln("#\\{}", r)!;
	case let b: bool =>
		const text = if (b) "#t" else "#f";
		fmt::println(text)!;
	};
};
2024-12-04 16:29:11 +00:00
2024-12-04 17:56:08 +00:00
// Builds a word token holding v.
fn mkword(v: const str) word = {
	return word { v = v };
};
2024-12-04 16:29:11 +00:00
2024-12-04 17:56:08 +00:00
// Builds a comment token holding v.
fn mkcomment(v: const str) comment = {
	return comment { v = v };
};
2024-12-04 16:29:11 +00:00
2024-12-04 17:56:08 +00:00
// Builds a symbol token named v. kw marks it as a keyword symbol and defaults
// to false.
fn mksym(v: const str, kw: bool = false) symbol = {
	return symbol { v = v, kw = kw };
};
2024-12-04 16:29:11 +00:00