kojote/parse/lex.ha

311 lines
5.7 KiB
Hare
Raw Normal View History

2024-12-04 17:56:08 +00:00
use ascii;
2024-12-04 02:41:23 +00:00
use bufio;
2024-12-04 16:29:11 +00:00
use encoding::utf8;
2024-12-04 02:41:23 +00:00
use io;
use memio;
2024-12-04 17:56:08 +00:00
use unicode;
use strconv;
2024-12-04 17:56:08 +00:00
use fmt;
2024-12-04 02:41:23 +00:00
use strings;
// Names accepted in long-form character literals, each paired with the rune
// it stands for. scanchar walks this table from top to bottom and returns the
// rune of the first name that matches. "newline" and "linefeed" both map to
// U+000A.
def longcharnames: [_](str, rune) = [
	("nul", '\0'),          // U+0000
	("alarm", '\a'),        // U+0007
	("backspace", '\b'),    // U+0008
	("tab", '\t'),          // U+0009
	("newline", '\n'),      // U+000A
	("linefeed", '\n'),     // U+000A
	("vtab", '\v'),         // U+000B
	("page", '\f'),         // U+000C
	("return", '\r'),       // U+000D
	("esc", '\x1b'),        // U+001B
	("space", ' '),         // U+0020
	("delete", '\x7f'),     // U+007F
];
2024-12-04 02:41:23 +00:00
// State carried by the lexer between calls.
export type lexer = struct {
	// Handle the runes are read from.
	in: io::handle,

	// Scratch stream in which the scan functions accumulate token text.
	// Each of them resets it before use.
	// NOTE(review): strings produced by those functions presumably borrow
	// from this buffer and go stale on the next scan - confirm against the
	// memio documentation.
	strbuf: memio::stream,

	// Source name handed to newlexer; kept as given.
	path: str,

	// (line, column) reached after the most recently read rune. The line
	// starts at 1; reading '\n' moves to column 0 of the next line, any
	// other rune adds one to the column.
	loc: (uint, uint),

	// Value loc had before the most recent rune was read. unget restores
	// loc from it.
	prevloc: (uint, uint),

	// Rune pushed back by unget and returned by the next read, or void when
	// nothing is pending.
	unread: (rune | void),
};
// Creates a lexer reading from the given handle. The path is stored on the
// lexer as given. The location starts at line 1, column 0, nothing is pushed
// back, and every remaining field takes its zero value. The lexer owns a
// dynamic scratch buffer; hand the lexer to [[close]] to release it.
export fn newlexer(in: io::handle, path: str) lexer = {
	const scratch = memio::dynamic();
	return lexer {
		path = path,
		in = in,
		strbuf = scratch,
		unread = void,
		loc = (1, 0),
		...
	};
};
// Closes the lexer's scratch buffer, aborting if that fails. The input handle
// is not closed here.
export fn close(lex: *lexer) void = {
	const scratch = &lex.strbuf;
	io::close(scratch)!;
};
2024-12-04 17:56:08 +00:00
// Reads the next token after skipping whitespace. Returns io::EOF when the
// input ends before a token starts.
//
// The first non-space rune decides what is scanned:
// - '(' : a comment, up to the closing ')'
// - '[' and ']' : quotstart and quotend
// - '{' and '}' : mapstart and mapend
// - '\' or ':' : a symbol named by the word that follows; kw is true only
//   for ':'
// - '#' : a hash literal (see scanhash)
// - '"' : a string literal
// - anything else : a word beginning with that rune
//
// A ')' outside a comment, or a '\' or ':' with no name after it, is reported
// as invalid at the current location.
//
// NOTE(review): token text comes out of the lexer's shared scratch buffer,
// which every scan resets first - presumably callers must copy it before
// lexing again; confirm.
export fn lex(lex: *lexer) (token | io::EOF | error) = {
	const first = match (nextrunews(lex)?) {
	case io::EOF =>
		return io::EOF;
	case let r: rune =>
		yield r;
	};

	switch (first) {
	case '(' =>
		return comment{ v = scancomment(lex)? };
	case ')' =>
		return lex.loc: invalid;
	case '[' =>
		return quotstart;
	case ']' =>
		return quotend;
	case '{' =>
		return mapstart;
	case '}' =>
		return mapend;
	case '\\', ':' =>
		// Both prefixes introduce a symbol; only ':' marks it as a
		// keyword.
		const name = scanword(lex)?;
		if (len(name) == 0) {
			return lex.loc: invalid;
		};
		return symbol{ v = name, kw = first == ':' };
	case '#' =>
		return scanhash(lex)?;
	case '"' =>
		return scanstr(lex)?;
	case =>
		// Not a prefix rune: push it back so that it becomes the first
		// rune of the word.
		unget(lex, first);
		return word{ v = scanword(lex)? };
	};
};
2024-12-04 16:29:11 +00:00
// Returns the next rune and advances the location past it. A rune pushed back
// by unget is returned in preference to reading from the input.
//
// Before the location moves, its old value is saved in prevloc so that one
// unget can restore it. A '\n' moves to column 0 of the following line; any
// other rune adds one to the column.
//
// io::EOF and I/O errors are passed through, and malformed UTF-8 is reported
// as invalid at the current location. None of these outcomes modify loc or
// prevloc.
fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
	const rn: rune = match (lex.unread) {
	case let pending: rune =>
		lex.unread = void;
		yield pending;
	case void =>
		yield match (bufio::read_rune(lex.in)) {
		case let fresh: rune =>
			yield fresh;
		case io::EOF =>
			return io::EOF;
		case let err: io::error =>
			return err;
		case utf8::invalid =>
			return lex.loc: invalid;
		};
	};

	// Shared by both sources of runes: remember where we were, then step.
	lex.prevloc = lex.loc;
	if (rn == '\n') {
		lex.loc = (lex.loc.0 + 1, 0);
	} else {
		lex.loc.1 += 1;
	};
	return rn;
};
// Returns the first rune that is not whitespace according to isspace,
// discarding the whitespace in front of it. Returns io::EOF if the input ends
// first; errors from nextrune are propagated.
fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
	for (true) {
		const rn = match (nextrune(lex)?) {
		case io::EOF =>
			return io::EOF;
		case let rn: rune =>
			yield rn;
		};
		if (!isspace(rn)) {
			return rn;
		};
	};
};
// Pushes a rune back so that the next nextrune call returns it again, and
// rewinds loc to the location saved before that rune was read. Only a single
// rune can be pending: asserts that nothing has been pushed back already.
fn unget(lex: *lexer, rn: rune) void = {
	assert(lex.unread is void);
	lex.loc = lex.prevloc;
	lex.unread = rn;
};
2024-12-04 16:29:11 +00:00
// Collects runes into the scratch buffer until whitespace, a delimiter (see
// isdelimiter) or the end of input, and returns the collected text. The rune
// that stopped the scan is pushed back, not consumed. The result is empty
// when the very first rune already stops the scan.
//
// Aborts if appending to the scratch buffer or reading it back as a string
// fails.
fn scanword(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		match (nextrune(lex)?) {
		case io::EOF =>
			break;
		case let rn: rune =>
			if (!isspace(rn) && !isdelimiter(rn)) {
				memio::appendrune(&lex.strbuf, rn)!;
				continue;
			};
			// Leave the terminator for whoever scans next.
			unget(lex, rn);
			break;
		};
	};
	return memio::string(&lex.strbuf)!;
};
2024-12-04 17:56:08 +00:00
// Scans the body of a comment whose opening '(' has already been consumed and
// returns the text in front of the closing ')'. The ')' is consumed.
//
// Errors: unterminated if the input ends before ')'; invalid if another '('
// appears inside the comment (comments do not nest).
fn scancomment(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const rn = match (nextrune(lex)?) {
		case io::EOF =>
			return ("comment", lex.loc.0, lex.loc.1): unterminated;
		case let rn: rune =>
			yield rn;
		};
		if (rn == ')') {
			break;
		};
		if (rn == '(') {
			return lex.loc: invalid;
		};
		memio::appendrune(&lex.strbuf, rn)!;
	};
	return memio::string(&lex.strbuf)!;
};
// Scans a string literal whose opening '"' has already been consumed and
// returns its contents with escape sequences decoded by scanescape. The
// closing '"' is consumed.
//
// Errors: unterminated if the input ends before the closing quote; whatever
// scanescape reports for a bad escape.
fn scanstr(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const rn = match (nextrune(lex)?) {
		case io::EOF =>
			return ("string literal", lex.loc.0, lex.loc.1): unterminated;
		case let rn: rune =>
			yield rn;
		};
		if (rn == '"') {
			break;
		};
		// A backslash stands for the rune its escape sequence decodes
		// to; everything else stands for itself.
		const decoded = if (rn == '\\') scanescape(lex)? else rn;
		memio::appendrune(&lex.strbuf, decoded)!;
	};
	return memio::string(&lex.strbuf)!;
};
// Scans what follows a '#' that has already been consumed. The next rune
// picks the literal: 't' is true, 'f' is false, and '\' starts a character
// literal handled by scanchar. Only that one rune is examined here.
//
// Errors: unterminated if the input ends right after '#'; invalid for any
// other rune.
fn scanhash(lex: *lexer) (token | error) = {
	const kind = match (nextrune(lex)?) {
	case io::EOF =>
		return ("hash literal", lex.loc.0, lex.loc.1): unterminated;
	case let r: rune =>
		yield r;
	};

	if (kind == 't') {
		return true;
	};
	if (kind == 'f') {
		return false;
	};
	if (kind == '\\') {
		return scanchar(lex)?;
	};
	return lex.loc: invalid;
};
// Scans a character literal whose "#\" prefix has already been consumed.
//
// The rune right after the backslash is always taken, whatever it is. If the
// literal ends there (the next rune is whitespace or a delimiter, or the
// input ends), that rune is the result. Otherwise the rune together with the
// rest of the word must spell one of the names in longcharnames, and the rune
// paired with that name is the result.
//
// Errors: unterminated if the input ends right after "#\"; invalid, at the
// location reached after scanning the word, if the name is not in the table.
fn scanchar(lex: *lexer) (rune | error) = {
	const rn = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return ("character literal", lex.loc.0, lex.loc.1): unterminated;
	};

	// scanword stops at whitespace, a delimiter or the end of input and
	// pushes the stopping rune back, so an empty result means the literal
	// is just the one rune. Treating delimiters like whitespace here is
	// what lets "[#\a]" lex as the character 'a' followed by ']'.
	const rest = scanword(lex)?;
	if (len(rest) == 0) {
		return rn;
	};

	// Match each name as "first rune + remainder" against rn and rest.
	// This avoids assembling the name in a fixed-size buffer, which a long
	// word in the input could overflow.
	for (let i = 0z; i < len(longcharnames); i += 1) {
		let it = strings::iter(longcharnames[i].0);
		match (strings::next(&it)) {
		case let first: rune =>
			if (first == rn && strings::iterstr(&it) == rest) {
				return longcharnames[i].1;
			};
		case =>
			// An empty name cannot match anything.
			continue;
		};
	};
	return lex.loc: invalid;
};
2024-12-04 16:29:11 +00:00
// Decodes the escape sequence following a backslash that has already been
// consumed inside a string literal. Exactly one rune is read:
// 0, a, b, e, f, n, r, t and v produce the corresponding control character
// (e is U+001B), while a backslash or a double quote produces itself.
//
// Errors: unterminated if the input ends after the backslash; invalid for
// any other rune.
fn scanescape(lex: *lexer) (rune | error) = {
	const code = match (nextrune(lex)?) {
	case io::EOF =>
		return ("escape sequence", lex.loc.0, lex.loc.1): unterminated;
	case let r: rune =>
		yield r;
	};

	const decoded: rune = switch (code) {
	case '0' =>
		yield '\0';
	case 'a' =>
		yield '\a';
	case 'b' =>
		yield '\b';
	case 'e' =>
		yield '\x1b';
	case 'f' =>
		yield '\f';
	case 'n' =>
		yield '\n';
	case 'r' =>
		yield '\r';
	case 't' =>
		yield '\t';
	case 'v' =>
		yield '\v';
	case '\\', '"' =>
		// These two escape to themselves.
		yield code;
	case =>
		return lex.loc: invalid;
	};
	return decoded;
};
2024-12-04 02:41:23 +00:00
2024-12-04 17:56:08 +00:00
// Reports whether a rune counts as whitespace for the lexer: either
// ascii::isspace accepts it, or its Unicode general category is Zs.
fn isspace(rn: rune) bool = {
	// The category lookup only runs for runes ascii::isspace rejects.
	return ascii::isspace(rn)
		|| unicode::rune_gc(rn) == unicode::gc::Zs;
};
2024-12-04 02:41:23 +00:00
2024-12-04 17:56:08 +00:00
// Reports whether a rune ends a word: one of ( ) [ ] { } \ : #
// The double quote is not in this set.
fn isdelimiter(rn: rune) bool = {
	const found = strings::index(`()[]{}\:#`, rn);
	return found is size;
};