kojote/parse/lex.ha

364 lines
6.9 KiB
Hare

use ascii;
use bufio;
use encoding::utf8;
use io;
use memio;
use unicode;
use strconv;
use fmt;
use strings;
// Table of multi-character names accepted after `#\` in a character literal,
// each paired with the rune it denotes. [[scanchar]] searches it linearly by
// name. "newline" and "linefeed" both map to U+000A.
def longcharnames: [_](str, rune) = [
	("nul", '\0'),
	("alarm", '\a'),
	("backspace", '\b'),
	("tab", '\t'),
	("newline", '\n'),
	("linefeed", '\n'),
	("vtab", '\v'),
	("page", '\f'),
	("return", '\r'),
	("esc", '\x1b'),
	("space", ' '),
	("delete", '\x7f'),
];
// State of a lexer reading runes from an I/O handle. Create one with
// [[newlexer]] and release it with [[close]].
export type lexer = struct {
// Handle the runes are read from.
in: io::handle,
// Scratch buffer in which words, comments and string literals are
// accumulated; each scan function resets it before use.
strbuf: memio::stream,
// Path given to [[newlexer]]; stored as-is.
path: str,
// Current position. The first member is incremented on every '\n' (and the
// second reset to 0); the second is incremented for every other rune, so
// the pair reads as (line, column).
loc: (uint, uint),
// Value of loc before the most recently read rune; [[unget]] restores it.
prevloc: (uint, uint),
// A single pushed-back rune that the next read returns, or void if none.
unread: (rune | void),
};
// Creates a lexer that reads from the given handle. The path is stored on the
// lexer unchanged. The position starts at (1, 0) and no rune is pushed back.
// The scratch buffer is created with [[memio::dynamic]]; pass the lexer to
// [[close]] when finished with it.
export fn newlexer(in: io::handle, path: str) lexer = lexer {
	in = in,
	path = path,
	loc = (1, 0),
	unread = void,
	strbuf = memio::dynamic(),
	...
};
// Closes the lexer's internal scratch buffer. The input handle is not touched
// here. Aborts if closing the buffer reports an error.
export fn close(lex: *lexer) void = io::close(&lex.strbuf)!;
// Reads the next token from the lexer, skipping leading whitespace. Returns
// io::EOF once the input is exhausted.
//
// The first non-space rune selects the token kind:
//   (      comment, scanned up to the closing parenthesis
//   )      invalid (no comment is open)
//   [ ]    quotstart / quotend
//   { }    mapstart / mapend
//   \ :    symbol (kw is true for ':'); an empty name is invalid
//   #      pound literal, see [[scanpound]]
//   "      string literal, see [[scanstr]]
// Anything else starts a word.
//
// NOTE(review): string payloads are obtained from the lexer's shared scratch
// buffer, which the next scan resets - presumably callers must copy them
// before lexing again; confirm.
export fn lex(lex: *lexer) (token | io::EOF | error) = {
	const first = match (nextrunews(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return io::EOF;
	};

	switch (first) {
	case '(' =>
		return comment{ v = scancomment(lex)? };
	case ')' =>
		return lex.loc: invalid;
	case '[' =>
		return quotstart;
	case ']' =>
		return quotend;
	case '{' =>
		return mapstart;
	case '}' =>
		return mapend;
	case '\\', ':' =>
		// Both prefixes introduce a symbol; only the kw flag differs.
		const name = scanword(lex)?;
		if (len(name) == 0) {
			return lex.loc: invalid;
		};
		return symbol{ v = name, kw = first == ':' };
	case '#' =>
		return scanpound(lex)?;
	case '"' =>
		return scanstr(lex)?;
	case =>
		// Not a prefix rune: it is the first rune of a word, so hand
		// it back to be re-read by scanword below.
		unget(lex, first);
	};

	return word{ v = scanword(lex)? };
};
// Returns the next rune of input, or io::EOF. A rune previously pushed back
// with [[unget]] is returned first; otherwise one is read from the input
// handle. I/O errors are propagated, and malformed UTF-8 is reported as
// invalid at the current position.
//
// For every rune returned, the old position is saved in prevloc and the
// position is advanced: '\n' moves to the next line at column 0, any other
// rune moves one column forward.
fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
	const rn: rune = match (lex.unread) {
	case let pending: rune =>
		lex.unread = void;
		yield pending;
	case void =>
		yield match (bufio::read_rune(lex.in)) {
		case let fresh: rune =>
			yield fresh;
		case io::EOF =>
			return io::EOF;
		case let err: io::error =>
			return err;
		case utf8::invalid =>
			return lex.loc: invalid;
		};
	};

	lex.prevloc = lex.loc;
	if (rn == '\n') {
		lex.loc = (lex.loc.0 + 1, 0);
	} else {
		lex.loc.1 += 1;
	};
	return rn;
};
// Like [[nextrune]], but skips over every rune for which [[isspace]] is true
// and returns the first one that is not whitespace (or io::EOF).
fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
	for (true) {
		const rn = match (nextrune(lex)?) {
		case io::EOF =>
			return io::EOF;
		case let rn: rune =>
			yield rn;
		};
		if (!isspace(rn)) {
			return rn;
		};
	};
};
// Pushes a rune back so that the next [[nextrune]] call returns it, and
// rewinds the position to what it was before that rune was read. Only one
// rune can be pending at a time; pushing back a second one is a programming
// error and trips the assertion.
fn unget(lex: *lexer, rn: rune) void = {
	assert(lex.unread is void);
	lex.loc = lex.prevloc;
	lex.unread = rn;
};
// Collects runes into the scratch buffer until whitespace, a delimiter (see
// [[isdelimiter]]) or end of input. The terminating rune is pushed back, not
// consumed. The result may be empty if the very first rune terminates the
// word.
//
// The returned string comes from the lexer's scratch buffer, which is reset
// at the start of every scan.
fn scanword(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		match (nextrune(lex)?) {
		case io::EOF =>
			break;
		case let rn: rune =>
			if (isspace(rn) || isdelimiter(rn)) {
				unget(lex, rn);
				break;
			};
			memio::appendrune(&lex.strbuf, rn)!;
		};
	};
	return memio::string(&lex.strbuf)!;
};
// Scans the body of a comment; the opening '(' has already been consumed.
// Everything up to (but not including) the closing ')' is returned. A nested
// '(' is rejected as invalid, and reaching end of input before ')' yields an
// unterminated error.
//
// The returned string comes from the lexer's scratch buffer, which is reset
// at the start of every scan.
fn scancomment(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		match (nextrune(lex)?) {
		case io::EOF =>
			return ("comment", lex.loc.0, lex.loc.1): unterminated;
		case let rn: rune =>
			if (rn == ')') {
				break;
			};
			if (rn == '(') {
				return lex.loc: invalid;
			};
			memio::appendrune(&lex.strbuf, rn)!;
		};
	};
	return memio::string(&lex.strbuf)!;
};
// Scans a string literal; the opening '"' has already been consumed. Runes
// are copied verbatim up to the closing '"', except that a backslash starts
// an escape sequence which is decoded by [[scanescape]]. End of input before
// the closing quote yields an unterminated error.
//
// The returned string comes from the lexer's scratch buffer, which is reset
// at the start of every scan.
fn scanstr(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		match (nextrune(lex)?) {
		case io::EOF =>
			return ("string literal", lex.loc.0, lex.loc.1): unterminated;
		case let rn: rune =>
			if (rn == '"') {
				break;
			};
			const decoded = if (rn == '\\') scanescape(lex)? else rn;
			memio::appendrune(&lex.strbuf, decoded)!;
		};
	};
	return memio::string(&lex.strbuf)!;
};
// Scans a literal introduced by '#'; the '#' has already been consumed. The
// next rune decides the result: 't' is true, 'f' is false, and a backslash
// starts a character literal (see [[scanchar]]). Any other rune is invalid,
// and end of input yields an unterminated error.
fn scanpound(lex: *lexer) (token | error) = {
	const tag = match (nextrune(lex)?) {
	case io::EOF =>
		return ("pound literal", lex.loc.0, lex.loc.1): unterminated;
	case let rn: rune =>
		yield rn;
	};

	if (tag == 't') {
		return true;
	};
	if (tag == 'f') {
		return false;
	};
	if (tag == '\\') {
		return scanchar(lex)?;
	};
	return lex.loc: invalid;
};
// Scans a character literal; the leading `#\` has already been consumed.
//
// Three forms are recognised:
//   - a single rune followed by whitespace, a delimiter (see [[isdelimiter]])
//     or end of input denotes itself;
//   - 'x' followed by anything else is a hexadecimal escape, handled by
//     [[scanescape2]];
//   - otherwise the rune and the word following it form a name that is looked
//     up in [[longcharnames]].
// Unknown names are invalid, and end of input right after `#\` yields an
// unterminated error.
fn scanchar(lex: *lexer) (rune | error) = {
	const first = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return ("character literal", lex.loc.0, lex.loc.1): unterminated;
	};

	// Look one rune ahead without consuming it.
	const peeked = match (nextrune(lex)?) {
	case let rn: rune =>
		unget(lex, rn);
		yield rn;
	case io::EOF =>
		return first;
	};

	// Delimiters end a one-rune literal just like whitespace does, so that
	// e.g. `[#\a]` lexes as a character followed by ']'. They already
	// terminate long names, since scanword stops at them.
	if (isspace(peeked) || isdelimiter(peeked)) {
		return first;
	};
	if (first == 'x') {
		return scanescape2(lex);
	};

	// Assemble the full name in a small fixed buffer. No known name comes
	// close to 16 bytes, so a name that does not fit cannot match: report
	// it as invalid rather than aborting on the failed write.
	let store: [16]u8 = [0...];
	let namebuf = memio::fixed(store);
	memio::appendrune(&namebuf, first)!;
	match (memio::concat(&namebuf, scanword(lex)?)) {
	case size =>
		yield;
	case =>
		return lex.loc: invalid;
	};

	const name = memio::string(&namebuf)!;
	for (let i = 0z; i < len(longcharnames); i += 1) {
		if (name == longcharnames[i].0) {
			return longcharnames[i].1;
		};
	};
	return lex.loc: invalid;
};
// Decodes one escape sequence inside a string literal; the backslash has
// already been consumed. Single-letter escapes map directly to a rune, 'x'
// hands over to [[scanescape2]] for the hexadecimal form, and anything else
// is invalid. End of input yields an unterminated error.
fn scanescape(lex: *lexer) (rune | error) = {
	// (escape letter, resulting rune)
	const simple: [_](rune, rune) = [
		('0', '\0'),
		('a', '\a'),
		('b', '\b'),
		('e', '\x1b'),
		('f', '\f'),
		('n', '\n'),
		('r', '\r'),
		('t', '\t'),
		('v', '\v'),
		('\\', '\\'),
		('"', '"'),
	];

	const code = match (nextrune(lex)?) {
	case io::EOF =>
		return ("escape sequence", lex.loc.0, lex.loc.1): unterminated;
	case let rn: rune =>
		yield rn;
	};

	if (code == 'x') {
		return scanescape2(lex)?;
	};
	for (let i = 0z; i < len(simple); i += 1) {
		if (simple[i].0 == code) {
			return simple[i].1;
		};
	};
	return lex.loc: invalid;
};
// Handles the `\xhh...;` family of escapes: one to six hexadecimal digits
// terminated by a semicolon, naming a Unicode code point.
//
// It is a separate function because both [[scanescape]] and [[scanchar]] use
// it. Much like [[scanescape]] assumes the backslash has already been
// consumed, this function assumes the leading 'x' has been consumed.
//
// Returns invalid if a rune is not a hexadecimal digit, if there are no
// digits or more than six, or if the value is not a valid Unicode scalar
// value (a surrogate, or above U+10FFFF). End of input before the ';' yields
// an unterminated error.
fn scanescape2(lex: *lexer) (rune | error) = {
	let store: [6]u8 = [0...];
	let digits = memio::fixed(store);
	let ndigits = 0z;

	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			return ("escape sequence", lex.loc.0, lex.loc.1): unterminated;
		};

		// Test for the terminator before the length limit, so that a
		// full six-digit escape such as `\x10FFFF;` is accepted.
		if (rn == ';') {
			break;
		};
		// Only ASCII hex digits are stored, so every rune occupies
		// exactly one byte and the write below cannot overflow.
		if (ndigits == len(store) || !ascii::isxdigit(rn)) {
			return lex.loc: invalid;
		};
		memio::appendrune(&digits, rn)!;
		ndigits += 1;
	};

	const text = memio::string(&digits)!;
	const codepoint = match (strconv::stou32(text, strconv::base::HEX)) {
	case let v: u32 =>
		yield v;
	case =>
		// Reached for an empty digit string (`\x;`).
		return lex.loc: invalid;
	};

	// Six hex digits can express values that are not Unicode scalar
	// values; refuse them instead of producing a malformed rune.
	if (codepoint > 0x10FFFF
			|| (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
		return lex.loc: invalid;
	};
	return codepoint: rune;
};
// Reports whether a rune counts as whitespace for the lexer: either ASCII
// whitespace, or any rune whose Unicode general category is Zs (space
// separator). The Unicode lookup is only performed for runes that are not
// ASCII whitespace.
fn isspace(rn: rune) bool = {
	return ascii::isspace(rn)
		|| unicode::rune_gc(rn) == unicode::gc::Zs;
};
// Reports whether a rune is one of the punctuation characters that end a
// word: parentheses, square brackets, braces, backslash, colon and '#'.
fn isdelimiter(rn: rune) bool = {
	return strings::index(`()[]{}\:#`, rn) is size;
};