// kojote/lex.ha
//
// Hare source; the hosting page listed it as 335 lines, 5.9 KiB.

use ascii;
use bufio;
use encoding::utf8;
use io;
use memio;
use unicode;
// Testing dependency
use fmt;
use strings;
// my cod prob sux :(
// State for one lexing session over a single input handle.
export type lexer = struct {
// Handle that nextrune reads runes from.
in: io::handle,
// Scratch buffer that each scan* function resets and then fills with the
// text of the token being scanned.
strbuf: memio::stream,
// Path passed to newlexer; stored as given and not read elsewhere in this
// file (presumably used for diagnostics by callers; confirm).
path: str,
// Current location. newlexer starts it at (1, 0); nextrune increments .0 and
// zeroes .1 on '\n', and increments .1 for every other rune.
loc: (uint, uint),
// Value of loc before the most recent nextrune; unget restores loc from it.
prevloc: (uint, uint),
// One rune of pushback stored by unget, or void when nothing is pending.
unread: (rune | void),
};
// Creates a lexer that reads runes from in. path is stored on the lexer as
// given. The location starts at (1, 0), no rune is pending, and every field
// not listed here keeps its zero value. The scratch buffer comes from
// memio::dynamic; hand the lexer to close once done with it.
export fn newlexer(in: io::handle, path: str) lexer = {
	const start: (uint, uint) = (1, 0);
	return lexer {
		path = path,
		in = in,
		loc = start,
		unread = void,
		strbuf = memio::dynamic(),
		...
	};
};
// Closes the lexer's scratch buffer (lex.strbuf). Aborts if closing reports
// an error. The input handle lex.in is not touched here.
export fn close(lex: *lexer) void = {
	const scratch = &lex.strbuf;
	io::close(scratch)!;
};
// Reads the next token, skipping any leading whitespace. Returns io::EOF when
// the input ends before a token starts.
//
// The first rune selects the token kind: '[' ']' '{' '}' map directly to
// quotstart/quotend/mapstart/mapend, '(' starts a comment, '\'' a character,
// '"' a string, '\\' a symbol and ':' a keyword symbol. Anything else is
// pushed back and scanned as a word. A stray ')' or a symbol prefix with no
// name after it yields an invalid error at the current location.
//
// NOTE(review): the v strings are produced by memio::string on the shared
// scratch buffer, which every scan resets; presumably they must be copied
// before the next call if they are to be kept. Confirm against callers.
export fn lex(lex: *lexer) (token | io::EOF | error) = {
	const first = match (nextrunews(lex)?) {
	case let r: rune =>
		yield r;
	case io::EOF =>
		return io::EOF;
	};

	switch (first) {
	case '[' =>
		return quotstart;
	case ']' =>
		return quotend;
	case '{' =>
		return mapstart;
	case '}' =>
		return mapend;
	case '(' =>
		const text = scancomment(lex)?;
		return comment { v = text };
	case ')' =>
		// Closing paren with no comment open.
		return lex.loc: invalid;
	case '\'' =>
		return scanchar(lex)?;
	case '"' =>
		return scanstr(lex)?;
	case '\\', ':' =>
		// Both prefixes introduce a symbol; ':' marks it as a keyword.
		const name = scanword(lex)?;
		if (len(name) == 0) {
			return lex.loc: invalid;
		};
		return symbol { v = name, kw = (first == ':') };
	case =>
		// Not a prefix rune: it is the first rune of a word, so hand it
		// back for scanword to consume.
		unget(lex, first);
		const name = scanword(lex)?;
		return word { v = name };
	};
};
// Returns the next rune, taking the pushed-back rune (see unget) first if one
// is pending and otherwise reading from lex.in. On success prevloc receives
// the old loc and loc advances: '\n' moves to the next line at column 0, any
// other rune bumps the column. io::EOF and I/O errors are passed through
// without touching the location; invalid UTF-8 becomes an invalid error at
// the current location.
fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
	const rn = match (lex.unread) {
	case let pending: rune =>
		lex.unread = void;
		yield pending;
	case void =>
		yield match (bufio::read_rune(lex.in)) {
		case let fresh: rune =>
			yield fresh;
		case io::EOF =>
			return io::EOF;
		case let err: io::error =>
			return err;
		case utf8::invalid =>
			return lex.loc: invalid;
		};
	};

	lex.prevloc = lex.loc;
	if (rn == '\n') {
		lex.loc = (lex.loc.0 + 1, 0);
	} else {
		lex.loc.1 += 1;
	};
	return rn;
};
// Like nextrune, but discards runes for which isspace is true and returns the
// first rune that is not whitespace, or io::EOF if the input ends first.
fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
	for (true) {
		const rn = match (nextrune(lex)?) {
		case io::EOF =>
			return io::EOF;
		case let r: rune =>
			yield r;
		};
		if (!isspace(rn)) {
			return rn;
		};
	};
};
// Pushes rn back so the next nextrune returns it again, and rewinds loc to
// prevloc. Only one rune of pushback is supported: asserts that nothing is
// already pending.
fn unget(lex: *lexer, rn: rune) void = {
	assert(lex.unread is void);
	lex.loc = lex.prevloc;
	lex.unread = rn;
};
// Collects runes into the scratch buffer until whitespace, a delimiter, or
// end of input. The terminating rune (if any) is pushed back for the next
// read. The result may be empty when the very first rune already terminates
// the word.
fn scanword(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		match (nextrune(lex)?) {
		case io::EOF =>
			break;
		case let rn: rune =>
			const boundary = isspace(rn) || isdelimiter(rn);
			if (boundary) {
				unget(lex, rn);
				break;
			};
			memio::appendrune(&lex.strbuf, rn)!;
		};
	};
	return memio::string(&lex.strbuf)!;
};
// Scans the body of a comment whose opening '(' has already been consumed.
// Everything up to (not including) the closing ')' is returned. A nested '('
// is rejected as invalid, and reaching end of input first reports an
// unterminated comment at the current location.
fn scancomment(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const rn = match (nextrune(lex)?) {
		case io::EOF =>
			return ("comment", lex.loc.0, lex.loc.1): unterminated;
		case let r: rune =>
			yield r;
		};
		if (rn == ')') {
			break;
		};
		if (rn == '(') {
			return lex.loc: invalid;
		};
		memio::appendrune(&lex.strbuf, rn)!;
	};
	return memio::string(&lex.strbuf)!;
};
// Scans the body of a string literal whose opening '"' has already been
// consumed, up to the closing '"'. A backslash introduces an escape that is
// decoded by scanescape. Reaching end of input first reports an unterminated
// string literal at the current location.
fn scanstr(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const rn = match (nextrune(lex)?) {
		case io::EOF =>
			return ("string literal", lex.loc.0, lex.loc.1): unterminated;
		case let r: rune =>
			yield r;
		};
		if (rn == '"') {
			break;
		};
		const decoded = if (rn == '\\') scanescape(lex)? else rn;
		memio::appendrune(&lex.strbuf, decoded)!;
	};
	return memio::string(&lex.strbuf)!;
};
// Scans a character literal whose leading '\'' has already been consumed.
// The literal is the single following rune, or an escape if that rune is a
// backslash; no closing quote is read. End of input reports an unterminated
// character literal at the current location.
fn scanchar(lex: *lexer) (rune | error) = {
	const rn = match (nextrune(lex)?) {
	case io::EOF =>
		return ("character literal", lex.loc.0, lex.loc.1): unterminated;
	case let r: rune =>
		yield r;
	};
	if (rn == '\\') {
		return scanescape(lex)?;
	};
	return rn;
};
// Decodes the rune following a backslash. Recognised escapes are \" \\ \n \t
// and \s (a space). Any other rune, or end of input, yields an invalid error
// at the current location.
fn scanescape(lex: *lexer) (rune | error) = {
	const code = match (nextrune(lex)?) {
	case io::EOF =>
		return lex.loc: invalid;
	case let r: rune =>
		yield r;
	};

	// (escape code, decoded rune)
	const table: [_](rune, rune) = [
		('"', '"'),
		('\\', '\\'),
		('n', '\n'),
		('t', '\t'),
		('s', ' '),
	];
	for (let i = 0z; i < len(table); i += 1) {
		if (table[i].0 == code) {
			return table[i].1;
		};
	};
	return lex.loc: invalid;
};
// Reports whether rn counts as whitespace for the lexer: either
// ascii::isspace accepts it, or its Unicode general category is Zs.
fn isspace(rn: rune) bool = {
	if (ascii::isspace(rn)) {
		return true;
	};
	return unicode::rune_gc(rn) == unicode::gc::Zs;
};
// Runes that end a word even without surrounding whitespace.
def delimiters = `()[]{}\:'`;

// Reports whether rn is one of the runes listed in delimiters.
fn isdelimiter(rn: rune) bool = {
	return strings::index(delimiters, rn) is size;
};
// Table-driven test for lex: each case pairs a source string with the exact
// token sequence it must produce, followed by io::EOF.
@test fn lex() void = {
	const cases: [_](str, []token) = [
		(
			`"hello" \greeting def`,
			[
				"hello",
				mksym("greeting"),
				mkword("def"),
			]
		),
		(
			`[dup *] (a -- a) \square def`,
			[
				quotstart,
				mkword("dup"),
				mkword("*"),
				quotend,
				mkcomment("a -- a"),
				mksym("square"),
				mkword("def"),
			]
		),
		(`'\s`, [' ']),
		// Map delimiters and a keyword symbol (':' prefix sets kw).
		(
			`{:name "kojote"}`,
			[
				mapstart,
				mksym("name", true),
				"kojote",
				mapend,
			]
		),
	];
	for (let i = 0z; i < len(cases); i += 1) {
		const src = strings::toutf8(cases[i].0);
		const src = memio::fixed(src);
		const lexer = newlexer(&src, "<string>");
		defer close(&lexer);
		for (let j = 0z; j < len(cases[i].1); j += 1) {
			const want = cases[i].1[j];
			const have = lex(&lexer)! as token;
			// tokeq is declared as (have, want); pass them in that order.
			assert(tokeq(have, want));
		};
		// Nothing may remain after the expected tokens.
		assert(lex(&lexer) is io::EOF);
	};
};
// Test helper: reports whether have and want are the same kind of token with
// the same payload. Tokens of different kinds compare unequal (rather than
// aborting on a failed type assertion), and symbols must agree on both their
// name and their kw flag.
fn tokeq(have: token, want: token) bool = {
	match (want) {
	case quotstart =>
		return have is quotstart;
	case quotend =>
		return have is quotend;
	case mapstart =>
		return have is mapstart;
	case mapend =>
		return have is mapend;
	case let w: word =>
		return (have is word) && (have as word).v == w.v;
	case let s: str =>
		return (have is str) && (have as str) == s;
	case let s: symbol =>
		if (!(have is symbol)) {
			return false;
		};
		const h = have as symbol;
		// A keyword and a plain symbol with the same name are different.
		return h.v == s.v && h.kw == s.kw;
	case let c: comment =>
		return (have is comment) && (have as comment).v == c.v;
	case let r: rune =>
		return (have is rune) && (have as rune) == r;
	};
};
// Test helper: builds a word token holding v.
fn mkword(v: const str) word = {
	return word { v = v };
};
// Test helper: builds a comment token holding v.
fn mkcomment(v: const str) comment = {
	return comment { v = v };
};
// Test helper: builds a symbol token named v. kw marks it as a keyword
// symbol and defaults to false.
fn mksym(v: const str, kw: bool = false) symbol = {
	return symbol { v = v, kw = kw };
};