Compare commits

..

2 commits

Author SHA1 Message Date
b09e0b37fb lex: move token pretty-printing to another file 2024-12-05 22:31:33 -03:00
d7b6b380fe lex: rework types 2024-12-05 22:06:21 -03:00
7 changed files with 272 additions and 210 deletions

93
kojote/lex/+test.ha Normal file
View file

@ -0,0 +1,93 @@
use memio;
use fmt;
use strings;
use io;
// A token without its location: just the (type, value) pair. The test cases
// use this form to state the tokens they expect from the lexer.
type dummytoken = (ty, value);
// Table-driven lexer test. Every case pairs a source snippet with the tokens
// the lexer must produce for it, in order; once those are consumed the lexer
// must report EOF.
@test fn next() void = {
	const cases: [_](str, []dummytoken) = [
		(`"hello" \greeting def`,
			[
				(ty::STRING, "hello"),
				(ty::SYMBOL, "greeting"),
				(ty::WORD, "def")
			]),
		(`[dup *] (a -- a) \square def`,
			[
				(ty::QUOT_START, void),
				(ty::WORD, "dup"),
				(ty::WORD, "*"),
				(ty::QUOT_END, void),
				(ty::COMMENT, "a -- a"),
				(ty::SYMBOL, "square"),
				(ty::WORD, "def"),
			]),
		(`#t #f`,
			[
				(ty::BOOLEAN, true),
				(ty::BOOLEAN, false),
			]),
		(`#\a #\space #\nul`,
			[
				(ty::CHAR, 'a'),
				(ty::CHAR, ' '),
				(ty::CHAR, '\0'),
			]),
		(`"\x0a;\x2014;\x2f9f4;"`,
			[
				(ty::STRING, "\n—嶲"),
			]),
		(`#\x #\x0a; #\x2014; #\x2f9f4;`,
			[
				(ty::CHAR, 'x'),
				(ty::CHAR, '\n'),
				(ty::CHAR, '—'),
				(ty::CHAR, '嶲'),
			]),
	];

	for (let c = 0z; c < len(cases); c += 1) {
		const expected = cases[c].1;

		// Lex straight out of the snippet's bytes.
		const bytes = strings::toutf8(cases[c].0);
		const stream = memio::fixed(bytes);
		const lexer = newlexer(&stream, "<string>");
		defer close(&lexer);

		for (let t = 0z; t < len(expected); t += 1) {
			const want = expected[t];
			const have = match (next(&lexer)) {
			case let err: error =>
				assert(false, strerror(err));
				return;
			case io::EOF =>
				assert(false, "reached EOF");
				return;
			case let tok: token =>
				yield tok;
			};

			if (tokeq(have, want)) continue;

			// tokstr may reuse a static buffer, so each rendering is
			// printed before the next one is produced.
			fmt::printf("Expected:\n\t")!;
			fmt::println(tokstr((want.0, want.1, location{ ... })))!;
			fmt::printf("Got:\n\t")!;
			fmt::println(tokstr(have))!;
			assert(false);
		};

		// Nothing may follow the expected tokens.
		assert(next(&lexer) is io::EOF);
	};
};
// Reports whether a lexed token matches an expected (type, value) pair. The
// token's location is not compared.
//
// The types must be equal, and the values must be of the same kind and equal.
fn tokeq(have: token, want: dummytoken) bool =
	have.0 == want.0 && match (have.1) {
	case void =>
		// A token without a value only matches an expectation without
		// one; otherwise a token that wrongly lost its value would be
		// accepted.
		yield want.1 is void;
	case let s: str =>
		yield want.1 is str && (want.1 as str) == s;
	case let r: rune =>
		yield want.1 is rune && (want.1 as rune) == r;
	case let b: bool =>
		yield want.1 is bool && (want.1 as bool) == b;
	};

View file

@ -27,16 +27,20 @@ def longcharnames: [_](str, rune) = [
export type lexer = struct { export type lexer = struct {
in: io::handle, in: io::handle,
strbuf: memio::stream, strbuf: memio::stream,
commentbuf: memio::stream,
path: str, path: str,
loc: (uint, uint), loc: (uint, uint),
prevloc: (uint, uint), prevloc: (uint, uint),
unread: (rune | void), unread: (rune | void),
}; };
// Creates a new [[lexer]] for the given [[io::handle]].
// The path is borrowed.
export fn newlexer(in: io::handle, path: str) lexer = { export fn newlexer(in: io::handle, path: str) lexer = {
return lexer { return lexer {
in = in, in = in,
strbuf = memio::dynamic(), strbuf = memio::dynamic(),
commentbuf = memio::dynamic(),
path = path, path = path,
loc = (1, 0), loc = (1, 0),
unread = void, unread = void,
@ -44,11 +48,14 @@ export fn newlexer(in: io::handle, path: str) lexer = {
}; };
}; };
// Frees resources associated with a [[lexer]].
export fn close(lex: *lexer) void = { export fn close(lex: *lexer) void = {
io::close(&lex.strbuf)!; io::close(&lex.strbuf)!;
io::close(&lex.commentbuf)!;
}; };
export fn lex(lex: *lexer) (token | io::EOF | error) = { // Returns the next token from the lexer.
export fn next(lex: *lexer) (token | io::EOF | error) = {
const rn = match (nextrunews(lex)?) { const rn = match (nextrunews(lex)?) {
case io::EOF => case io::EOF =>
return io::EOF; return io::EOF;
@ -58,30 +65,30 @@ export fn lex(lex: *lexer) (token | io::EOF | error) = {
switch (rn) { switch (rn) {
case '(' => case '(' =>
return comment{ v = scancomment(lex)? }; return mktoken(lex, ty::COMMENT, scancomment(lex)?);
case ')' => case ')' =>
return lex.loc: invalid; return mkerror(lex, "invalid token");
case '[' => case '[' =>
return quotstart; return mktoken(lex, ty::QUOT_START, void);
case ']' => case ']' =>
return quotend; return mktoken(lex, ty::QUOT_END, void);
case '{' => case '{' =>
return mapstart; return mktoken(lex, ty::MAP_START, void);
case '}' => case '}' =>
return mapend; return mktoken(lex, ty::MAP_END, void);
case '\\' => case '\\' =>
let v = scanword(lex)?; let v = scanword(lex)?;
if (len(v) == 0) { if (len(v) == 0) {
return lex.loc: invalid; return mkerror(lex, "invalid symbol literal");
} else { } else {
return symbol{ v = v, kw = false }; return mktoken(lex, ty::SYMBOL, v);
}; };
case ':' => case ':' =>
let v = scanword(lex)?; let v = scanword(lex)?;
if (len(v) == 0) { if (len(v) == 0) {
return lex.loc: invalid; return mkerror(lex, "invalid keyword");
} else { } else {
return symbol{ v = v, kw = true }; return mktoken(lex, ty::KEYWORD, v);
}; };
case '#' => case '#' =>
return scanpound(lex)?; return scanpound(lex)?;
@ -92,7 +99,7 @@ export fn lex(lex: *lexer) (token | io::EOF | error) = {
}; };
unget(lex, rn); unget(lex, rn);
return word{ v = scanword(lex)? }; return mktoken(lex, ty::WORD, scanword(lex)?);
}; };
fn nextrune(lex: *lexer) (rune | io::EOF | error) = { fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
@ -124,7 +131,7 @@ fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
case let err: io::error => case let err: io::error =>
return err; return err;
case utf8::invalid => case utf8::invalid =>
return lex.loc: invalid; return mkerror(lex, "invalid UTF-8 sequence");
}; };
}; };
@ -132,9 +139,7 @@ fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
for (true) { for (true) {
match (nextrune(lex)?) { match (nextrune(lex)?) {
case let rn: rune => case let rn: rune =>
if (isspace(rn)) { if (isspace(rn)) continue;
continue;
};
return rn; return rn;
case io::EOF => case io::EOF =>
return io::EOF; return io::EOF;
@ -167,36 +172,36 @@ fn scanword(lex: *lexer) (str | error) = {
}; };
fn scancomment(lex: *lexer) (str | error) = { fn scancomment(lex: *lexer) (str | error) = {
memio::reset(&lex.strbuf); memio::reset(&lex.commentbuf);
for (true) { for (true) {
const rn = match (nextrune(lex)?) { const rn = match (nextrune(lex)?) {
case let rn: rune => case let rn: rune =>
yield rn; yield rn;
case io::EOF => case io::EOF =>
return ("comment", lex.loc.0, lex.loc.1): unterminated; return mkerror(lex, "unterminated comment");
}; };
switch (rn) { switch (rn) {
case '(' => case '(' =>
return lex.loc: invalid; return mkerror(lex, "nested comments are not allowed");
case ')' => case ')' =>
break; break;
case => case =>
memio::appendrune(&lex.strbuf, rn)!; memio::appendrune(&lex.commentbuf, rn)!;
}; };
}; };
return memio::string(&lex.strbuf)!; return memio::string(&lex.commentbuf)!;
}; };
fn scanstr(lex: *lexer) (str | error) = { fn scanstr(lex: *lexer) (token | error) = {
memio::reset(&lex.strbuf); memio::reset(&lex.strbuf);
for (true) { for (true) {
const rn = match (nextrune(lex)?) { const rn = match (nextrune(lex)?) {
case let rn: rune => case let rn: rune =>
yield rn; yield rn;
case io::EOF => case io::EOF =>
return ("string literal", lex.loc.0, lex.loc.1): unterminated; return mkerror(lex, "unterminated string literal");
}; };
switch (rn) { switch (rn) {
@ -207,7 +212,7 @@ fn scanstr(lex: *lexer) (str | error) = {
memio::appendrune(&lex.strbuf, rn)!; memio::appendrune(&lex.strbuf, rn)!;
}; };
}; };
return memio::string(&lex.strbuf)!; return mktoken(lex, ty::STRING, memio::string(&lex.strbuf)!);
}; };
fn scanpound(lex: *lexer) (token | error) = { fn scanpound(lex: *lexer) (token | error) = {
@ -215,22 +220,22 @@ fn scanpound(lex: *lexer) (token | error) = {
case let rn: rune => case let rn: rune =>
yield rn; yield rn;
case io::EOF => case io::EOF =>
return ("pound literal", lex.loc.0, lex.loc.1): unterminated; return mkerror(lex, "unterminated pound literal");
}; };
switch (rn) { switch (rn) {
case 't' => case 't' =>
return true; return mktoken(lex, ty::BOOLEAN, true);
case 'f' => case 'f' =>
return false; return mktoken(lex, ty::BOOLEAN, false);
case '\\' => case '\\' =>
return scanchar(lex)?; return scanchar(lex);
case => case =>
return lex.loc: invalid; return mkerror(lex, "invalid pound literal");
}; };
}; };
fn scanchar(lex: *lexer) (rune | error) = { fn scanchar(lex: *lexer) (token | error) = {
static let namebuf: [16]u8 = [0...]; static let namebuf: [16]u8 = [0...];
let namebuf = memio::fixed(namebuf); let namebuf = memio::fixed(namebuf);
@ -238,32 +243,36 @@ fn scanchar(lex: *lexer) (rune | error) = {
case let rn: rune => case let rn: rune =>
yield rn; yield rn;
case io::EOF => case io::EOF =>
return ("character literal", lex.loc.0, lex.loc.1): unterminated; return mkerror(lex, "unterminated character literal");
}; };
let ret: rune = '\0';
match (nextrune(lex)?) { match (nextrune(lex)?) {
case let rnn: rune => case io::EOF =>
unget(lex, rnn); return mktoken(lex, ty::CHAR, rn);
if (isspace(rnn)) { case let next: rune =>
return rn; unget(lex, next);
} else {
if (isspace(next)) {
return mktoken(lex, ty::CHAR, rn);
};
if (rn == 'x') { if (rn == 'x') {
return scanescape2(lex); return mktoken(lex, ty::CHAR, scanescape2(lex)?);
} else { } else {
memio::appendrune(&namebuf, rn)!; memio::appendrune(&namebuf, rn)!;
memio::concat(&namebuf, scanword(lex)?)!; memio::concat(&namebuf, scanword(lex)?)!;
const name = memio::string(&namebuf)!; const name = memio::string(&namebuf)!;
for (let i = 0z; i < len(longcharnames); i += 1) { for (let i = 0z; i < len(longcharnames); i += 1) {
if (name == longcharnames[i].0) { if (name == longcharnames[i].0) {
return longcharnames[i].1; return mktoken(lex, ty::CHAR,
longcharnames[i].1);
}; };
}; };
return lex.loc: invalid; return mkerror(lex, "invalid named character literal");
}; };
}; };
case io::EOF =>
return rn;
};
}; };
fn scanescape(lex: *lexer) (rune | error) = { fn scanescape(lex: *lexer) (rune | error) = {
@ -271,7 +280,7 @@ fn scanescape(lex: *lexer) (rune | error) = {
case let rn: rune => case let rn: rune =>
yield rn; yield rn;
case io::EOF => case io::EOF =>
return ("escape sequence", lex.loc.0, lex.loc.1): unterminated; return mkerror(lex, "unterminated character escape");
}; };
switch (rn) { switch (rn) {
@ -288,7 +297,7 @@ fn scanescape(lex: *lexer) (rune | error) = {
case '"' => return '"'; case '"' => return '"';
case 'x' => return scanescape2(lex)?; case 'x' => return scanescape2(lex)?;
case => case =>
return lex.loc: invalid; return mkerror(lex, "invalid character escape");
}; };
}; };
@ -304,7 +313,7 @@ fn scanescape2(lex: *lexer) (rune | error) = {
case let rn: rune => case let rn: rune =>
yield rn; yield rn;
case io::EOF => case io::EOF =>
return ("escape sequence", lex.loc.0, lex.loc.1): unterminated; return mkerror(lex, "unterminated character escape");
}; };
const buf: [6]u8 = [0...]; const buf: [6]u8 = [0...];
@ -317,11 +326,11 @@ fn scanescape2(lex: *lexer) (rune | error) = {
case let rn: rune => case let rn: rune =>
yield rn; yield rn;
case io::EOF => case io::EOF =>
return ("escape sequence", lex.loc.0, lex.loc.1): unterminated; return mkerror(lex, "unterminated escape sequence");
}; };
if (count > 6) { if (count > 6) {
return lex.loc: invalid; return mkerror(lex, "invalid escape sequence");
} else if (rn == ';') { } else if (rn == ';') {
break; break;
} else { } else {
@ -337,10 +346,16 @@ fn scanescape2(lex: *lexer) (rune | error) = {
case let codepoint: u32 => case let codepoint: u32 =>
return codepoint: rune; return codepoint: rune;
case => case =>
return lex.loc: invalid; return mkerror(lex, "invalid escape sequence");
}; };
}; };
// Builds a token of the given type and value, tagged with the lexer's path and
// its current position (loc.0 as the line, loc.1 as the column).
fn mktoken(lex: *lexer, ty: ty, value: value) token = {
	const here = location {
		path = lex.path,
		line = lex.loc.0,
		column = lex.loc.1,
	};
	return (ty, value, here);
};
// Builds a syntax error carrying the given message, tagged with the lexer's
// path and its current position (loc.0 as the line, loc.1 as the column).
fn mkerror(lex: *lexer, msg: const str) syntax = {
	const here = location {
		path = lex.path,
		line = lex.loc.0,
		column = lex.loc.1,
	};
	return (here, msg);
};
fn isspace(rn: rune) bool = { fn isspace(rn: rune) bool = {
if (ascii::isspace(rn)) { if (ascii::isspace(rn)) {
return true; return true;
@ -355,7 +370,7 @@ fn isspace(rn: rune) bool = {
}; };
fn isdelimiter(rn: rune) bool = { fn isdelimiter(rn: rune) bool = {
match (strings::index(`()[]{}\:#`, rn)) { match (strings::index(`()[]{}`, rn)) {
case size => case size =>
return true; return true;
case => case =>

51
kojote/lex/token.ha Normal file
View file

@ -0,0 +1,51 @@
use ascii;
use fmt;
// Returns a string representation of the token.
//
// This string representation may not correspond one to one with the source,
// but it is meant to be a syntactically-valid construct, such that parsing it
// results in the same token. String contents are not re-escaped yet (see the
// TODO below).
//
// Delimiters are returned as string constants, and words and numbers return
// the token's own string. Every other token is formatted into a static
// 128-byte buffer, so the result may be overwritten by the next call.
export fn tokstr(tok: token) str = {
	static let buf: [128]u8 = [0...];
	switch (tok.0) {
	case ty::QUOT_START =>
		return "[";
	case ty::QUOT_END =>
		return "]";
	case ty::MAP_START =>
		return "{";
	case ty::MAP_END =>
		return "}";
	case ty::COMMENT =>
		return fmt::bsprintf(buf, "({})", tok.1 as str);
	case ty::WORD =>
		return tok.1 as str;
	case ty::SYMBOL =>
		return fmt::bsprintf(buf, "\\{}", tok.1 as str);
	case ty::KEYWORD =>
		return fmt::bsprintf(buf, ":{}", tok.1 as str);
	case ty::STRING =>
		// TODO: escape string before printing
		return fmt::bsprintf(buf, "\"{}\"", tok.1 as str);
	case ty::CHAR =>
		let rn = tok.1 as rune;
		// Prefer the long name when the character has one.
		for (let i = 0z; i < len(longcharnames); i += 1) {
			if (longcharnames[i].1 == rn) {
				return fmt::bsprintf(buf, "#\\{}",
					longcharnames[i].0);
			};
		};
		if (ascii::isgraph(rn)) {
			return fmt::bsprintf(buf, "#\\{}", rn);
		} else {
			// Anything else is written as a hexadecimal escape.
			// The hex modifier goes after a colon; a bare "{x}" is
			// not a valid format sequence.
			return fmt::bsprintf(buf, "#\\x{:x};", rn: u32);
		};
	case ty::NUMBER =>
		return tok.1 as str;
	case ty::BOOLEAN =>
		return fmt::bsprintf(buf, "#{}",
			if (tok.1 as bool) 't' else 'f');
	};
};

51
kojote/lex/types.ha Normal file
View file

@ -0,0 +1,51 @@
use io;
use fmt;
// A syntax error: the location it was reported at, and a message describing
// the problem.
export type syntax = !(location, str);
// All possible lexer errors: either an I/O error from the underlying handle
// or a [[syntax]] error.
export type error = !(io::error | syntax);
// A token type. The notes on each member describe the source form the lexer
// produces it for, or the form [[tokstr]] prints it in.
export type ty = enum uint {
// "[", no value.
QUOT_START,
// "]", no value.
QUOT_END,
// "{", no value.
MAP_START,
// "}", no value.
MAP_END,
// "(...)"; the value is the text between the parentheses.
COMMENT,
// A bare word; the value is its text.
WORD,
// "\name"; the value is the name without the backslash.
SYMBOL,
// ":name"; the value is the name without the colon.
KEYWORD,
// A double-quoted string literal; the value is its contents.
STRING,
// A "#\" character literal; the value is the rune.
CHAR,
// A number; tokstr prints its value as a str.
NUMBER,
// "#t" or "#f"; the value is the bool.
BOOLEAN,
};
// A token value, used for literal tokens and comments.
//
// As read back by [[tokstr]]: COMMENT, WORD, SYMBOL, KEYWORD, STRING and
// NUMBER tokens carry a str, CHAR carries a rune and BOOLEAN carries a bool.
// The quotation and map delimiters are created with void.
export type value = (str | rune | bool | void);
// A location within a source file.
// The path is borrowed from the file name given to the lexer.
export type location = struct {
// Name of the source, as passed to the lexer.
path: str,
// Line number; filled in from the first element of the lexer's loc pair.
line: uint,
// Column number; filled in from the second element of the lexer's loc pair.
column: uint,
};
// A single lexical token: its type, its value (void when the token carries
// none) and the location recorded for it by the lexer.
export type token = (ty, value, location);
// Converts a lexer error into a human-friendly message.
//
// I/O errors are described by [[io::strerror]]. Syntax errors are rendered as
// "path:line:column: syntax error: message" into a static buffer, so the
// result may be statically allocated and overwritten by a later call.
export fn strerror(err: error) const str = {
	static let buf: [512]u8 = [0...];
	match (err) {
	case let e: syntax =>
		const where = e.0;
		const msg = e.1;
		return fmt::bsprintf(buf, "{}:{}:{}: syntax error: {}",
			where.path, where.line, where.column, msg);
	case let e: io::error =>
		return io::strerror(e);
	};
};

View file

@ -1,117 +0,0 @@
use memio;
use fmt;
use strings;
use io;
// Table-driven lexer test. Every case pairs a source snippet with the tokens
// the lexer must produce for it, in order; once those are consumed the lexer
// must report EOF.
@test fn lex() void = {
	const cases: [_](str, []token) = [
		(`"hello" \greeting def`,
			["hello", mksym("greeting"), mkword("def")]),
		(`[dup *] (a -- a) \square def`,
			[quotstart, mkword("dup"), mkword("*"), quotend,
			mkcomment("a -- a"), mksym("square"),
			mkword("def")]),
		(`#t #f`, [true, false]),
		(`#\a #\space #\nul`, ['a', ' ', '\0']),
		// \x2014; is U+2014 (em dash), the same character the
		// character-literal case below expects for #\x2014;.
		(`"\x0a;" "\x2014;" "\x2f9f4;"`, ["\n", "—", "嶲"]),
		(`#\x #\x0a; #\x2014; #\x2f9f4;`, ['x', '\n', '—', '嶲']),
	];
	for (let i = 0z; i < len(cases); i += 1) {
		const src = strings::toutf8(cases[i].0);
		const src = memio::fixed(src);
		const lexer = newlexer(&src, "<string>");
		defer close(&lexer);
		for (let j = 0z; j < len(cases[i].1); j += 1) {
			const want = cases[i].1[j];
			const have = match (lex(&lexer)) {
			case let tok: token =>
				yield tok;
			case io::EOF =>
				assert(false, "reached EOF");
				return;
			case let err: error =>
				assert(false, strerror(err));
				return;
			};
			// tokeq takes the lexed token first, then the expected one.
			if (!tokeq(have, want)) {
				fmt::printfln("Case {}: {}", i, cases[i].0)!;
				fmt::print("\tExpected: ")!;
				tokpp(want);
				fmt::print("\tGot: ")!;
				tokpp(have);
				assert(false);
			};
		};
		// Nothing may follow the expected tokens.
		assert(lex(&lexer) is io::EOF);
	};
};
// Reports whether the lexed token equals the expected one.
//
// Tokens of different kinds compare unequal (rather than aborting on a failed
// type assertion), and symbols must agree on both their name and whether they
// are keywords.
fn tokeq(have: token, want: token) bool = {
	match (want) {
	case quotstart =>
		return have is quotstart;
	case quotend =>
		return have is quotend;
	case mapstart =>
		return have is mapstart;
	case mapend =>
		return have is mapend;
	case let w: word =>
		return have is word && (have as word).v == w.v;
	case let s: str =>
		return have is str && (have as str) == s;
	case let s: symbol =>
		if (!(have is symbol)) {
			return false;
		};
		const h = have as symbol;
		// ":name" and "\name" are different tokens.
		return h.kw == s.kw && h.v == s.v;
	case let c: comment =>
		return have is comment && (have as comment).v == c.v;
	case let r: rune =>
		return have is rune && (have as rune) == r;
	case let b: bool =>
		return have is bool && (have as bool) == b;
	};
};
// Pretty-prints a token on its own line of standard output, roughly in the
// form it has in source code.
fn tokpp(tok: token) void = {
	match (tok) {
	case let flag: bool =>
		const text = if (flag) "#t" else "#f";
		fmt::println(text)!;
	case let ch: rune =>
		// Use the long name when the character has one.
		for (let n = 0z; n < len(longcharnames); n += 1) {
			const entry = longcharnames[n];
			if (entry.1 == ch) {
				fmt::printfln("#\\{}", entry.0)!;
				return;
			};
		};
		fmt::printfln("#\\{}", ch)!;
	case let text: str =>
		fmt::printfln(`"{}"`, text)!;
	case let sym: symbol =>
		const sigil = if (sym.kw) ":" else "\\";
		fmt::printfln("{}{}", sigil, sym.v)!;
	case let wd: word =>
		fmt::println(wd.v)!;
	case let cm: comment =>
		fmt::printfln("({})", cm.v)!;
	case quotstart =>
		fmt::println("[")!;
	case quotend =>
		fmt::println("]")!;
	case mapstart =>
		fmt::println("{")!;
	case mapend =>
		fmt::println("}")!;
	};
};
// Test helper: wraps a string in a [[word]] token.
fn mkword(v: const str) word = {
	return word { v = v };
};
// Test helper: wraps a string in a [[comment]] token.
fn mkcomment(v: const str) comment = {
	return comment { v = v };
};
// Test helper: builds a [[symbol]] token with the given name. The kw flag
// defaults to false.
fn mksym(v: const str, kw: bool = false) symbol = {
	return symbol {
		v = v,
		kw = kw,
	};
};

View file

@ -1,32 +0,0 @@
use io;
use fmt;
// An invalid token. Holds the lexer's loc pair at the point of failure
// (presumably line, then column -- confirm against the lexer).
export type invalid = !(uint, uint);
// An unterminated construct: a description of what was left open (such as
// "comment" or "string literal"), followed by the lexer's loc pair.
export type unterminated = !(const str, uint, uint);
// All possible lexer errors.
export type error = !(invalid | unterminated | io::error);
// The "[" token.
export type quotstart = void;
// The "]" token.
export type quotend = void;
// The "{" token.
export type mapstart = void;
// The "}" token.
export type mapend = void;
// A "(...)" comment; v is the text between the parentheses.
export type comment = struct { v: str };
// A bare word; v is its text.
export type word = struct { v: str };
// A "\name" symbol, or a ":name" keyword when kw is true; v is the name.
export type symbol = struct { v: str, kw: bool };
// A single lexical token. String, character and boolean literals are
// represented directly as str, rune and bool.
export type token = (quotstart | quotend | mapstart | mapend |
word | symbol | comment | str | rune | bool);
// Converts a lexer error into a human-friendly message.
//
// I/O errors are described by [[io::strerror]]. The other errors are formatted
// into a static buffer, so the result may be overwritten by a later call.
export fn strerror(err: error) const str = {
	static let buf: [64]u8 = [0...];
	match (err) {
	case let e: io::error =>
		return io::strerror(e);
	case let e: unterminated =>
		return fmt::bsprintf(buf,
			"Unterminated {} found at {}:{}", e.0, e.1, e.2);
	case let e: invalid =>
		return fmt::bsprintf(buf,
			"Invalid token found at {}:{}", e.0, e.1);
	};
};

View file

@ -1,5 +1,6 @@
3.14159 \pi def ( hello world! )
[dup *] \square def \pi 3.14159 def
\square [dup *] def
\circarea [square pi *] def
[square pi *] \circarea def
20 circarea . ( => 1256.636 ) 20 circarea . ( => 1256.636 )