lex: token representation and quoting stuff

* `escape` => `quote` and `quotestr`
  `quote` writes the result to an I/O handle, `quotestr` returns it as a
  heap-allocated string.
* `tokstr` => `tokrepr` and `tokstr`
  same deal
This commit is contained in:
Lobo Torres 2024-12-07 21:25:17 -03:00
parent 7506679f54
commit c372d85de2
5 changed files with 170 additions and 121 deletions

View file

@ -2,11 +2,10 @@ use memio;
use fmt; use fmt;
use strings; use strings;
use io; use io;
use os;
type dummytoken = (ty, value);
@test fn next() void = { @test fn next() void = {
const cases: [_](str, []dummytoken) = [ const cases: [_](str, [](ty, value)) = [
(`"hello" \greeting def`, (`"hello" \greeting def`,
[ [
(ty::STRING, "hello"), (ty::STRING, "hello"),
@ -48,16 +47,17 @@ type dummytoken = (ty, value);
]), ]),
]; ];
for (let i = 0z; i < len(cases); i += 1) { for (let tcase .. cases) {
const src = strings::toutf8(cases[i].0); const src = strings::toutf8(tcase.0);
const src = memio::fixed(src); const src = memio::fixed(src);
const lexer = newlexer(&src, "<string>"); const lexer = newlexer(&src, "<string>");
defer close(&lexer); defer close(&lexer);
for (let j = 0z; j < len(cases[i].1); j += 1) { for (let want .. tcase.1) {
const want = cases[i].1[j];
const have = match (next(&lexer)) { const have = match (next(&lexer)) {
case let tok: token => case let tok: token =>
tokrepr(os::stdout, tok)!;
fmt::print(' ')!;
yield tok; yield tok;
case io::EOF => case io::EOF =>
assert(false, "reached EOF"); assert(false, "reached EOF");
@ -68,19 +68,20 @@ type dummytoken = (ty, value);
}; };
if (!tokeq(have, want)) { if (!tokeq(have, want)) {
fmt::printf("Expected:\n\t")!; fmt::errorf("Expected:\n\t")!;
fmt::println(tokstr((want.0, want.1, location{ ... })))!; tokrepr(os::stderr,
fmt::printf("Got:\n\t")!; (want.0, want.1, location{ ... }))!;
fmt::println(tokstr(have))!; fmt::errorf("\nGot:\n\t")!;
assert(false); tokrepr(os::stderr, have)!;
assert(false, "test case doesn't match expectation");
}; };
}; };
assert(next(&lexer) is io::EOF); assert(next(&lexer) is io::EOF, "didn't reach EOF at the end of test case");
}; };
}; };
fn tokeq(have: token, want: dummytoken) bool = fn tokeq(have: token, want: (ty, value)) bool =
have.0 == want.0 && match (have.1) { have.0 == want.0 && match (have.1) {
case void => case void =>
yield true; yield true;

View file

@ -1,29 +1,31 @@
use ascii; use ascii;
use strings;
use bufio; use bufio;
use encoding::utf8; use encoding::utf8;
use io; use io;
use memio; use memio;
use unicode; use unicode;
use strconv; use strconv;
use sort;
use fmt;
use strings;
def longcharnames: [_](str, rune) = [ def longcharnames: [_](str, rune) = [
("nul", '\u0000'), ("alarm", '\x07'),
("alarm", '\u0007'), ("backspace", '\x08'),
("backspace", '\u0008'), ("delete", '\x7f'),
("tab", '\u0009'), ("esc", '\x1b'),
("newline", '\u000a'), ("linefeed", '\x0a'),
("linefeed", '\u000a'), ("newline", '\x0a'),
("vtab", '\u000b'), ("nul", '\0'),
("page", '\u000c'), ("page", '\x0c'),
("return", '\u000d'), ("return", '\x0d'),
("esc", '\u001b'), ("space", ' '),
("space", '\u0020'), ("tab", '\x09'),
("delete", '\u007f'), ("vtab", '\x0b'),
]; ];
fn longcharnames_namecmp(a: const *opaque, b: const *opaque) int =
strings::compare((a: *(str, rune)).0, (b: *(str, rune)).0);
export type lexer = struct { export type lexer = struct {
in: io::handle, in: io::handle,
strbuf: memio::stream, strbuf: memio::stream,
@ -264,16 +266,20 @@ fn scanchar(lex: *lexer) (token | error) = {
memio::appendrune(&namebuf, rn)!; memio::appendrune(&namebuf, rn)!;
memio::concat(&namebuf, scanword(lex)?)!; memio::concat(&namebuf, scanword(lex)?)!;
const name = memio::string(&namebuf)!; const name = memio::string(&namebuf)!;
for (let i = 0z; i < len(longcharnames); i += 1) {
if (name == longcharnames[i].0) { match (sort::search(longcharnames: []opaque,
size((str, rune)),
&name: const *opaque,
&longcharnames_namecmp)) {
case let ix: size =>
return mktoken(lex, ty::CHAR, return mktoken(lex, ty::CHAR,
longcharnames[i].1); longcharnames[ix].1);
}; case void =>
};
return mkerror(lex, "invalid named character literal"); return mkerror(lex, "invalid named character literal");
}; };
}; };
}; };
};
fn scanescape(lex: *lexer) (rune | error) = { fn scanescape(lex: *lexer) (rune | error) = {
const rn = match (nextrune(lex)?) { const rn = match (nextrune(lex)?) {

71
kojote/lex/quote.ha Normal file
View file

@ -0,0 +1,71 @@
use ascii;
use encoding::utf8;
use fmt;
use io;
use memio;
use sort;
use strings;
use unicode;
// Maps a rune (first element) to the character written after the backslash in
// its escape sequence (second element).
//
// Entries are kept in ascending order of the first element's code point so
// that the table can be binary-searched with [[sort::search]]; keep that order
// when adding new entries.
def escapetable: [_](rune, rune) = [
('\0', '0'),
('\a', 'a'),
('\b', 'b'),
('\t', 't'),
('\n', 'n'),
('\v', 'v'),
('\f', 'f'),
('\r', 'r'),
('\x1b', 'e'),
('"', '"'),
('\\', '\\'),
];
// Comparison function for searching a table of (rune, rune) pairs by first
// element. Both arguments are read as pointers to a (rune, rune) tuple, but
// only the first element is ever accessed, so a pointer to a bare rune works
// as the search key as well. Returns a negative, zero or positive value when
// the first rune is respectively less than, equal to or greater than the
// second.
fn escapetable_cmpfunc(a: const *opaque, b: const *opaque) int = {
	const lhs = (a: *(rune, rune)).0: int;
	const rhs = (b: *(rune, rune)).0: int;
	return lhs - rhs;
};
// Quotes a Kojote string and writes it to the provided I/O handle.
//
// The output is wrapped in double quotes. Runes listed in [[escapetable]]
// (which includes '"' and '\\') are written as a backslash followed by their
// escape character, other graphical runes are written verbatim, and anything
// else is written as a `\x<hex>;` escape. Returns the number of bytes written.
export fn quote(sink: io::handle, s: str) (size | io::error) = {
	if (len(s) == 0) {
		return io::writeall(sink, ['\"', '\"'])?;
	};

	let z = io::writeall(sink, ['\"'])?;

	let iter = strings::iter(s);
	for (let rn => strings::next(&iter)) {
		// The escape table has to be consulted before isgraph: '"' and
		// '\\' are printable, but writing them verbatim would end the
		// string early or start an unintended escape sequence.
		match (sort::search(
			escapetable: []opaque,
			size((rune, rune)),
			&rn: const *opaque,
			&escapetable_cmpfunc)) {
		case let ix: size =>
			z += io::writeall(sink, ['\\'])?;
			z += io::writeall(sink,
				utf8::encoderune(escapetable[ix].1))?;
		case void =>
			if (isgraph(rn)) {
				z += io::writeall(sink, utf8::encoderune(rn))?;
			} else {
				// Large enough for `\x10ffff;`, the longest
				// possible hexadecimal escape.
				let buf: [16]u8 = [0...];
				const esc = fmt::bsprintf(buf, `\x{:x};`, rn: u32);
				z += io::writeall(sink, strings::toutf8(esc))?;
			};
		};
	};

	z += io::writeall(sink, ['\"'])?;
	return z;
};
// Convenience wrapper around [[quote]] that collects the quoted form of a
// Kojote string into a newly allocated string instead of writing it to a
// handle. The caller must free the return value.
export fn quotestr(s: str) str = {
	// The stream is deliberately not closed here: the returned string is
	// expected to hand its storage over to the caller (see above).
	let out = memio::dynamic();
	quote(&out, s)!;
	return memio::string(&out)!;
};

View file

@ -1,50 +1,76 @@
use ascii; use ascii;
use fmt; use fmt;
use strings;
use encoding::utf8;
use memio;
use io;
use sort;
// Returns a string representation of the token. def punctrepr: [_]u8 = ['[', ']', '{', '}'];
//
// This string representation may not correspond one to one with the source, // Writes the human representation of a token to the provided I/O handle.
// but it is guaranteed to be a syntactically-valid construct, such that export fn tokrepr(sink: io::handle, tok: token) (size | io::error) = {
// parsing it results in the same token. let z = 0z;
export fn tokstr(tok: token) str = {
static let buf: [128]u8 = [0...]; if (tok.0: size < len(punctrepr)) {
return io::writeall(sink, [punctrepr[tok.0]])?;
};
switch (tok.0) { switch (tok.0) {
case ty::QUOT_START =>
return "[";
case ty::QUOT_END =>
return "]";
case ty::MAP_START =>
return "{";
case ty::MAP_END =>
return "}";
case ty::COMMENT => case ty::COMMENT =>
return fmt::bsprintf(buf, "({})", tok.1 as str); z += io::writeall(sink, ['('])?;
z += io::writeall(sink, strings::toutf8(tok.1 as str))?;
z += io::writeall(sink, [')'])?;
case ty::WORD => case ty::WORD =>
return tok.1 as str; z += io::writeall(sink, strings::toutf8(tok.1 as str))?;
case ty::SYMBOL => case ty::SYMBOL =>
return fmt::bsprintf(buf, "\\{}", tok.1 as str); z += io::writeall(sink, ['\\'])?;
z += io::writeall(sink, strings::toutf8(tok.1 as str))?;
case ty::KEYWORD => case ty::KEYWORD =>
return fmt::bsprintf(buf, ":{}", tok.1 as str); z += io::writeall(sink, [':'])?;
z += io::writeall(sink, strings::toutf8(tok.1 as str))?;
case ty::STRING => case ty::STRING =>
return fmt::bsprintf(buf, "\"{}\"", escape(tok.1 as str)); z += quote(sink, tok.1 as str)?;
case ty::CHAR => case ty::CHAR =>
let rn = tok.1 as rune; let rn = tok.1 as rune;
for (let i = 0z; i < len(longcharnames); i += 1) { let named = false;
if (longcharnames[i].1 == rn) {
return fmt::bsprintf(buf, "#\\{}", z += io::writeall(sink, ['#', '\\'])?;
longcharnames[i].0);
for (let ch .. longcharnames) {
if (ch.1 == rn) {
z += io::writeall(sink, strings::toutf8(ch.0))?;
named = true;
break;
}; };
}; };
if (!named) {
if (isgraph(rn)) { if (isgraph(rn)) {
return fmt::bsprintf(buf, "#\\{}", rn); z += io::writeall(sink, utf8::encoderune(rn))?;
} else { } else {
return fmt::bsprintf(buf, "#\\x{:x};", rn: u32); z += fmt::fprintf(sink, "x{:x};", rn: u32)?;
};
}; };
case ty::NUMBER => case ty::NUMBER =>
return tok.1 as str; z += io::writeall(sink, strings::toutf8(tok.1 as str))?;
case ty::BOOLEAN => case ty::BOOLEAN =>
return fmt::bsprintf(buf, "#{}", z += io::writeall(sink, ['#'])?;
if (tok.1 as bool) 't' else 'f'); z += io::writeall(sink,
if (tok.1 as bool) ['t'] else ['f'])?;
case =>
// unreachable
abort();
}; };
return z;
}; };
// Returns the human representation of a token as a new string. The caller must
// free the return value.
export fn tokstr(tok: token) str = {
const sink = memio::dynamic();
tokrepr(&sink, tok)!;
return memio::string(&sink)!;
};

View file

@ -1,10 +1,3 @@
use ascii;
use fmt;
use memio;
use strings;
use io;
use sort;
use unicode; use unicode;
def graphtable: [unicode::gc::Zs]bool = [ def graphtable: [unicode::gc::Zs]bool = [
@ -17,53 +10,5 @@ def graphtable: [unicode::gc::Zs]bool = [
false, false, false, // Z false, false, false, // Z
]; ];
// Sorted to use with [[sort::search]].
def escapetable: [_](rune, rune) = [
('\0', '0'),
('\a', 'a'),
('\b', 'b'),
('\t', 't'),
('\n', 'n'),
('\v', 'v'),
('\f', 'f'),
('\r', 'r'),
('\x1b', 'e'),
('"', '"'),
('\\', '\\'),
];
fn escapetable_cmpfunc(a: const *opaque, b: const *opaque) int =
(a: *(rune, rune)).0: int - (b: *(rune, rune)).0: int;
// Returns whether a rune is a graphical character.
fn isgraph(r: rune) bool = fn isgraph(r: rune) bool =
if (r == ' ') true else graphtable[unicode::rune_gc(r)]; if (r == ' ') true else graphtable[unicode::rune_gc(r)];
// Escapes a string.
fn escape(s: str) str = {
static let buf: [2048]u8 = [0...];
let buf = memio::fixed(buf);
let iter = strings::iter(s);
for (let ch => strings::next(&iter)) {
if (isgraph(ch)) {
memio::appendrune(&buf, ch)!;
} else {
match (sort::search(
escapetable: []opaque,
size((rune, rune)),
&ch: const *opaque,
&escapetable_cmpfunc)) {
case let sz: size =>
memio::appendrune(&buf, '\\')!;
memio::appendrune(&buf, escapetable[sz].1)!;
case void =>
static let hexbuf: [8]u8 = [0...];
memio::concat(&buf, fmt::bsprintf(
hexbuf, `\x{:x};`, ch: u32))!;
};
};
};
return memio::string(&buf)!;
};