From c372d85de297f2e0829312f6ae55a56cf0dd1afa Mon Sep 17 00:00:00 2001 From: Lobo Torres Date: Sat, 7 Dec 2024 21:25:17 -0300 Subject: [PATCH] lex: token representation and quoting stuff * `escape` => `quote` and `quotestr` `quote` writes the result to an I/O handle, `quotestr` returns it as a heap-allocated string. * `tokstr` => `tokrepr` and `tokstr` `tokrepr` writes the result to an I/O handle, `tokstr` returns it as a heap-allocated string. --- kojote/lex/+test.ha | 29 +++++++-------- kojote/lex/lexer.ha | 48 ++++++++++++++----------- kojote/lex/quote.ha | 71 ++++++++++++++++++++++++++++++++++++ kojote/lex/token.ha | 88 +++++++++++++++++++++++++++++---------------- kojote/lex/util.ha | 55 ---------------------------- 5 files changed, 170 insertions(+), 121 deletions(-) create mode 100644 kojote/lex/quote.ha diff --git a/kojote/lex/+test.ha b/kojote/lex/+test.ha index 773ad12..62872fb 100644 --- a/kojote/lex/+test.ha +++ b/kojote/lex/+test.ha @@ -2,11 +2,10 @@ use memio; use fmt; use strings; use io; - -type dummytoken = (ty, value); +use os; @test fn next() void = { - const cases: [_](str, []dummytoken) = [ + const cases: [_](str, [](ty, value)) = [ (`"hello" \greeting def`, [ (ty::STRING, "hello"), @@ -48,16 +47,17 @@ type dummytoken = (ty, value); ]), ]; - for (let i = 0z; i < len(cases); i += 1) { - const src = strings::toutf8(cases[i].0); + for (let tcase .. cases) { + const src = strings::toutf8(tcase.0); const src = memio::fixed(src); const lexer = newlexer(&src, ""); defer close(&lexer); - for (let j = 0z; j < len(cases[i].1); j += 1) { - const want = cases[i].1[j]; + for (let want .. tcase.1) { const have = match (next(&lexer)) { case let tok: token => + tokrepr(os::stdout, tok)!; + fmt::print(' ')!; yield tok; case io::EOF => assert(false, "reached EOF"); @@ -68,19 +68,20 @@ type dummytoken = (ty, value); }; if (!tokeq(have, want)) { - fmt::printf("Expected:\n\t")!; - fmt::println(tokstr((want.0, want.1, location{ ...
})))!; - fmt::printf("Got:\n\t")!; - fmt::println(tokstr(have))!; - assert(false); + fmt::errorf("Expected:\n\t")!; + tokrepr(os::stderr, + (want.0, want.1, location{ ... }))!; + fmt::errorf("\nGot:\n\t")!; + tokrepr(os::stderr, have)!; + assert(false, "test case doesn't match expectation"); }; }; - assert(next(&lexer) is io::EOF); + assert(next(&lexer) is io::EOF, "didn't reach EOF at the end of test case"); }; }; -fn tokeq(have: token, want: dummytoken) bool = +fn tokeq(have: token, want: (ty, value)) bool = have.0 == want.0 && match (have.1) { case void => yield true; diff --git a/kojote/lex/lexer.ha b/kojote/lex/lexer.ha index 150c7be..4fe2f8b 100644 --- a/kojote/lex/lexer.ha +++ b/kojote/lex/lexer.ha @@ -1,29 +1,31 @@ use ascii; +use strings; use bufio; use encoding::utf8; use io; use memio; use unicode; use strconv; - -use fmt; -use strings; +use sort; def longcharnames: [_](str, rune) = [ - ("nul", '\u0000'), - ("alarm", '\u0007'), - ("backspace", '\u0008'), - ("tab", '\u0009'), - ("newline", '\u000a'), - ("linefeed", '\u000a'), - ("vtab", '\u000b'), - ("page", '\u000c'), - ("return", '\u000d'), - ("esc", '\u001b'), - ("space", '\u0020'), - ("delete", '\u007f'), + ("alarm", '\x07'), + ("backspace", '\x08'), + ("delete", '\x7f'), + ("esc", '\x1b'), + ("linefeed", '\x0a'), + ("newline", '\x0a'), + ("nul", '\0'), + ("page", '\x0c'), + ("return", '\x0d'), + ("space", ' '), + ("tab", '\x09'), + ("vtab", '\x0b'), ]; +fn longcharnames_namecmp(a: const *opaque, b: const *opaque) int = + strings::compare((a: *(str, rune)).0, (b: *(str, rune)).0); + export type lexer = struct { in: io::handle, strbuf: memio::stream, @@ -264,13 +266,17 @@ fn scanchar(lex: *lexer) (token | error) = { memio::appendrune(&namebuf, rn)!; memio::concat(&namebuf, scanword(lex)?)!; const name = memio::string(&namebuf)!; - for (let i = 0z; i < len(longcharnames); i += 1) { - if (name == longcharnames[i].0) { - return mktoken(lex, ty::CHAR, - longcharnames[i].1); - }; + + match 
(sort::search(longcharnames: []opaque, + size((str, rune)), + &name: const *opaque, + &longcharnames_namecmp)) { + case let ix: size => + return mktoken(lex, ty::CHAR, + longcharnames[ix].1); + case void => + return mkerror(lex, "invalid named character literal"); }; - return mkerror(lex, "invalid named character literal"); }; }; }; diff --git a/kojote/lex/quote.ha b/kojote/lex/quote.ha new file mode 100644 index 0000000..10b83c6 --- /dev/null +++ b/kojote/lex/quote.ha @@ -0,0 +1,71 @@ +use ascii; +use encoding::utf8; +use fmt; +use io; +use memio; +use sort; +use strings; + +use unicode; + +// Sorted to use with [[sort::search]]. +def escapetable: [_](rune, rune) = [ + ('\0', '0'), + ('\a', 'a'), + ('\b', 'b'), + ('\t', 't'), + ('\n', 'n'), + ('\v', 'v'), + ('\f', 'f'), + ('\r', 'r'), + ('\x1b', 'e'), + ('"', '"'), + ('\\', '\\'), +]; + +fn escapetable_cmpfunc(a: const *opaque, b: const *opaque) int = + (a: *(rune, rune)).0: int - (b: *(rune, rune)).0: int; + +// Quotes a Kojote string and writes it to the provided I/O handle. +export fn quote(sink: io::handle, s: str) (size | io::error) = { + if (len(s) == 0) { + return io::writeall(sink, ['\"', '\"'])?; + }; + + let z = io::writeall(sink, ['\"'])?; + + const iter = strings::iter(s); + for (let rn => strings::next(&iter)) { + if (isgraph(rn)) { + z += io::writeall(sink, utf8::encoderune(rn))?; + continue; + }; + + match (sort::search( + escapetable: []opaque, + size((rune, rune)), + &rn: const *opaque, + &escapetable_cmpfunc)) { + case let ix: size => + z += io::writeall(sink, ['\\'])?; + z += io::writeall(sink, + utf8::encoderune(escapetable[ix].1))?; + case void => + let buf: [16]u8 = [0...]; + const esc = fmt::bsprintf(buf, `\x{:x};`, rn: u32); + z += io::writeall(sink, strings::toutf8(esc))?; + }; + }; + + z += io::writeall(sink, ['\"'])?; + return z; +}; + +// Quotes a Kojote string and returns a new string. The caller must free the +// return value. 
+export fn quotestr(s: str) str = { + const sink = memio::dynamic(); + quote(&sink, s)!; + return memio::string(&sink)!; +}; + diff --git a/kojote/lex/token.ha b/kojote/lex/token.ha index 44da63e..db6c1df 100644 --- a/kojote/lex/token.ha +++ b/kojote/lex/token.ha @@ -1,50 +1,76 @@ use ascii; use fmt; +use strings; +use encoding::utf8; +use memio; +use io; +use sort; -// Returns a string representation of the token. -// -// This string representation may not correspond one to one with the source, -// but it is guaranteed to be a syntactically-valid construct, such that -// parsing it results in the same token. -export fn tokstr(tok: token) str = { - static let buf: [128]u8 = [0...]; +def punctrepr: [_]u8 = ['[', ']', '{', '}']; + +// Writes the human representation of a token to the provided I/O handle. +export fn tokrepr(sink: io::handle, tok: token) (size | io::error) = { + let z = 0z; + + if (tok.0: size < len(punctrepr)) { + return io::writeall(sink, [punctrepr[tok.0]])?; + }; switch (tok.0) { - case ty::QUOT_START => - return "["; - case ty::QUOT_END => - return "]"; - case ty::MAP_START => - return "{"; - case ty::MAP_END => - return "}"; case ty::COMMENT => - return fmt::bsprintf(buf, "({})", tok.1 as str); + z += io::writeall(sink, ['('])?; + z += io::writeall(sink, strings::toutf8(tok.1 as str))?; + z += io::writeall(sink, [')'])?; case ty::WORD => - return tok.1 as str; + z += io::writeall(sink, strings::toutf8(tok.1 as str))?; case ty::SYMBOL => - return fmt::bsprintf(buf, "\\{}", tok.1 as str); + z += io::writeall(sink, ['\\'])?; + z += io::writeall(sink, strings::toutf8(tok.1 as str))?; case ty::KEYWORD => - return fmt::bsprintf(buf, ":{}", tok.1 as str); + z += io::writeall(sink, [':'])?; + z += io::writeall(sink, strings::toutf8(tok.1 as str))?; case ty::STRING => - return fmt::bsprintf(buf, "\"{}\"", escape(tok.1 as str)); + z += quote(sink, tok.1 as str)?; case ty::CHAR => let rn = tok.1 as rune; - for (let i = 0z; i < len(longcharnames); i += 1) { 
- if (longcharnames[i].1 == rn) { - return fmt::bsprintf(buf, "#\\{}", - longcharnames[i].0); + let named = false; + + z += io::writeall(sink, ['#', '\\'])?; + + for (let ch .. longcharnames) { + if (ch.1 == rn) { + z += io::writeall(sink, strings::toutf8(ch.0))?; + named = true; + break; }; }; - if (isgraph(rn)) { - return fmt::bsprintf(buf, "#\\{}", rn); - } else { - return fmt::bsprintf(buf, "#\\x{:x};", rn: u32); + + if (!named) { + if (isgraph(rn)) { + z += io::writeall(sink, utf8::encoderune(rn))?; + } else { + z += fmt::fprintf(sink, "x{:x};", rn: u32)?; + }; }; case ty::NUMBER => - return tok.1 as str; + z += io::writeall(sink, strings::toutf8(tok.1 as str))?; case ty::BOOLEAN => - return fmt::bsprintf(buf, "#{}", - if (tok.1 as bool) 't' else 'f'); + z += io::writeall(sink, ['#'])?; + z += io::writeall(sink, + if (tok.1 as bool) ['t'] else ['f'])?; + case => + // unreachable + abort(); }; + + return z; }; + +// Returns the human representation of a token as a new string. The caller must +// free the return value. +export fn tokstr(tok: token) str = { + const sink = memio::dynamic(); + tokrepr(&sink, tok)!; + return memio::string(&sink)!; +}; + diff --git a/kojote/lex/util.ha b/kojote/lex/util.ha index fb5eacf..56a56b1 100644 --- a/kojote/lex/util.ha +++ b/kojote/lex/util.ha @@ -1,10 +1,3 @@ -use ascii; -use fmt; -use memio; -use strings; -use io; -use sort; - use unicode; def graphtable: [unicode::gc::Zs]bool = [ @@ -17,53 +10,5 @@ def graphtable: [unicode::gc::Zs]bool = [ false, false, false, // Z ]; -// Sorted to use with [[sort::search]]. -def escapetable: [_](rune, rune) = [ - ('\0', '0'), - ('\a', 'a'), - ('\b', 'b'), - ('\t', 't'), - ('\n', 'n'), - ('\v', 'v'), - ('\f', 'f'), - ('\r', 'r'), - ('\x1b', 'e'), - ('"', '"'), - ('\\', '\\'), -]; - -fn escapetable_cmpfunc(a: const *opaque, b: const *opaque) int = - (a: *(rune, rune)).0: int - (b: *(rune, rune)).0: int; - -// Returns whether a rune is a graphical character. 
fn isgraph(r: rune) bool = if (r == ' ') true else graphtable[unicode::rune_gc(r)]; - -// Escapes a string. -fn escape(s: str) str = { - static let buf: [2048]u8 = [0...]; - let buf = memio::fixed(buf); - let iter = strings::iter(s); - - for (let ch => strings::next(&iter)) { - if (isgraph(ch)) { - memio::appendrune(&buf, ch)!; - } else { - match (sort::search( - escapetable: []opaque, - size((rune, rune)), - &ch: const *opaque, - &escapetable_cmpfunc)) { - case let sz: size => - memio::appendrune(&buf, '\\')!; - memio::appendrune(&buf, escapetable[sz].1)!; - case void => - static let hexbuf: [8]u8 = [0...]; - memio::concat(&buf, fmt::bsprintf( - hexbuf, `\x{:x};`, ch: u32))!; - }; - }; - }; - - return memio::string(&buf)!; -};