From d7b6b380fe6d13af4455759cbb14c9cb2e6b474e Mon Sep 17 00:00:00 2001 From: Lobo Torres Date: Thu, 5 Dec 2024 22:06:21 -0300 Subject: [PATCH] lex: rework types --- kojote/lex/+test.ha | 131 ++++++++++++++++++++++++++++ parse/lex.ha => kojote/lex/lexer.ha | 130 +++++++++++++++------------ kojote/lex/types.ha | 51 +++++++++++ parse/+test/lexer.ha | 117 ------------------------- parse/types.ha | 32 ------- test.kj | 7 +- 6 files changed, 258 insertions(+), 210 deletions(-) create mode 100644 kojote/lex/+test.ha rename parse/lex.ha => kojote/lex/lexer.ha (65%) create mode 100644 kojote/lex/types.ha delete mode 100644 parse/+test/lexer.ha delete mode 100644 parse/types.ha diff --git a/kojote/lex/+test.ha b/kojote/lex/+test.ha new file mode 100644 index 0000000..5ece579 --- /dev/null +++ b/kojote/lex/+test.ha @@ -0,0 +1,131 @@ +use memio; +use fmt; +use strings; +use io; + +type dummytoken = (ty, value); + +@test fn next() void = { + const cases: [_](str, []dummytoken) = [ + (`"hello" \greeting def`, + [ + (ty::STRING, "hello"), + (ty::SYMBOL, "greeting"), + (ty::WORD, "def") + ]), + (`[dup *] (a -- a) \square def`, + [ + (ty::QUOT_START, void), + (ty::WORD, "dup"), + (ty::WORD, "*"), + (ty::QUOT_END, void), + (ty::COMMENT, "a -- a"), + (ty::SYMBOL, "square"), + (ty::WORD, "def"), + ]), + (`#t #f`, + [ + (ty::BOOLEAN, true), + (ty::BOOLEAN, false), + ]), + (`#\a #\space #\nul`, + [ + (ty::CHAR, 'a'), + (ty::CHAR, ' '), + (ty::CHAR, '\0'), + ]), + (`"\x0a;\x2014;\x2f9f4;"`, + [ + (ty::STRING, "\n—嶲"), + ]), + (`#\x #\x0a; #\x2014; #\x2f9f4;`, + [ + (ty::CHAR, 'x'), + (ty::CHAR, '\n'), + (ty::CHAR, '—'), + (ty::CHAR, '嶲'), + ]), + ]; + + for (let i = 0z; i < len(cases); i += 1) { + const src = strings::toutf8(cases[i].0); + const src = memio::fixed(src); + const lexer = newlexer(&src, ""); + defer close(&lexer); + + for (let j = 0z; j < len(cases[i].1); j += 1) { + const want = cases[i].1[j]; + const have = match (next(&lexer)) { + case let tok: token => + yield tok; + case io::EOF => + assert(false, "reached EOF"); + return; + case let err: error => + assert(false, strerror(err)); + return; + }; + + if (!tokeq(have, want)) { + fmt::printf("Expected:\n\t")!; + fmt::println(tokpp(want.0, want.1))!; + fmt::printf("Got:\n\t")!; + fmt::println(tokpp(have.0, have.1))!; + assert(false); + }; + }; + + assert(next(&lexer) is io::EOF); + }; +}; + +fn tokeq(have: token, want: dummytoken) bool = + have.0 == want.0 && match (have.1) { + case void => + yield true; + case let s: str => + yield want.1 is str && (want.1 as str) == s; + case let r: rune => + yield want.1 is rune && (want.1 as rune) == r; + case let b: bool => + yield want.1 is bool && (want.1 as bool) == b; + }; + +fn tokpp(ty: ty, value: value) const str = { + static let buf: [128]u8 = [0...]; + + switch (ty) { + case ty::QUOT_START => + return "["; + case ty::QUOT_END => + return "]"; + case ty::MAP_START => + return "{"; + case ty::MAP_END => + return "}"; + case ty::COMMENT => + return fmt::bsprintf(buf, "({})", value as str); + case ty::WORD => + return value as str; + case ty::SYMBOL => + return fmt::bsprintf(buf, "\\{}", value as str); + case ty::KEYWORD => + return fmt::bsprintf(buf, ":{}", value as str); + case ty::STRING => + return fmt::bsprintf(buf, "\"{}\"", value as str); + case ty::CHAR => + let rn = value as rune; + for (let i = 0z; i < len(longcharnames); i += 1) { + if (longcharnames[i].1 == rn) { + return fmt::bsprintf(buf, "#\\{}", + longcharnames[i].0); + }; + }; + return fmt::bsprintf(buf, "#\\{}", rn); + case ty::NUMBER => + return value as str; + case ty::BOOLEAN => + return fmt::bsprintf(buf, "#{}", + if (value as bool) 't' else 'f'); + }; +}; diff --git a/parse/lex.ha b/kojote/lex/lexer.ha similarity index 65% rename from parse/lex.ha rename to kojote/lex/lexer.ha index b87b946..93cc318 100644 --- a/parse/lex.ha +++ b/kojote/lex/lexer.ha @@ -27,16 +27,19 @@ def longcharnames: [_](str, rune) = [ export type lexer = struct { in: io::handle, strbuf: memio::stream, + commentbuf: memio::stream, path: str, loc: (uint, uint), prevloc: (uint, uint), unread: (rune | void), }; +// Creates a new [[lexer]] for the given [[io::handle]]. The path is borrowed export fn newlexer(in: io::handle, path: str) lexer = { return lexer { in = in, strbuf = memio::dynamic(), + commentbuf = memio::dynamic(), path = path, loc = (1, 0), unread = void, @@ -44,11 +47,14 @@ export fn newlexer(in: io::handle, path: str) lexer = { }; }; +// Frees resources associated with a [[lexer]]. export fn close(lex: *lexer) void = { io::close(&lex.strbuf)!; + io::close(&lex.commentbuf)!; }; -export fn lex(lex: *lexer) (token | io::EOF | error) = { +// Returns the next token from the lexer. +export fn next(lex: *lexer) (token | io::EOF | error) = { const rn = match (nextrunews(lex)?) { case io::EOF => return io::EOF; @@ -58,30 +64,30 @@ export fn lex(lex: *lexer) (token | io::EOF | error) = { switch (rn) { case '(' => - return comment{ v = scancomment(lex)? }; + return mktoken(lex, ty::COMMENT, scancomment(lex)?); case ')' => - return lex.loc: invalid; + return mkerror(lex, "invalid token"); case '[' => - return quotstart; + return mktoken(lex, ty::QUOT_START, void); case ']' => - return quotend; + return mktoken(lex, ty::QUOT_END, void); case '{' => - return mapstart; + return mktoken(lex, ty::MAP_START, void); case '}' => - return mapend; + return mktoken(lex, ty::MAP_END, void); case '\\' => let v = scanword(lex)?; if (len(v) == 0) { - return lex.loc: invalid; + return mkerror(lex, "invalid symbol literal"); } else { - return symbol{ v = v, kw = false }; + return mktoken(lex, ty::SYMBOL, v); }; case ':' => let v = scanword(lex)?; if (len(v) == 0) { - return lex.loc: invalid; + return mkerror(lex, "invalid keyword"); } else { - return symbol{ v = v, kw = true }; + return mktoken(lex, ty::KEYWORD, v); }; case '#' => return scanpound(lex)?; @@ -92,7 +98,7 @@ export fn lex(lex: *lexer) (token | io::EOF | error) = { }; unget(lex, rn); - return word{ v = scanword(lex)? }; + return mktoken(lex, ty::WORD, scanword(lex)?); }; fn nextrune(lex: *lexer) (rune | io::EOF | error) = { @@ -124,7 +130,7 @@ fn nextrune(lex: *lexer) (rune | io::EOF | error) = { case let err: io::error => return err; case utf8::invalid => - return lex.loc: invalid; + return mkerror(lex, "invalid UTF-8 sequence"); }; }; @@ -132,9 +138,7 @@ fn nextrunews(lex: *lexer) (rune | io::EOF | error) = { for (true) { match (nextrune(lex)?) { case let rn: rune => - if (isspace(rn)) { - continue; - }; + if (isspace(rn)) continue; return rn; case io::EOF => return io::EOF; @@ -167,36 +171,36 @@ fn scanword(lex: *lexer) (str | error) = { }; fn scancomment(lex: *lexer) (str | error) = { - memio::reset(&lex.strbuf); + memio::reset(&lex.commentbuf); for (true) { const rn = match (nextrune(lex)?) { case let rn: rune => yield rn; case io::EOF => - return ("comment", lex.loc.0, lex.loc.1): unterminated; + return mkerror(lex, "unterminated comment"); }; switch (rn) { case '(' => - return lex.loc: invalid; + return mkerror(lex, "nested comments are not allowed"); case ')' => break; case => - memio::appendrune(&lex.strbuf, rn)!; + memio::appendrune(&lex.commentbuf, rn)!; }; }; - return memio::string(&lex.strbuf)!; + return memio::string(&lex.commentbuf)!; }; -fn scanstr(lex: *lexer) (str | error) = { +fn scanstr(lex: *lexer) (token | error) = { memio::reset(&lex.strbuf); for (true) { const rn = match (nextrune(lex)?) { case let rn: rune => yield rn; case io::EOF => - return ("string literal", lex.loc.0, lex.loc.1): unterminated; + return mkerror(lex, "unterminated string literal"); }; switch (rn) { @@ -207,7 +211,7 @@ fn scanstr(lex: *lexer) (str | error) = { memio::appendrune(&lex.strbuf, rn)!; }; }; - return memio::string(&lex.strbuf)!; + return mktoken(lex, ty::STRING, memio::string(&lex.strbuf)!); }; fn scanpound(lex: *lexer) (token | error) = { @@ -215,22 +219,22 @@ fn scanpound(lex: *lexer) (token | error) = { case let rn: rune => yield rn; case io::EOF => - return ("pound literal", lex.loc.0, lex.loc.1): unterminated; + return mkerror(lex, "unterminated pound literal"); }; switch (rn) { case 't' => - return true; + return mktoken(lex, ty::BOOLEAN, true); case 'f' => - return false; + return mktoken(lex, ty::BOOLEAN, false); case '\\' => - return scanchar(lex)?; + return scanchar(lex); case => - return lex.loc: invalid; + return mkerror(lex, "invalid pound literal"); }; }; -fn scanchar(lex: *lexer) (rune | error) = { +fn scanchar(lex: *lexer) (token | error) = { static let namebuf: [16]u8 = [0...]; let namebuf = memio::fixed(namebuf); @@ -238,31 +242,35 @@ fn scanchar(lex: *lexer) (rune | error) = { case let rn: rune => yield rn; case io::EOF => - return ("character literal", lex.loc.0, lex.loc.1): unterminated; + return mkerror(lex, "unterminated character literal"); }; + let ret: rune = '\0'; + match (nextrune(lex)?) { - case let rnn: rune => - unget(lex, rnn); - if (isspace(rnn)) { - return rn; - } else { - if (rn == 'x') { - return scanescape2(lex); - } else { - memio::appendrune(&namebuf, rn)!; - memio::concat(&namebuf, scanword(lex)?)!; - const name = memio::string(&namebuf)!; - for (let i = 0z; i < len(longcharnames); i += 1) { - if (name == longcharnames[i].0) { - return longcharnames[i].1; - }; - }; - return lex.loc: invalid; - }; - }; case io::EOF => - return rn; + return mktoken(lex, ty::CHAR, rn); + case let next: rune => + unget(lex, next); + + if (isspace(next)) { + return mktoken(lex, ty::CHAR, rn); + }; + + if (rn == 'x') { + return mktoken(lex, ty::CHAR, scanescape2(lex)?); + } else { + memio::appendrune(&namebuf, rn)!; + memio::concat(&namebuf, scanword(lex)?)!; + const name = memio::string(&namebuf)!; + for (let i = 0z; i < len(longcharnames); i += 1) { + if (name == longcharnames[i].0) { + return mktoken(lex, ty::CHAR, + longcharnames[i].1); + }; + }; + return mkerror(lex, "invalid named character literal"); + }; }; }; @@ -271,7 +279,7 @@ fn scanescape(lex: *lexer) (rune | error) = { case let rn: rune => yield rn; case io::EOF => - return ("escape sequence", lex.loc.0, lex.loc.1): unterminated; + return mkerror(lex, "unterminated character escape"); }; switch (rn) { @@ -288,7 +296,7 @@ fn scanescape(lex: *lexer) (rune | error) = { case '"' => return '"'; case 'x' => return scanescape2(lex)?; case => - return lex.loc: invalid; + return mkerror(lex, "invalid character escape"); }; }; @@ -304,7 +312,7 @@ fn scanescape2(lex: *lexer) (rune | error) = { case let rn: rune => yield rn; case io::EOF => - return ("escape sequence", lex.loc.0, lex.loc.1): unterminated; + return mkerror(lex, "unterminated character escape"); }; const buf: [6]u8 = [0...]; @@ -317,11 +325,11 @@ fn scanescape2(lex: *lexer) (rune | error) = { case let rn: rune => yield rn; case io::EOF => - return ("escape sequence", lex.loc.0, lex.loc.1): unterminated; + return mkerror(lex, "unterminated escape sequence"); }; if (count > 6) { - return lex.loc: invalid; + return mkerror(lex, "invalid escape sequence"); } else if (rn == ';') { break; } else { @@ -337,10 +345,16 @@ fn scanescape2(lex: *lexer) (rune | error) = { case let codepoint: u32 => return codepoint: rune; case => - return lex.loc: invalid; + return mkerror(lex, "invalid escape sequence"); }; }; +fn mktoken(lex: *lexer, ty: ty, value: value) token = + (ty, value, location{ path = lex.path, line = lex.loc.0, column = lex.loc.1 }); + +fn mkerror(lex: *lexer, msg: const str) syntax = + (location{ path = lex.path, line = lex.loc.0, column = lex.loc.1 }, msg); + fn isspace(rn: rune) bool = { if (ascii::isspace(rn)) { return true; @@ -355,7 +369,7 @@ fn isspace(rn: rune) bool = { }; fn isdelimiter(rn: rune) bool = { - match (strings::index(`()[]{}\:#`, rn)) { + match (strings::index(`()[]{}`, rn)) { case size => return true; case => diff --git a/kojote/lex/types.ha b/kojote/lex/types.ha new file mode 100644 index 0000000..3311e72 --- /dev/null +++ b/kojote/lex/types.ha @@ -0,0 +1,51 @@ +use io; +use fmt; + +// A syntax error. +export type syntax = !(location, str); + +// All possible lexer errors +export type error = !(io::error | syntax); + +// A token type +export type ty = enum uint { + QUOT_START, + QUOT_END, + MAP_START, + MAP_END, + COMMENT, + WORD, + SYMBOL, + KEYWORD, + STRING, + CHAR, + NUMBER, + BOOLEAN, +}; + +// A token value, used for literal tokens and comments. +export type value = (str | rune | bool | void); + +// A location within a source file. +// The path is borrowed from the file name given to the lexer. +export type location = struct { + path: str, + line: uint, + column: uint, +}; + +// A single lexical token. +export type token = (ty, value, location); + +// Returns a human-friendly string for a given error. The result may be +// statically allocated. +export fn strerror(err: error) const str = { + static let buf: [512]u8 = [0...]; + match (err) { + case let err: io::error => + return io::strerror(err); + case let s: syntax => + return fmt::bsprintf(buf, "{}:{}:{}: syntax error: {}", + s.0.path, s.0.line, s.0.column, s.1); + }; +}; diff --git a/parse/+test/lexer.ha b/parse/+test/lexer.ha deleted file mode 100644 index 5c883cf..0000000 --- a/parse/+test/lexer.ha +++ /dev/null @@ -1,117 +0,0 @@ -use memio; -use fmt; -use strings; -use io; - -@test fn lex() void = { - const cases: [_](str, []token) = [ - (`"hello" \greeting def`, - ["hello", mksym("greeting"), mkword("def")]), - (`[dup *] (a -- a) \square def`, - [quotstart, mkword("dup"), mkword("*"), quotend, - mkcomment("a -- a"), mksym("square"), - mkword("def")]), - (`#t #f`, [true, false]), - (`#\a #\space #\nul`, ['a', ' ', '\0']), - (`"\x0a;" "\x2014;" "\x2f9f4;"`, ["\n", "—", "嶲"]), - (`#\x #\x0a; #\x2014; #\x2f9f4;`, ['x', '\n', '—', '嶲']), - ]; - - for (let i = 0z; i < len(cases); i += 1) { - const src = strings::toutf8(cases[i].0); - const src = memio::fixed(src); - const lexer = newlexer(&src, ""); - defer close(&lexer); - - for (let j = 0z; j < len(cases[i].1); j += 1) { - const want = cases[i].1[j]; - const have = match (lex(&lexer)) { - case let tok: token => - yield tok; - case io::EOF => - assert(false, "reached EOF"); - return; - case let err: error => - assert(false, strerror(err)); - return; - }; - - if (!tokeq(want, have)) { - fmt::printfln("Case {}: {}", i, cases[i].0)!; - fmt::print("\tExpected: ")!; - tokpp(want); - fmt::print("\tGot: ")!; - tokpp(have); - assert(false); - }; - }; - - assert(lex(&lexer) is io::EOF); - }; -}; - -fn tokeq(have: token, want: token) bool = { - match (want) { - case quotstart => - return have is quotstart; - case quotend => - return have is quotend; - case mapstart => - return have is mapstart; - case mapend => - return have is mapend; - case let w: word => - return (have as word).v == w.v; - case let s: str => - return have as str == s; - case let s: symbol => - return (have as symbol).v == s.v; - case let c: comment => - return (have as comment).v == c.v; - case let r: rune => - return have as rune == r; - case let b: bool => - return have as bool == b; - }; -}; - -fn tokpp(tok: token) void = { - match (tok) { - case quotstart => - fmt::println("[")!; - case quotend => - fmt::println("]")!; - case mapstart => - fmt::println("{")!; - case mapend => - fmt::println("}")!; - case let w: word => - fmt::println(w.v)!; - case let s: symbol => - fmt::printfln("{}{}", if (s.kw) ":" else "\\", s.v)!; - case let s: str => - fmt::printfln(`"{}"`, s)!; - case let c: comment => - fmt::printfln("({})", c.v)!; - case let r: rune => - for (let i = 0z; i < len(longcharnames); i += 1) { - if (r == longcharnames[i].1) { - fmt::printfln("#\\{}", longcharnames[i].0)!; - return; - }; - }; - fmt::printfln("#\\{}", r)!; - case let b: bool => - fmt::println(if (b) "#t" else "#f")!; - }; -}; - -fn mkword(v: const str) word = - word{ v = v }; - -fn mkcomment(v: const str) comment = - comment{ v = v }; - -fn mksym(v: const str, kw: bool = false) symbol = - symbol{ v = v, kw = kw }; - diff --git a/parse/types.ha b/parse/types.ha deleted file mode 100644 index 809ed7e..0000000 --- a/parse/types.ha +++ /dev/null @@ -1,32 +0,0 @@ -use io; -use fmt; - -export type invalid = !(uint, uint); -export type unterminated = !(const str, uint, uint); -export type error = !(invalid | unterminated | io::error); - -export type quotstart = void; -export type quotend = void; -export type mapstart = void; -export type mapend = void; - -export type comment = struct { v: str }; -export type word = struct { v: str }; -export type symbol = struct { v: str, kw: bool }; - -export type token = (quotstart | quotend | mapstart | mapend | - word | symbol | comment | str | rune | bool); - -export fn strerror(err: error) const str = { - static let buf: [64]u8 = [0...]; - match (err) { - case let err: invalid => - return fmt::bsprintf(buf, - "Invalid token found at {}:{}", err.0, err.1); - case let err: unterminated => - return fmt::bsprintf(buf, - "Unterminated {} found at {}:{}", err.0, err.1, err.2); - case let err: io::error => - return io::strerror(err); - }; -}; diff --git a/test.kj b/test.kj index 0a65d59..958f7cf 100644 --- a/test.kj +++ b/test.kj @@ -1,5 +1,6 @@ -3.14159 \pi def -[dup *] \square def +( hello world! ) +\pi 3.14159 def +\square [dup *] def +\circarea [square pi *] def -[square pi *] \circarea def 20 circarea . ( => 1256.636 )