use ascii;
use bufio;
use encoding::utf8;
use io;
use memio;
use unicode;
use strconv;
use fmt;
use strings;

// Lexer: turns a stream of runes into tokens (words, symbols, strings,
// comments, booleans, character literals and quotation/map brackets).

// Long names accepted after "#\" in character literals, paired with the rune
// each one stands for. "newline" and "linefeed" both denote U+000A; tokpprint
// prints the first matching entry, so U+000A is shown as "newline".
def longcharnames: [_](str, rune) = [
	("nul", '\u0000'),
	("alarm", '\u0007'),
	("backspace", '\u0008'),
	("newline", '\u000a'),
	("tab", '\u0009'),
	("linefeed", '\u000a'),
	("vtab", '\u000b'),
	("page", '\u000c'),
	("return", '\u000d'),
	("esc", '\u001b'),
	("space", '\u0020'),
	("delete", '\u007f'),
];

// State for tokenizing one input stream.
export type lexer = struct {
	// Source the runes are read from.
	in: io::handle,
	// Scratch buffer in which word, comment and string text is assembled.
	strbuf: memio::stream,
	// Path passed to [[newlexer]].
	path: str,
	// (line, column) after the most recently consumed rune.
	loc: (uint, uint),
	// Value of loc before the most recently consumed rune; restored by
	// unget.
	prevloc: (uint, uint),
	// One rune of pushback, or void when there is none.
	unread: (rune | void),
};

// Creates a lexer that reads from in. The location starts at line 1,
// column 0. Pass the result to [[close]] when done.
export fn newlexer(in: io::handle, path: str) lexer = {
	return lexer {
		in = in,
		strbuf = memio::dynamic(),
		path = path,
		loc = (1, 0),
		unread = void,
		...
	};
};

// Releases the lexer's scratch buffer. The input handle is not closed.
export fn close(lex: *lexer) void = {
	io::close(&lex.strbuf)!;
};

// Returns the next token from the input, io::EOF once only whitespace
// remains, or an error for malformed input.
//
// Text carried by word, symbol, comment and string tokens comes from
// memio::string on the lexer's scratch buffer, and later scans reset that
// buffer. NOTE(review): callers that keep such text across calls presumably
// need to copy it first -- confirm against memio's documentation.
export fn lex(lex: *lexer) (token | io::EOF | error) = {
	const rn = match (nextrunews(lex)?) {
	case io::EOF =>
		return io::EOF;
	case let rn: rune =>
		yield rn;
	};

	switch (rn) {
	case '(' =>
		return comment { v = scancomment(lex)? };
	case ')' =>
		// A closing parenthesis with no comment open.
		return lex.loc: invalid;
	case '[' =>
		return quotstart;
	case ']' =>
		return quotend;
	case '{' =>
		return mapstart;
	case '}' =>
		return mapend;
	case '\\' =>
		return scansymbol(lex, false)?;
	case ':' =>
		return scansymbol(lex, true)?;
	case '#' =>
		return scanhash(lex)?;
	case '"' =>
		return scanstr(lex)?;
	case =>
		yield;
	};

	// Anything else begins a plain word; push the rune back so scanword
	// sees it.
	unget(lex, rn);
	return word { v = scanword(lex)? };
};

// Scans the name following a '\' (kw = false) or ':' (kw = true) prefix. An
// empty name is reported as invalid.
fn scansymbol(lex: *lexer, kw: bool) (symbol | error) = {
	const v = scanword(lex)?;
	if (len(v) == 0) {
		return lex.loc: invalid;
	};
	return symbol { v = v, kw = kw };
};

// Records that rn was consumed: saves the current location in prevloc, then
// moves to the next line on '\n' or to the next column otherwise.
fn advance(lex: *lexer, rn: rune) void = {
	lex.prevloc = lex.loc;
	if (rn == '\n') {
		lex.loc = (lex.loc.0 + 1, 0);
	} else {
		lex.loc.1 += 1;
	};
};

// Returns the next rune, taking the pushed-back rune first if there is one,
// and updates the location. Invalid UTF-8 in the input is reported as
// invalid at the current location.
fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
	match (lex.unread) {
	case let rn: rune =>
		lex.unread = void;
		advance(lex, rn);
		return rn;
	case void =>
		yield;
	};

	match (bufio::read_rune(lex.in)) {
	case let rn: rune =>
		advance(lex, rn);
		return rn;
	case io::EOF =>
		return io::EOF;
	case let err: io::error =>
		return err;
	case utf8::invalid =>
		return lex.loc: invalid;
	};
};

// Like nextrune, but skips runes for which isspace is true.
fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
	for (true) {
		match (nextrune(lex)?) {
		case let rn: rune =>
			if (isspace(rn)) {
				continue;
			};
			return rn;
		case io::EOF =>
			return io::EOF;
		};
	};
};

// Pushes rn back so the next nextrune call returns it, and restores the
// location from before it was read. Only one rune of pushback is supported;
// a second unget without an intervening read fails the assertion.
fn unget(lex: *lexer, rn: rune) void = {
	assert(lex.unread is void);
	lex.unread = rn;
	lex.loc = lex.prevloc;
};

// Appends runes to the scratch buffer until whitespace, a delimiter or EOF.
// The terminating rune, if any, is pushed back. The buffer is not reset
// first, so callers may seed it with a prefix.
fn scanrest(lex: *lexer) (void | error) = {
	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			break;
		};
		if (isspace(rn) || isdelimiter(rn)) {
			unget(lex, rn);
			break;
		};
		memio::appendrune(&lex.strbuf, rn)!;
	};
};

// Scans a run of non-space, non-delimiter runes into the scratch buffer and
// returns it. The result may be empty.
fn scanword(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	scanrest(lex)?;
	return memio::string(&lex.strbuf)!;
};

// Scans the body of a comment; the opening '(' has already been consumed.
// Returns the text up to, but not including, the closing ')'. A nested '('
// is reported as invalid, and EOF before ')' as unterminated.
fn scancomment(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			return ("comment", lex.loc.0, lex.loc.1): unterminated;
		};
		switch (rn) {
		case '(' =>
			return lex.loc: invalid;
		case ')' =>
			break;
		case =>
			memio::appendrune(&lex.strbuf, rn)!;
		};
	};
	return memio::string(&lex.strbuf)!;
};

// Scans the body of a string literal; the opening '"' has already been
// consumed. Backslash escapes are decoded by scanescape. EOF before the
// closing '"' is reported as unterminated.
fn scanstr(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			return ("string literal", lex.loc.0, lex.loc.1): unterminated;
		};
		switch (rn) {
		case '"' =>
			break;
		case '\\' =>
			memio::appendrune(&lex.strbuf, scanescape(lex)?)!;
		case =>
			memio::appendrune(&lex.strbuf, rn)!;
		};
	};
	return memio::string(&lex.strbuf)!;
};

// Scans what follows a '#': "t" yields true, "f" yields false and "\"
// introduces a character literal. Any other rune is invalid.
fn scanhash(lex: *lexer) (token | error) = {
	const rn = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return ("hash literal", lex.loc.0, lex.loc.1): unterminated;
	};
	switch (rn) {
	case 't' =>
		return true;
	case 'f' =>
		return false;
	case '\\' =>
		return scanchar(lex)?;
	case =>
		return lex.loc: invalid;
	};
};

// Scans a character literal; the leading "#\" has already been consumed.
//
// A single rune followed by whitespace, a delimiter or EOF denotes itself.
// Otherwise the rune and the word that follows it are looked up in
// longcharnames; an unknown name is invalid.
fn scanchar(lex: *lexer) (rune | error) = {
	const rn = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return ("character literal", lex.loc.0, lex.loc.1): unterminated;
	};

	// Peek one rune ahead to tell "#\a" apart from "#\alarm". Delimiters
	// end the literal just as whitespace does, so "[#\a]" lexes the same
	// way "[#\space]" does.
	match (nextrune(lex)?) {
	case io::EOF =>
		return rn;
	case let next: rune =>
		unget(lex, next);
		if (isspace(next) || isdelimiter(next)) {
			return rn;
		};
	};

	// Build the name in the growable scratch buffer. A fixed-size buffer
	// would let an overlong name in the input abort the program instead
	// of being reported as invalid.
	memio::reset(&lex.strbuf);
	memio::appendrune(&lex.strbuf, rn)!;
	scanrest(lex)?;
	const name = memio::string(&lex.strbuf)!;

	for (let i = 0z; i < len(longcharnames); i += 1) {
		if (name == longcharnames[i].0) {
			return longcharnames[i].1;
		};
	};
	return lex.loc: invalid;
};

// Decodes the rune following a backslash inside a string literal. The
// supported escapes are \" \\ \n and \t; anything else is invalid.
fn scanescape(lex: *lexer) (rune | error) = {
	const rn = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return ("escape sequence", lex.loc.0, lex.loc.1): unterminated;
	};
	switch (rn) {
	case '"' =>
		return '"';
	case '\\' =>
		return '\\';
	case 'n' =>
		return '\n';
	case 't' =>
		return '\t';
	case =>
		return lex.loc: invalid;
	};
};

// Reports whether rn is ASCII whitespace or has the Unicode general category
// Zs.
fn isspace(rn: rune) bool = {
	if (ascii::isspace(rn)) {
		return true;
	};
	switch (unicode::rune_gc(rn)) {
	case unicode::gc::Zs =>
		return true;
	case =>
		return false;
	};
};

// Reports whether rn ends a word: one of ( ) [ ] { } \ : #
fn isdelimiter(rn: rune) bool = {
	match (strings::index(`()[]{}\:#`, rn)) {
	case size =>
		return true;
	case =>
		return false;
	};
};

@test fn lex() void = {
	const cases: [_](str, []token) = [
		(`"hello" \greeting def`,
			["hello", mksym("greeting"), mkword("def")]),
		(`[dup *] (a -- a) \square def`,
			[quotstart, mkword("dup"), mkword("*"), quotend,
			mkcomment("a -- a"), mksym("square"), mkword("def")]),
		(`#t #f`, [true, false]),
		(`#\a #\space #\nul`, ['a', ' ', '\0']),
		// Character literals directly followed by a delimiter.
		(`[#\a #\space]`, [quotstart, 'a', ' ', quotend]),
	];

	for (let i = 0z; i < len(cases); i += 1) {
		const bytes = strings::toutf8(cases[i].0);
		let stream = memio::fixed(bytes);
		let lx = newlexer(&stream, "");
		defer close(&lx);

		for (let j = 0z; j < len(cases[i].1); j += 1) {
			const want = cases[i].1[j];
			const have = lex(&lx)! as token;
			if (!tokeq(have, want)) {
				fmt::printfln("Case {}: {}", i, cases[i].0)!;
				fmt::print("\tExpected: ")!;
				tokpprint(want);
				fmt::print("\tGot: ")!;
				tokpprint(have);
				assert(false);
			};
		};

		assert(lex(&lx) is io::EOF);
	};
};

// Reports whether have is the same kind of token as want and carries the
// same payload. A token of a different kind yields false rather than failing
// a type assertion, so the caller can still print both tokens.
fn tokeq(have: token, want: token) bool = {
	match (want) {
	case quotstart =>
		return have is quotstart;
	case quotend =>
		return have is quotend;
	case mapstart =>
		return have is mapstart;
	case mapend =>
		return have is mapend;
	case let w: word =>
		return have is word && (have as word).v == w.v;
	case let s: str =>
		return have is str && (have as str) == s;
	case let s: symbol =>
		return have is symbol
			&& (have as symbol).v == s.v
			&& (have as symbol).kw == s.kw;
	case let c: comment =>
		return have is comment && (have as comment).v == c.v;
	case let r: rune =>
		return have is rune && (have as rune) == r;
	case let b: bool =>
		return have is bool && (have as bool) == b;
	};
};

// Prints tok in source-like form, followed by a newline. Runes that have an
// entry in longcharnames are printed by name.
fn tokpprint(tok: token) void = {
	match (tok) {
	case quotstart =>
		fmt::println("[")!;
	case quotend =>
		fmt::println("]")!;
	case mapstart =>
		fmt::println("{")!;
	case mapend =>
		fmt::println("}")!;
	case let w: word =>
		fmt::println(w.v)!;
	case let s: symbol =>
		fmt::printfln("{}{}", if (s.kw) ":" else "\\", s.v)!;
	case let s: str =>
		fmt::printfln(`"{}"`, s)!;
	case let c: comment =>
		fmt::printfln("({})", c.v)!;
	case let r: rune =>
		for (let i = 0z; i < len(longcharnames); i += 1) {
			if (r == longcharnames[i].1) {
				fmt::printfln("#\\{}", longcharnames[i].0)!;
				return;
			};
		};
		fmt::printfln("#\\{}", r)!;
	case let b: bool =>
		fmt::println(if (b) "#t" else "#f")!;
	};
};

// Test helper: builds a word token.
fn mkword(v: const str) word = word { v = v };

// Test helper: builds a comment token.
fn mkcomment(v: const str) comment = comment { v = v };

// Test helper: builds a symbol token; kw selects the ':' keyword form.
fn mksym(v: const str, kw: bool = false) symbol = symbol { v = v, kw = kw };