use ascii;
use bufio;
use encoding::utf8;
use io;
use memio;
use strings;
use unicode;

// Testing dependency
use fmt;

// Lexer state for a single input stream.
export type lexer = struct {
	// Handle the runes are read from.
	in: io::handle,
	// Scratch buffer in which word, comment and string text is accumulated.
	// It is reset at the start of every scan.
	strbuf: memio::stream,
	// Path passed to [[newlexer]]; stored but not read in this file.
	path: str,
	// Current location. Starts at (1, 0); a '\n' increments the first field
	// and zeroes the second, any other rune increments the second.
	loc: (uint, uint),
	// Location before the most recently read rune, restored by unget.
	prevloc: (uint, uint),
	// A single rune pushed back by unget, or void when there is none.
	unread: (rune | void),
};

// Creates a lexer reading from "in". The caller should pass the result to
// [[close]] when done with it.
export fn newlexer(in: io::handle, path: str) lexer = {
	return lexer {
		in = in,
		strbuf = memio::dynamic(),
		path = path,
		loc = (1, 0),
		unread = void,
		...
	};
};

// Closes the lexer's internal string buffer. The input handle is not closed
// here.
export fn close(lex: *lexer) void = {
	io::close(&lex.strbuf)!;
};

// Reads the next token from the input, skipping leading whitespace. Returns
// [[io::EOF]] once the input is exhausted.
//
// NOTE(review): token text is obtained from lex.strbuf, which every scan
// resets first, so it is presumably only valid until the next call to this
// function - confirm against the memio::string documentation.
export fn lex(lex: *lexer) (token | io::EOF | error) = {
	const rn = match (nextrunews(lex)?) {
	case io::EOF =>
		return io::EOF;
	case let rn: rune =>
		yield rn;
	};

	switch (rn) {
	case '(' =>
		return comment{ v = scancomment(lex)? };
	case ')' =>
		// A closing parenthesis with no comment open.
		return lex.loc: invalid;
	case '[' =>
		return quotstart;
	case ']' =>
		return quotend;
	case '{' =>
		return mapstart;
	case '}' =>
		return mapend;
	case '\\', ':' =>
		// Both sigils introduce a symbol; ':' marks it as a keyword.
		const v = scanword(lex)?;
		if (len(v) == 0) {
			return lex.loc: invalid;
		};
		return symbol{ v = v, kw = rn == ':' };
	case '\'' =>
		return scanchar(lex)?;
	case '"' =>
		return scanstr(lex)?;
	case =>
		// Anything else starts a word; push the rune back so that
		// scanword sees it.
		unget(lex, rn);
		return word{ v = scanword(lex)? };
	};
};

// Returns the next rune, preferring one pushed back by unget over reading
// from the input, and advances the tracked location past it.
fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
	const rn = match (lex.unread) {
	case let rn: rune =>
		lex.unread = void;
		yield rn;
	case void =>
		yield match (bufio::read_rune(lex.in)) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			return io::EOF;
		case let err: io::error =>
			return err;
		case utf8::invalid =>
			return lex.loc: invalid;
		};
	};

	// Remember where we were so that unget can restore it.
	lex.prevloc = lex.loc;
	if (rn == '\n') {
		lex.loc = (lex.loc.0 + 1, 0);
	} else {
		lex.loc.1 += 1;
	};
	return rn;
};

// Like nextrune, but skips every rune for which isspace is true.
fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
	for (true) {
		match (nextrune(lex)?) {
		case let rn: rune =>
			if (!isspace(rn)) {
				return rn;
			};
		case io::EOF =>
			return io::EOF;
		};
	};
};

// Pushes a single rune back so that the next nextrune call returns it, and
// rewinds the location. Only one rune may be pending at a time.
fn unget(lex: *lexer, rn: rune) void = {
	assert(lex.unread is void);
	lex.unread = rn;
	lex.loc = lex.prevloc;
};

// Accumulates runes up to (not including) the next whitespace rune,
// delimiter or end of input. The result may be empty.
fn scanword(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			break;
		};
		if (isspace(rn) || isdelimiter(rn)) {
			// Leave the terminator for the next token.
			unget(lex, rn);
			break;
		};
		memio::appendrune(&lex.strbuf, rn)!;
	};
	return memio::string(&lex.strbuf)!;
};

// Accumulates comment text up to the closing ')'. The opening '(' has already
// been consumed by the caller. A nested '(' is reported as invalid, and end
// of input as unterminated.
fn scancomment(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			return ("comment", lex.loc.0, lex.loc.1): unterminated;
		};
		switch (rn) {
		case '(' =>
			return lex.loc: invalid;
		case ')' =>
			break;
		case =>
			memio::appendrune(&lex.strbuf, rn)!;
		};
	};
	return memio::string(&lex.strbuf)!;
};

// Accumulates string literal text up to the closing '"', decoding backslash
// escapes via scanescape. The opening '"' has already been consumed by the
// caller. End of input is reported as unterminated.
fn scanstr(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			return ("string literal", lex.loc.0, lex.loc.1): unterminated;
		};
		switch (rn) {
		case '"' =>
			break;
		case '\\' =>
			memio::appendrune(&lex.strbuf, scanescape(lex)?)!;
		case =>
			memio::appendrune(&lex.strbuf, rn)!;
		};
	};
	return memio::string(&lex.strbuf)!;
};

// Reads the single rune (or backslash escape) that follows a "'". No closing
// quote is consumed.
fn scanchar(lex: *lexer) (rune | error) = {
	const rn = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return ("character literal", lex.loc.0, lex.loc.1): unterminated;
	};
	if (rn == '\\') {
		return scanescape(lex)?;
	};
	return rn;
};

// Decodes the rune following a backslash. Recognised escapes are \" \\ \n \t
// and \s (a space); anything else, including end of input, is invalid.
fn scanescape(lex: *lexer) (rune | error) = {
	const rn = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return lex.loc: invalid;
	};
	switch (rn) {
	case '"' =>
		return '"';
	case '\\' =>
		return '\\';
	case 'n' =>
		return '\n';
	case 't' =>
		return '\t';
	case 's' =>
		return ' ';
	case =>
		return lex.loc: invalid;
	};
};

// Reports whether rn is ASCII whitespace or has the Unicode general category
// Zs.
fn isspace(rn: rune) bool = {
	return ascii::isspace(rn)
		|| unicode::rune_gc(rn) == unicode::gc::Zs;
};

// Runes that terminate a word even without surrounding whitespace.
def delimiters = `()[]{}\:'`;

// Reports whether rn is one of the runes in delimiters.
fn isdelimiter(rn: rune) bool = {
	return strings::index(delimiters, rn) is size;
};

@test fn lex() void = {
	const cases: [_](str, []token) = [
		(
			`"hello" \greeting def`,
			[
				"hello",
				mksym("greeting"),
				mkword("def"),
			]
		),
		(
			`[dup *] (a -- a) \square def`,
			[
				quotstart,
				mkword("dup"),
				mkword("*"),
				quotend,
				mkcomment("a -- a"),
				mksym("square"),
				mkword("def"),
			]
		),
		(`'\s`, [' ']),
		(
			`:key \sym`,
			[
				mksym("key", true),
				mksym("sym"),
			]
		),
	];

	for (let i = 0z; i < len(cases); i += 1) {
		const src = strings::toutf8(cases[i].0);
		const src = memio::fixed(src);
		const lexer = newlexer(&src, "");
		defer close(&lexer);

		for (let j = 0z; j < len(cases[i].1); j += 1) {
			const want = cases[i].1[j];
			// Compared immediately: the token's text lives in the
			// lexer's scratch buffer.
			const have = lex(&lexer)! as token;
			assert(tokeq(have, want));
		};

		assert(lex(&lexer) is io::EOF);
	};
};

// Reports whether the lexed token "have" equals the expected token "want".
// Tokens of different kinds compare unequal rather than failing a type
// assertion, and symbols must also agree on their kw flag.
fn tokeq(have: token, want: token) bool = {
	match (want) {
	case quotstart =>
		return have is quotstart;
	case quotend =>
		return have is quotend;
	case mapstart =>
		return have is mapstart;
	case mapend =>
		return have is mapend;
	case let w: word =>
		return have is word && (have as word).v == w.v;
	case let s: str =>
		return have is str && (have as str) == s;
	case let s: symbol =>
		return have is symbol
			&& (have as symbol).v == s.v
			&& (have as symbol).kw == s.kw;
	case let c: comment =>
		return have is comment && (have as comment).v == c.v;
	case let r: rune =>
		return have is rune && (have as rune) == r;
	};
};

// Builds a word token with the given text.
fn mkword(v: const str) word = word{ v = v };

// Builds a comment token with the given text.
fn mkcomment(v: const str) comment = comment{ v = v };

// Builds a symbol token; kw selects the keyword (':') form.
fn mksym(v: const str, kw: bool = false) symbol = symbol{ v = v, kw = kw };