use ascii;
use bufio;
use encoding::utf8;
use fmt;
use io;
use memio;
use strconv;
use strings;
use unicode;

// Long names accepted after `#\` in character literals, paired with the rune
// each one stands for.
def longcharnames: [_](str, rune) = [
	("nul", '\u0000'),
	("alarm", '\u0007'),
	("backspace", '\u0008'),
	("tab", '\u0009'),
	("newline", '\u000a'),
	("linefeed", '\u000a'),
	("vtab", '\u000b'),
	("page", '\u000c'),
	("return", '\u000d'),
	("esc", '\u001b'),
	("space", '\u0020'),
	("delete", '\u007f'),
];

// Lexer state. Create one with [[newlexer]] and release it with [[close]].
export type lexer = struct {
	// Handle the source text is read from.
	in: io::handle,
	// Scratch buffer reused by every scan that produces a string.
	strbuf: memio::stream,
	// Path supplied to [[newlexer]].
	path: str,
	// Position pair updated for every rune consumed: the first field is
	// incremented on '\n' (and the second reset to 0), otherwise the
	// second field is incremented.
	loc: (uint, uint),
	// Value of loc before the most recently consumed rune; restored by
	// unget.
	prevloc: (uint, uint),
	// Single rune of pushback, or void when nothing is pending.
	unread: (rune | void),
};

// Creates a lexer reading from "in". The location starts at (1, 0) and no rune
// is pending.
export fn newlexer(in: io::handle, path: str) lexer = {
	return lexer {
		in = in,
		strbuf = memio::dynamic(),
		path = path,
		loc = (1, 0),
		unread = void,
		...
	};
};

// Closes the lexer's internal string buffer. The input handle is not touched.
export fn close(lex: *lexer) void = {
	io::close(&lex.strbuf)!;
};

// Skips whitespace and returns the next token, or [[io::EOF]] when the input
// is exhausted.
//
// String payloads inside returned tokens come from the lexer's scratch buffer,
// which is reset at the start of the next scan.
export fn lex(lex: *lexer) (token | io::EOF | error) = {
	const rn = match (nextrunews(lex)?) {
	case io::EOF =>
		return io::EOF;
	case let rn: rune =>
		yield rn;
	};

	switch (rn) {
	case '(' =>
		return comment{ v = scancomment(lex)? };
	case ')' =>
		// A closing parenthesis is only valid as the end of a comment.
		return lex.loc: invalid;
	case '[' =>
		return quotstart;
	case ']' =>
		return quotend;
	case '{' =>
		return mapstart;
	case '}' =>
		return mapend;
	case '\\' =>
		let v = scanword(lex)?;
		if (len(v) == 0) {
			return lex.loc: invalid;
		} else {
			return symbol{ v = v, kw = false };
		};
	case ':' =>
		let v = scanword(lex)?;
		if (len(v) == 0) {
			return lex.loc: invalid;
		} else {
			return symbol{ v = v, kw = true };
		};
	case '#' =>
		return scanpound(lex)?;
	case '"' =>
		return scanstr(lex)?;
	case =>
		yield;
	};

	// Anything else starts a plain word; push the rune back so that
	// scanword sees it.
	unget(lex, rn);
	return word{ v = scanword(lex)? };
};

// Returns the next rune, taking the pushed-back rune first if there is one,
// and updates loc/prevloc. Invalid UTF-8 in the input is reported as
// [[invalid]] at the current location.
fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
	match (lex.unread) {
	case let rn: rune =>
		lex.prevloc = lex.loc;
		lex.unread = void;
		if (rn == '\n') {
			lex.loc = (lex.loc.0 + 1, 0);
		} else {
			lex.loc.1 += 1;
		};
		return rn;
	case void =>
		yield;
	};

	match (bufio::read_rune(lex.in)) {
	case let rn: rune =>
		lex.prevloc = lex.loc;
		if (rn == '\n') {
			lex.loc = (lex.loc.0 + 1, 0);
		} else {
			lex.loc.1 += 1;
		};
		return rn;
	case io::EOF =>
		return io::EOF;
	case let err: io::error =>
		return err;
	case utf8::invalid =>
		return lex.loc: invalid;
	};
};

// Like [[nextrune]], but skips every rune for which [[isspace]] is true.
fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
	for (true) {
		match (nextrune(lex)?) {
		case let rn: rune =>
			if (isspace(rn)) {
				continue;
			};
			return rn;
		case io::EOF =>
			return io::EOF;
		};
	};
};

// Pushes one rune back so that the next [[nextrune]] returns it, and restores
// the location recorded before it was read. Only a single rune of pushback is
// supported; a second unget without an intervening read fails the assertion.
fn unget(lex: *lexer, rn: rune) void = {
	assert(lex.unread is void);
	lex.unread = rn;
	lex.loc = lex.prevloc;
};

// Reads runes up to (not including) the next whitespace or delimiter rune, or
// up to EOF, and returns them. The result may be empty. The returned string is
// taken from the lexer's scratch buffer, which is reset by the next scan.
fn scanword(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			break;
		};
		if (isspace(rn) || isdelimiter(rn)) {
			// Leave the terminator for the caller.
			unget(lex, rn);
			break;
		};
		memio::appendrune(&lex.strbuf, rn)!;
	};
	return memio::string(&lex.strbuf)!;
};

// Reads the body of a comment; the opening '(' has already been consumed.
// Returns the text up to the closing ')'. A nested '(' is [[invalid]] and EOF
// before ')' is [[unterminated]].
fn scancomment(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			return ("comment", lex.loc.0, lex.loc.1): unterminated;
		};
		switch (rn) {
		case '(' =>
			return lex.loc: invalid;
		case ')' =>
			break;
		case =>
			memio::appendrune(&lex.strbuf, rn)!;
		};
	};
	return memio::string(&lex.strbuf)!;
};

// Reads the body of a string literal; the opening '"' has already been
// consumed. Backslash escapes are decoded through [[scanescape]]. EOF before
// the closing '"' is [[unterminated]].
fn scanstr(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			return ("string literal", lex.loc.0, lex.loc.1): unterminated;
		};
		switch (rn) {
		case '"' =>
			break;
		case '\\' =>
			memio::appendrune(&lex.strbuf, scanescape(lex)?)!;
		case =>
			memio::appendrune(&lex.strbuf, rn)!;
		};
	};
	return memio::string(&lex.strbuf)!;
};

// Reads what follows a '#': "t" yields true, "f" yields false, and a backslash
// introduces a character literal (see [[scanchar]]). Anything else is
// [[invalid]].
fn scanpound(lex: *lexer) (token | error) = {
	const rn = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return ("pound literal", lex.loc.0, lex.loc.1): unterminated;
	};
	switch (rn) {
	case 't' =>
		return true;
	case 'f' =>
		return false;
	case '\\' =>
		return scanchar(lex)?;
	case =>
		return lex.loc: invalid;
	};
};

// Reads a character literal; the leading `#\` has already been consumed.
//
// A single rune followed by whitespace or EOF denotes itself. An 'x' followed
// by anything else is a hexadecimal escape (see [[scanescape2]]). Otherwise
// the rune and the word that follows it must spell one of the names in
// [[longcharnames]]; any other name is [[invalid]].
fn scanchar(lex: *lexer) (rune | error) = {
	static let namestore: [16]u8 = [0...];
	let namebuf = memio::fixed(namestore);

	const rn = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return ("character literal", lex.loc.0, lex.loc.1): unterminated;
	};

	match (nextrune(lex)?) {
	case let rnn: rune =>
		unget(lex, rnn);
		if (isspace(rnn)) {
			return rn;
		};
		if (rn == 'x') {
			return scanescape2(lex);
		};

		const rest = scanword(lex)?;
		// The first rune needs at most 4 bytes of UTF-8. A name that
		// cannot fit in the fixed buffer is longer than every known
		// long name, so reject it here instead of overflowing the
		// buffer and aborting on the writes below.
		if (len(rest) + 4 > len(namestore)) {
			return lex.loc: invalid;
		};
		memio::appendrune(&namebuf, rn)!;
		memio::concat(&namebuf, rest)!;
		const name = memio::string(&namebuf)!;

		for (let i = 0z; i < len(longcharnames); i += 1) {
			if (name == longcharnames[i].0) {
				return longcharnames[i].1;
			};
		};
		return lex.loc: invalid;
	case io::EOF =>
		return rn;
	};
};

// Decodes one escape sequence inside a string literal; the backslash has
// already been consumed. Unknown escapes are [[invalid]].
fn scanescape(lex: *lexer) (rune | error) = {
	const rn = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return ("escape sequence", lex.loc.0, lex.loc.1): unterminated;
	};
	switch (rn) {
	case '0' =>
		return '\0';
	case 'a' =>
		return '\a';
	case 'b' =>
		return '\b';
	case 'e' =>
		return '\x1b';
	case 'f' =>
		return '\f';
	case 'n' =>
		return '\n';
	case 'r' =>
		return '\r';
	case 't' =>
		return '\t';
	case 'v' =>
		return '\v';
	case '\\' =>
		return '\\';
	case '"' =>
		return '"';
	case 'x' =>
		return scanescape2(lex)?;
	case =>
		return lex.loc: invalid;
	};
};

// Handles the `\xhh...;` family of escapes. It lives in its own function
// because both [[scanescape]] and [[scanchar]] use it. Much like
// [[scanescape]] assumes the backslash has already been consumed, this
// function assumes the leading 'x' has been consumed before it is called.
//
// Between one and six hexadecimal digits are accepted, terminated by ';'.
// A non-hex rune, a missing digit, a seventh digit, or a value that is not a
// Unicode scalar value (above U+10FFFF or in the surrogate range) yields
// [[invalid]]; EOF before ';' yields [[unterminated]].
fn scanescape2(lex: *lexer) (rune | error) = {
	let store: [6]u8 = [0...];
	let buf = memio::fixed(store);
	let count = 0z;

	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			return ("escape sequence", lex.loc.0, lex.loc.1): unterminated;
		};
		if (rn == ';') {
			break;
		};
		// Check the limit *before* appending, and only let ASCII hex
		// digits through: each one is exactly one byte, so the fixed
		// buffer can never overflow.
		if (count >= len(store) || !ascii::isxdigit(rn)) {
			return lex.loc: invalid;
		};
		memio::appendrune(&buf, rn)!;
		count += 1;
	};

	if (count == 0) {
		return lex.loc: invalid;
	};

	const digits = memio::string(&buf)!;
	match (strconv::stou32(digits, strconv::base::HEX)) {
	case let codepoint: u32 =>
		// Six hex digits can express values that are not valid runes.
		if (codepoint > 0x10FFFF
				|| (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
			return lex.loc: invalid;
		};
		return codepoint: rune;
	case =>
		return lex.loc: invalid;
	};
};

// Reports whether a rune separates tokens: ASCII whitespace or any rune whose
// Unicode general category is Zs.
fn isspace(rn: rune) bool = {
	if (ascii::isspace(rn)) {
		return true;
	} else {
		switch (unicode::rune_gc(rn)) {
		case unicode::gc::Zs =>
			return true;
		case =>
			return false;
		};
	};
};

// Reports whether a rune is one of the punctuation runes that end a word.
fn isdelimiter(rn: rune) bool = {
	match (strings::index(`()[]{}\:#`, rn)) {
	case size =>
		return true;
	case =>
		return false;
	};
};