From 696d4910b1bf6fb46ca0b4b463f0c57b819eb87f Mon Sep 17 00:00:00 2001
From: Lobo Torres
Date: Wed, 4 Dec 2024 14:56:08 -0300
Subject: [PATCH] lex: add comments, symbols, rune literals and unterminated errors

---
Review notes (everything between "---" and the diffstat is ignored by git-am):

* The payload is Hare (lex.ha, types.ha). This copy of the patch had lost its
  line breaks; lines and tab indentation were reconstructed, and blank context
  lines were placed so that every hunk matches the counts in its @@ header.
  Use `git am --ignore-whitespace` if the tree's whitespace differs.
* What the patch does: next() becomes lex(); the punctuation enum is replaced
  by quotstart/quotend/mapstart/mapend, symbol, comment and rune tokens;
  "( ... )" is lexed as a comment; EOF inside a comment, string or character
  literal is reported through the new `unterminated` error; whitespace
  detection also accepts Unicode category Zs; words stop at delimiters.
* Review fix: the test function is named test_lex instead of lex, because it
  shares a module with the exported lex() that it calls.
* Review fix: tokeq is declared (have, want) but was called (want, have); the
  call now passes the arguments in declaration order.
* Review fix: tokeq checks `is` before `as`, so a token of the wrong type
  yields false instead of aborting on a failed type assertion.
* The column alignment of the declarations added to types.ha is a best guess;
  only the token text is certain.

 lex.ha   | 241 ++++++++++++++++++++++++++++++++++++-------------------
 types.ha |  27 ++++---
 2 files changed, 177 insertions(+), 91 deletions(-)

diff --git a/lex.ha b/lex.ha
index 526a432..7c5d279 100644
--- a/lex.ha
+++ b/lex.ha
@@ -1,10 +1,12 @@
-use ascii; // TODO: maybe use unicode?
+use ascii;
 use bufio;
 use encoding::utf8;
-use fmt;
 use io;
 use memio;
-use os;
+use unicode;
+
+// Testing dependency
+use fmt;
 use strings;
 
 // my cod prob sux :(
@@ -33,7 +35,7 @@ export fn close(lex: *lexer) void = {
 	io::close(&lex.strbuf)!;
 };
 
-export fn next(lex: *lexer) (token | io::EOF | error) = {
+export fn lex(lex: *lexer) (token | io::EOF | error) = {
 	const rn = match (nextrunews(lex)?) {
 	case io::EOF =>
 		return io::EOF;
@@ -43,28 +45,35 @@ export fn next(lex: *lexer) (token | io::EOF | error) = {
 
 	switch (rn) {
 	case '(' =>
-		return punctuation::LEFT_PAREN: token;
+		return comment{ v = scancomment(lex)? };
 	case ')' =>
-		return punctuation::RIGHT_PAREN: token;
+		return lex.loc: invalid;
 	case '[' =>
-		return punctuation::LEFT_SQUARE_BRACKET: token;
+		return quotstart;
 	case ']' =>
-		return punctuation::RIGHT_SQUARE_BRACKET: token;
+		return quotend;
 	case '{' =>
-		return punctuation::LEFT_CURLY_BRACKET: token;
+		return mapstart;
 	case '}' =>
-		return punctuation::RIGHT_CURLY_BRACKET: token;
+		return mapend;
 	case '\\' =>
-		return punctuation::BACKSLASH: token;
-	case ':' =>
-		return punctuation::COLON: token;
-	case '"' =>
-		match (scanstr(lex)?) {
-		case let s: str =>
-			return s;
-		case io::EOF =>
-			return io::EOF;
+		let v = scanword(lex)?;
+		if (len(v) == 0) {
+			return lex.loc: invalid;
+		} else {
+			return symbol{ v = v, kw = false };
 		};
+	case ':' =>
+		let v = scanword(lex)?;
+		if (len(v) == 0) {
+			return lex.loc: invalid;
+		} else {
+			return symbol{ v = v, kw = true };
+		};
+	case '\'' =>
+		return scanchar(lex)?;
+	case '"' =>
+		return scanstr(lex)?;
 	case =>
 		yield;
 	};
@@ -110,7 +119,7 @@ fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
 	for (true) {
 		match (nextrune(lex)?) {
 		case let rn: rune =>
-			if (ascii::isspace(rn)) {
+			if (isspace(rn)) {
 				continue;
 			};
 			return rn;
@@ -135,7 +144,7 @@ fn scanword(lex: *lexer) (str | error) = {
 		case io::EOF =>
 			break;
 		};
-		if (ascii::isspace(rn)) {
+		if (isspace(rn) || isdelimiter(rn)) {
 			unget(lex, rn);
 			break;
 		};
@@ -144,14 +153,37 @@ fn scanword(lex: *lexer) (str | error) = {
 	return memio::string(&lex.strbuf)!;
 };
 
-fn scanstr(lex: *lexer) (str | io::EOF | error) = {
+fn scancomment(lex: *lexer) (str | error) = {
 	memio::reset(&lex.strbuf);
 	for (true) {
 		const rn = match (nextrune(lex)?) {
 		case let rn: rune =>
 			yield rn;
 		case io::EOF =>
+			return ("comment", lex.loc.0, lex.loc.1): unterminated;
+		};
+
+		switch (rn) {
+		case '(' =>
 			return lex.loc: invalid;
+		case ')' =>
+			break;
+		case =>
+			memio::appendrune(&lex.strbuf, rn)!;
+		};
+	};
+
+	return memio::string(&lex.strbuf)!;
+};
+
+fn scanstr(lex: *lexer) (str | error) = {
+	memio::reset(&lex.strbuf);
+	for (true) {
+		const rn = match (nextrune(lex)?) {
+		case let rn: rune =>
+			yield rn;
+		case io::EOF =>
+			return ("string literal", lex.loc.0, lex.loc.1): unterminated;
 		};
 
 		switch (rn) {
@@ -165,6 +197,22 @@ fn scanstr(lex: *lexer) (str | io::EOF | error) = {
 	return memio::string(&lex.strbuf)!;
 };
 
+fn scanchar(lex: *lexer) (rune | error) = {
+	const rn = match (nextrune(lex)?) {
+	case let rn: rune =>
+		yield rn;
+	case io::EOF =>
+		return ("character literal", lex.loc.0, lex.loc.1): unterminated;
+	};
+
+	switch (rn) {
+	case '\\' =>
+		return scanescape(lex)?;
+	case =>
+		return rn;
+	};
+};
+
 fn scanescape(lex: *lexer) (rune | error) = {
 	const rn = match (nextrune(lex)?) {
 	case let rn: rune =>
@@ -178,79 +226,110 @@ fn scanescape(lex: *lexer) (rune | error) = {
 		return '"';
 	case '\\' =>
 		return '\\';
-	case '\n' =>
+	case 'n' =>
 		return '\n';
+	case 't' =>
+		return '\t';
+	case 's' =>
+		return ' ';
 	case =>
 		return lex.loc: invalid;
 	};
 };
 
-// Tests! :)
+fn isspace(rn: rune) bool = {
+	if (ascii::isspace(rn)) {
+		return true;
+	} else {
+		switch (unicode::rune_gc(rn)) {
+		case unicode::gc::Zs =>
+			return true;
+		case =>
+			return false;
+		};
+	};
+};
 
-fn tnext(lex: *lexer) token = {
-	match (next(lex)!) {
-	case let t: token =>
-		return t;
+def delimiters = `()[]{}\:'`;
+fn isdelimiter(rn: rune) bool = {
+	match (strings::index(delimiters, rn)) {
+	case size =>
+		return true;
 	case =>
-		assert(false);
-		return word { v = "" };
+		return false;
 	};
 };
 
-@test fn test_next() void = {
-	let lex = newlexer(&memio::fixed(
-		strings::toutf8("\"hello\" \\greeting def")),
-		"");
-	defer close(&lex);
+@test fn test_lex() void = {
+	const cases: [_](str, []token) = [
+		(
+			`"hello" \greeting def`,
+			[
+				"hello",
+				mksym("greeting"),
+				mkword("def"),
+			]
+		),
+		(
+			`[dup *] (a -- a) \square def`,
+			[
+				quotstart,
+				mkword("dup"),
+				mkword("*"),
+				quotend,
+				mkcomment("a -- a"),
+				mksym("square"),
+				mkword("def"),
+			]
+		),
+		(`'\s`, [' '])
+	];
 
-	let tk = tnext(&lex);
-	assert(tk is str && tk: str == "hello");
-	let tk = tnext(&lex);
-	assert(tk is punctuation && tk: punctuation == punctuation::BACKSLASH);
-	let tk = tnext(&lex);
-	assert(tk is word && (tk: word).v == "greeting");
-	let tk = tnext(&lex);
-	assert(tk is word && (tk: word).v == "def");
+	for (let i = 0z; i < len(cases); i += 1) {
+		const src = strings::toutf8(cases[i].0);
+		const src = memio::fixed(src);
+		const lexer = newlexer(&src, "");
+		defer close(&lexer);
+
+		for (let j = 0z; j < len(cases[i].1); j += 1) {
+			const want = cases[i].1[j];
+			const have = lex(&lexer)! as token;
+			assert(tokeq(have, want));
+		};
+
+		assert(lex(&lexer) is io::EOF);
+	};
 };
 
-@test fn test_nextrune() void = {
-	let lex = newlexer(&memio::fixed(strings::toutf8("a\nb")),
-		"");
-	defer close(&lex);
-
-	assert(nextrune(&lex)! == 'a');
-	assert(nextrune(&lex)! == '\n');
-	assert(nextrune(&lex)! == 'b');
-	assert(lex.loc.0 == 2u && lex.loc.1 == 1u);
-};
-
-@test fn test_nextrunews() void = {
-	let lex = newlexer(&memio::fixed(strings::toutf8("\n a")),
-		"");
-	defer close(&lex);
-
-	assert(nextrunews(&lex)! == 'a');
-	assert(lex.loc.0 == 2u && lex.loc.1 == 2u);
-};
-
-@test fn test_scanword() void = {
-	let lex = newlexer(&memio::fixed(strings::toutf8("string->number .")),
-		"");
-	defer close(&lex);
-
-	assert(scanword(&lex)! == "string->number");
-};
-
-@test fn test_scanstr() void = {
-	let lex = newlexer(&memio::fixed(strings::toutf8("\"\\\\hello\\\"world!\\\n\"")),
-		"");
-	defer close(&lex);
-
-	assert(nextrune(&lex)! == '"');
-	match (scanstr(&lex)!) {
-	case io::EOF =>
-		assert(false);
+fn tokeq(have: token, want: token) bool = {
+	match (want) {
+	case quotstart =>
+		return have is quotstart;
+	case quotend =>
+		return have is quotend;
+	case mapstart =>
+		return have is mapstart;
+	case mapend =>
+		return have is mapend;
+	case let w: word =>
+		return have is word && (have as word).v == w.v;
 	case let s: str =>
-		assert(s == "\\hello\"world!\n");
+		return have is str && (have as str) == s;
+	case let s: symbol =>
+		return have is symbol && (have as symbol).v == s.v;
+	case let c: comment =>
+		return have is comment && (have as comment).v == c.v;
+	case let r: rune =>
+		return have is rune && (have as rune) == r;
 	};
 };
+
+fn mkword(v: const str) word =
+	word{ v = v };
+
+fn mkcomment(v: const str) comment =
+	comment{ v = v };
+
+fn mksym(v: const str, kw: bool = false) symbol =
+	symbol{ v = v, kw = kw };
+
diff --git a/types.ha b/types.ha
index 22d4eef..2217e03 100644
--- a/types.ha
+++ b/types.ha
@@ -1,17 +1,21 @@
 use io;
 use fmt;
 
-export type invalid = !(uint, uint);
-export type error = !(invalid | io::error);
+export type invalid      = !(uint, uint);
+export type unterminated = !(const str, uint, uint);
+export type error        = !(invalid | unterminated | io::error);
 
-export type punctuation = enum uint {
-	LEFT_PAREN, RIGHT_PAREN,
-	LEFT_SQUARE_BRACKET, RIGHT_SQUARE_BRACKET,
-	LEFT_CURLY_BRACKET, RIGHT_CURLY_BRACKET,
-	BACKSLASH, COLON,
-};
-export type word = struct { v: str };
-export type token = (punctuation | word | str);
+export type quotstart = void;
+export type quotend   = void;
+export type mapstart  = void;
+export type mapend    = void;
+
+export type comment = struct { v: str };
+export type word    = struct { v: str };
+export type symbol  = struct { v: str, kw: bool };
+
+export type token = (quotstart | quotend | mapstart | mapend |
+	word | symbol | comment | str | rune);
 
 export fn strerror(err: error) const str = {
 	static let buf: [64]u8 = [0...];
@@ -19,6 +23,9 @@ export fn strerror(err: error) const str = {
 	case let err: invalid =>
 		return fmt::bsprintf(buf,
 			"{}:{}: Invalid token found", err.0, err.1);
+	case let err: unterminated =>
+		return fmt::bsprintf(buf,
+			"{}:{}: Unterminated {} found", err.1, err.2, err.0);
 	case let err: io::error =>
 		return io::strerror(err);
 	};