From 57979aa6fc25486e65242c1770fa480c591e14c2 Mon Sep 17 00:00:00 2001
From: Lobo Torres
Date: Wed, 4 Dec 2024 13:29:11 -0300
Subject: [PATCH] finish first implementation of lexer

---
 Makefile |   4 ++
 lex.ha   | 194 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 types.ha |  22 ++++++-
 3 files changed, 196 insertions(+), 24 deletions(-)
 create mode 100644 Makefile

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..13f2fad
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,4 @@
+.PHONY: test
+
+test:
+	@hare test
diff --git a/lex.ha b/lex.ha
index ade3d2b..526a432 100644
--- a/lex.ha
+++ b/lex.ha
@@ -1,11 +1,11 @@
+use ascii; // TODO: maybe use unicode?
 use bufio;
+use encoding::utf8;
 use fmt;
 use io;
 use memio;
 use os;
-use encoding::utf8;
 use strings;
-use ascii;
 
 // my cod prob sux :(
 
@@ -33,32 +33,90 @@ export fn close(lex: *lexer) void = {
 	io::close(&lex.strbuf)!;
 };
 
-fn updateloc(lex: *lexer, rn: rune) void = {
-	if (rn == '\n') {
-		lex.loc = (lex.loc.0 + 1, 0);
-	} else {
-		lex.loc.1 += 1;
+export fn next(lex: *lexer) (token | io::EOF | error) = {
+	const rn = match (nextrunews(lex)?) {
+	case io::EOF =>
+		return io::EOF;
+	case let rn: rune =>
+		yield rn;
 	};
+
+	switch (rn) {
+	case '(' =>
+		return punctuation::LEFT_PAREN: token;
+	case ')' =>
+		return punctuation::RIGHT_PAREN: token;
+	case '[' =>
+		return punctuation::LEFT_SQUARE_BRACKET: token;
+	case ']' =>
+		return punctuation::RIGHT_SQUARE_BRACKET: token;
+	case '{' =>
+		return punctuation::LEFT_CURLY_BRACKET: token;
+	case '}' =>
+		return punctuation::RIGHT_CURLY_BRACKET: token;
+	case '\\' =>
+		return punctuation::BACKSLASH: token;
+	case ':' =>
+		return punctuation::COLON: token;
+	case '"' =>
+		match (scanstr(lex)?) {
+		case let s: str =>
+			return s;
+		case io::EOF =>
+			return io::EOF;
+		};
+	case =>
+		yield;
+	};
+
+	unget(lex, rn);
+	return word{ v = scanword(lex)? };
 };
 
-fn nextrune(lex: *lexer) (rune | io::error | io::EOF | utf8::invalid) = {
+fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
 	match (lex.unread) {
 	case let rn: rune =>
 		lex.prevloc = lex.loc;
 		lex.unread = void;
-		updateloc(lex, rn);
+		if (rn == '\n') {
+			lex.loc = (lex.loc.0 + 1, 0);
+		} else {
+			lex.loc.1 += 1;
+		};
 		return rn;
 	case void =>
 		yield;
 	};
 
-	match (bufio::read_rune(lex.in)?) {
+	match (bufio::read_rune(lex.in)) {
 	case let rn: rune =>
 		lex.prevloc = lex.loc;
-		updateloc(lex, rn);
+		if (rn == '\n') {
+			lex.loc = (lex.loc.0 + 1, 0);
+		} else {
+			lex.loc.1 += 1;
+		};
 		return rn;
 	case io::EOF =>
 		return io::EOF;
+	case let err: io::error =>
+		return err;
+	case utf8::invalid =>
+		return lex.loc: invalid;
+	};
+};
+
+fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
+	for (true) {
+		match (nextrune(lex)?) {
+		case let rn: rune =>
+			if (ascii::isspace(rn)) {
+				continue;
+			};
+			return rn;
+		case io::EOF =>
+			return io::EOF;
+		};
 	};
 };
 
@@ -68,22 +126,93 @@ fn unget(lex: *lexer, rn: rune) void = {
 	lex.unread = rn;
 	lex.loc = lex.prevloc;
 };
 
-fn skipws(lex: *lexer) (void | io::EOF | io::error | utf8::invalid) = {
+fn scanword(lex: *lexer) (str | error) = {
+	memio::reset(&lex.strbuf);
 	for (true) {
-		match (nextrune(lex)?) {
-		case io::EOF => return io::EOF;
+		const rn = match (nextrune(lex)?) {
 		case let rn: rune =>
-			if (!ascii::isspace(rn)) {
-				unget(lex, rn);
-				return;
-			};
+			yield rn;
+		case io::EOF =>
+			break;
 		};
+		if (ascii::isspace(rn)) {
+			unget(lex, rn);
+			break;
+		};
+		memio::appendrune(&lex.strbuf, rn)!;
+	};
+	return memio::string(&lex.strbuf)!;
+};
+
+fn scanstr(lex: *lexer) (str | io::EOF | error) = {
+	memio::reset(&lex.strbuf);
+	for (true) {
+		const rn = match (nextrune(lex)?) {
+		case let rn: rune =>
+			yield rn;
+		case io::EOF =>
+			return lex.loc: invalid;
+		};
+
+		switch (rn) {
+		case '"' => break;
+		case '\\' =>
+			memio::appendrune(&lex.strbuf, scanescape(lex)?)!;
+		case =>
+			memio::appendrune(&lex.strbuf, rn)!;
+		};
+	};
+	return memio::string(&lex.strbuf)!;
+};
+
+fn scanescape(lex: *lexer) (rune | error) = {
+	const rn = match (nextrune(lex)?) {
+	case let rn: rune =>
+		yield rn;
+	case io::EOF =>
+		return lex.loc: invalid;
+	};
+
+	switch (rn) {
+	case '"' =>
+		return '"';
+	case '\\' =>
+		return '\\';
+	case '\n' =>
+		return '\n';
+	case =>
+		return lex.loc: invalid;
 	};
 };
 
-
 // Tests! :)
 
+fn tnext(lex: *lexer) token = {
+	match (next(lex)!) {
+	case let t: token =>
+		return t;
+	case =>
+		assert(false);
+		return word { v = "" };
+	};
+};
+
+@test fn test_next() void = {
+	let lex = newlexer(&memio::fixed(
+		strings::toutf8("\"hello\" \\greeting def")),
+		"");
+	defer close(&lex);
+
+	let tk = tnext(&lex);
+	assert(tk is str && tk: str == "hello");
+	let tk = tnext(&lex);
+	assert(tk is punctuation && tk: punctuation == punctuation::BACKSLASH);
+	let tk = tnext(&lex);
+	assert(tk is word && (tk: word).v == "greeting");
+	let tk = tnext(&lex);
+	assert(tk is word && (tk: word).v == "def");
+};
+
 @test fn test_nextrune() void = {
 	let lex = newlexer(&memio::fixed(strings::toutf8("a\nb")), "");
 	defer close(&lex);
@@ -95,12 +224,33 @@ fn skipws(lex: *lexer) (void | io::EOF | io::error | utf8::invalid) = {
 	assert(lex.loc.0 == 2u && lex.loc.1 == 1u);
 };
 
-@test fn test_skipws() void = {
+@test fn test_nextrunews() void = {
 	let lex = newlexer(&memio::fixed(strings::toutf8("\n a")), "");
 	defer close(&lex);
 
-	skipws(&lex)!;
-	assert(nextrune(&lex)! == 'a');
+	assert(nextrunews(&lex)! == 'a');
 	assert(lex.loc.0 == 2u && lex.loc.1 == 2u);
 };
+
+@test fn test_scanword() void = {
+	let lex = newlexer(&memio::fixed(strings::toutf8("string->number .")),
+		"");
+	defer close(&lex);
+
+	assert(scanword(&lex)! == "string->number");
+};
+
+@test fn test_scanstr() void = {
+	let lex = newlexer(&memio::fixed(strings::toutf8("\"\\\\hello\\\"world!\\\n\"")),
+		"");
+	defer close(&lex);
+
+	assert(nextrune(&lex)! == '"');
+	match (scanstr(&lex)!) {
+	case io::EOF =>
+		assert(false);
+	case let s: str =>
+		assert(s == "\\hello\"world!\n");
+	};
+};
diff --git a/types.ha b/types.ha
index ee64ac6..22d4eef 100644
--- a/types.ha
+++ b/types.ha
@@ -1,7 +1,25 @@
-export type punct = enum uint {
+use io;
+use fmt;
+
+export type invalid = !(uint, uint);
+export type error = !(invalid | io::error);
+
+export type punctuation = enum uint {
 	LEFT_PAREN, RIGHT_PAREN,
 	LEFT_SQUARE_BRACKET, RIGHT_SQUARE_BRACKET,
 	LEFT_CURLY_BRACKET, RIGHT_CURLY_BRACKET,
 	BACKSLASH, COLON,
 };
-export type token = (punct | str | f64 | bool);
+export type word = struct { v: str };
+export type token = (punctuation | word | str);
+
+export fn strerror(err: error) const str = {
+	static let buf: [64]u8 = [0...];
+	match (err) {
+	case let err: invalid =>
+		return fmt::bsprintf(buf,
+			"{}:{}: Invalid token found", err.0, err.1);
+	case let err: io::error =>
+		return io::strerror(err);
+	};
+};
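
Not part of the patch: a minimal, hypothetical usage sketch showing how the exported API above fits together. It assumes newlexer(src: io::handle, path: str) as used by the tests, that this driver lives in the same module, and that reading from os::stdin under the label "<stdin>" is acceptable; the token handling is illustrative only.

use fmt;
use io;
use os;

export fn main() void = {
	// Assumed constructor signature, inferred from the tests:
	// newlexer(src: io::handle, path: str).
	let lex = newlexer(os::stdin, "<stdin>");
	defer close(&lex);

	for (true) {
		// next() returns a token, io::EOF, or an error; `!` aborts on
		// error here, a real caller would match it and report the
		// failure via strerror().
		match (next(&lex)!) {
		case io::EOF =>
			break;
		case let w: word =>
			fmt::printfln("word: {}", w.v)!;
		case let s: str =>
			fmt::printfln("string: {}", s)!;
		case let p: punctuation =>
			fmt::printfln("punctuation: {}", p: uint)!;
		};
	};
};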