From 57979aa6fc25486e65242c1770fa480c591e14c2 Mon Sep 17 00:00:00 2001
From: Lobo Torres
Date: Wed, 4 Dec 2024 13:29:11 -0300
Subject: [PATCH] finish first implementation of lexer

---
 Makefile |   4 ++
 lex.ha   | 194 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 types.ha |  22 ++++++-
 3 files changed, 196 insertions(+), 24 deletions(-)
 create mode 100644 Makefile

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..13f2fad
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,4 @@
+.PHONY: test
+
+test:
+	@hare test
diff --git a/lex.ha b/lex.ha
index ade3d2b..526a432 100644
--- a/lex.ha
+++ b/lex.ha
@@ -1,11 +1,11 @@
+use ascii; // TODO: maybe use unicode?
 use bufio;
+use encoding::utf8;
 use fmt;
 use io;
 use memio;
 use os;
-use encoding::utf8;
 use strings;
-use ascii;
 
 // my cod prob sux :(
 
@@ -33,32 +33,90 @@ export fn close(lex: *lexer) void = {
 	io::close(&lex.strbuf)!;
 };
 
-fn updateloc(lex: *lexer, rn: rune) void = {
-	if (rn == '\n') {
-		lex.loc = (lex.loc.0 + 1, 0);
-	} else {
-		lex.loc.1 += 1;
+export fn next(lex: *lexer) (token | io::EOF | error) = {
+	const rn = match (nextrunews(lex)?) {
+	case io::EOF =>
+		return io::EOF;
+	case let rn: rune =>
+		yield rn;
 	};
+
+	switch (rn) {
+	case '(' =>
+		return punctuation::LEFT_PAREN: token;
+	case ')' =>
+		return punctuation::RIGHT_PAREN: token;
+	case '[' =>
+		return punctuation::LEFT_SQUARE_BRACKET: token;
+	case ']' =>
+		return punctuation::RIGHT_SQUARE_BRACKET: token;
+	case '{' =>
+		return punctuation::LEFT_CURLY_BRACKET: token;
+	case '}' =>
+		return punctuation::RIGHT_CURLY_BRACKET: token;
+	case '\\' =>
+		return punctuation::BACKSLASH: token;
+	case ':' =>
+		return punctuation::COLON: token;
+	case '"' =>
+		match (scanstr(lex)?) {
+		case let s: str =>
+			return s;
+		case io::EOF =>
+			return io::EOF;
+		};
+	case =>
+		yield;
+	};
+
+	unget(lex, rn);
+	return word{ v = scanword(lex)? };
 };
 
-fn nextrune(lex: *lexer) (rune | io::error | io::EOF | utf8::invalid) = {
+fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
 	match (lex.unread) {
 	case let rn: rune =>
 		lex.prevloc = lex.loc;
 		lex.unread = void;
-		updateloc(lex, rn);
+		if (rn == '\n') {
+			lex.loc = (lex.loc.0 + 1, 0);
+		} else {
+			lex.loc.1 += 1;
+		};
 		return rn;
 	case void =>
 		yield;
 	};
 
-	match (bufio::read_rune(lex.in)?) {
+	match (bufio::read_rune(lex.in)) {
 	case let rn: rune =>
 		lex.prevloc = lex.loc;
-		updateloc(lex, rn);
+		if (rn == '\n') {
+			lex.loc = (lex.loc.0 + 1, 0);
+		} else {
+			lex.loc.1 += 1;
+		};
 		return rn;
 	case io::EOF =>
 		return io::EOF;
+	case let err: io::error =>
+		return err;
+	case utf8::invalid =>
+		return lex.loc: invalid;
+	};
+};
+
+fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
+	for (true) {
+		match (nextrune(lex)?) {
+		case let rn: rune =>
+			if (ascii::isspace(rn)) {
+				continue;
+			};
+			return rn;
+		case io::EOF =>
+			return io::EOF;
+		};
 	};
 };
 
@@ -68,22 +126,93 @@ fn unget(lex: *lexer, rn: rune) void = {
 	lex.unread = rn;
 	lex.loc = lex.prevloc;
 };
 
-fn skipws(lex: *lexer) (void | io::EOF | io::error | utf8::invalid) = {
+fn scanword(lex: *lexer) (str | error) = {
+	memio::reset(&lex.strbuf);
 	for (true) {
-		match (nextrune(lex)?) {
-		case io::EOF => return io::EOF;
+		const rn = match (nextrune(lex)?) {
 		case let rn: rune =>
-			if (!ascii::isspace(rn)) {
-				unget(lex, rn);
-				return;
-			};
+			yield rn;
+		case io::EOF =>
+			break;
 		};
+		if (ascii::isspace(rn)) {
+			unget(lex, rn);
+			break;
+		};
+		memio::appendrune(&lex.strbuf, rn)!;
+	};
+	return memio::string(&lex.strbuf)!;
+};
+
+fn scanstr(lex: *lexer) (str | io::EOF | error) = {
+	memio::reset(&lex.strbuf);
+	for (true) {
+		const rn = match (nextrune(lex)?) {
+		case let rn: rune =>
+			yield rn;
+		case io::EOF =>
+			return lex.loc: invalid;
+		};
+
+		switch (rn) {
+		case '"' => break;
+		case '\\' =>
+			memio::appendrune(&lex.strbuf, scanescape(lex)?)!;
+		case =>
+			memio::appendrune(&lex.strbuf, rn)!;
+		};
+	};
+	return memio::string(&lex.strbuf)!;
+};
+
+fn scanescape(lex: *lexer) (rune | error) = {
+	const rn = match (nextrune(lex)?) {
+	case let rn: rune =>
+		yield rn;
+	case io::EOF =>
+		return lex.loc: invalid;
+	};
+
+	switch (rn) {
+	case '"' =>
+		return '"';
+	case '\\' =>
+		return '\\';
+	case '\n' =>
+		return '\n';
+	case =>
+		return lex.loc: invalid;
 	};
 };
 
-
 // Tests! :)
 
+fn tnext(lex: *lexer) token = {
+	match (next(lex)!) {
+	case let t: token =>
+		return t;
+	case =>
+		assert(false);
+		return word { v = "" };
+	};
+};
+
+@test fn test_next() void = {
+	let lex = newlexer(&memio::fixed(
+		strings::toutf8("\"hello\" \\greeting def")),
+		"");
+	defer close(&lex);
+
+	let tk = tnext(&lex);
+	assert(tk is str && tk: str == "hello");
+	let tk = tnext(&lex);
+	assert(tk is punctuation && tk: punctuation == punctuation::BACKSLASH);
+	let tk = tnext(&lex);
+	assert(tk is word && (tk: word).v == "greeting");
+	let tk = tnext(&lex);
+	assert(tk is word && (tk: word).v == "def");
+};
+
 @test fn test_nextrune() void = {
 	let lex = newlexer(&memio::fixed(strings::toutf8("a\nb")), "");
 	defer close(&lex);
@@ -95,12 +224,33 @@ fn skipws(lex: *lexer) (void | io::EOF | io::error | utf8::invalid) = {
 	assert(lex.loc.0 == 2u && lex.loc.1 == 1u);
 };
 
-@test fn test_skipws() void = {
+@test fn test_nextrunews() void = {
 	let lex = newlexer(&memio::fixed(strings::toutf8("\n a")), "");
 	defer close(&lex);
 
-	skipws(&lex)!;
-	assert(nextrune(&lex)! == 'a');
+	assert(nextrunews(&lex)! == 'a');
 	assert(lex.loc.0 == 2u && lex.loc.1 == 2u);
 };
+
+@test fn test_scanword() void = {
+	let lex = newlexer(&memio::fixed(strings::toutf8("string->number .")),
+		"");
+	defer close(&lex);
+
+	assert(scanword(&lex)! == "string->number");
+};
+
+@test fn test_scanstr() void = {
+	let lex = newlexer(&memio::fixed(strings::toutf8("\"\\\\hello\\\"world!\\\n\"")),
+		"");
+	defer close(&lex);
+
+	assert(nextrune(&lex)! == '"');
+	match (scanstr(&lex)!) {
+	case io::EOF =>
+		assert(false);
+	case let s: str =>
+		assert(s == "\\hello\"world!\n");
+	};
+};
diff --git a/types.ha b/types.ha
index ee64ac6..22d4eef 100644
--- a/types.ha
+++ b/types.ha
@@ -1,7 +1,25 @@
-export type punct = enum uint {
+use io;
+use fmt;
+
+export type invalid = !(uint, uint);
+export type error = !(invalid | io::error);
+
+export type punctuation = enum uint {
 	LEFT_PAREN, RIGHT_PAREN,
 	LEFT_SQUARE_BRACKET, RIGHT_SQUARE_BRACKET,
 	LEFT_CURLY_BRACKET, RIGHT_CURLY_BRACKET,
 	BACKSLASH, COLON,
 };
-export type token = (punct | str | f64 | bool);
+export type word = struct { v: str };
+export type token = (punctuation | word | str);
+
+export fn strerror(err: error) const str = {
+	static let buf: [64]u8 = [0...];
+	match (err) {
+	case let err: invalid =>
+		return fmt::bsprintf(buf,
+			"{}:{}: Invalid token found", err.0, err.1);
+	case let err: io::error =>
+		return io::strerror(err);
+	};
+};
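
Not part of the patch: a minimal, hypothetical usage sketch showing how the exported API above fits together. It assumes newlexer(src: io::handle, path: str) as used by the tests, that this driver lives in the same module, and that reading from os::stdin under the label "<stdin>" is acceptable; the token handling is illustrative only.

use fmt;
use io;
use os;

export fn main() void = {
	// Assumed constructor signature, inferred from the tests:
	// newlexer(src: io::handle, path: str).
	let lex = newlexer(os::stdin, "<stdin>");
	defer close(&lex);

	for (true) {
		// next() returns a token, io::EOF, or an error; `!` aborts on
		// error here, a real caller would match it and report the
		// failure via strerror().
		match (next(&lex)!) {
		case io::EOF =>
			break;
		case let w: word =>
			fmt::printfln("word: {}", w.v)!;
		case let s: str =>
			fmt::printfln("string: {}", s)!;
		case let p: punctuation =>
			fmt::printfln("punctuation: {}", p: uint)!;
		};
	};
};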