From d7b6b380fe6d13af4455759cbb14c9cb2e6b474e Mon Sep 17 00:00:00 2001
From: Lobo Torres <lobo@quiltro.org>
Date: Thu, 5 Dec 2024 22:06:21 -0300
Subject: [PATCH] lex: rework types

---
 kojote/lex/+test.ha                 | 131 ++++++++++++++++++++++++++++
 parse/lex.ha => kojote/lex/lexer.ha | 130 +++++++++++++++------------
 kojote/lex/types.ha                 |  51 +++++++++++
 parse/+test/lexer.ha                | 117 -------------------------
 parse/types.ha                      |  32 -------
 test.kj                             |   7 +-
 6 files changed, 258 insertions(+), 210 deletions(-)
 create mode 100644 kojote/lex/+test.ha
 rename parse/lex.ha => kojote/lex/lexer.ha (65%)
 create mode 100644 kojote/lex/types.ha
 delete mode 100644 parse/+test/lexer.ha
 delete mode 100644 parse/types.ha
diff --git a/kojote/lex/+test.ha b/kojote/lex/+test.ha
new file mode 100644
index 0000000..5ece579
--- /dev/null
+++ b/kojote/lex/+test.ha
@@ -0,0 +1,131 @@
+use memio;
+use fmt;
+use strings;
+use io;
+
+type dummytoken = (ty, value); // an expected token, minus its location
+
+@test fn next() void = {
+	const cases: [_](str, []dummytoken) = [ // (source, expected tokens)
+		(`"hello" \greeting def`,
+			[
+				(ty::STRING, "hello"),
+				(ty::SYMBOL, "greeting"),
+				(ty::WORD, "def")
+			]),
+		(`[dup *] (a -- a) \square def`,
+			[
+				(ty::QUOT_START, void),
+				(ty::WORD, "dup"),
+				(ty::WORD, "*"),
+				(ty::QUOT_END, void),
+				(ty::COMMENT, "a -- a"),
+				(ty::SYMBOL, "square"),
+				(ty::WORD, "def"),
+			]),
+		(`#t #f`,
+			[
+				(ty::BOOLEAN, true),
+				(ty::BOOLEAN, false),
+			]),
+		(`#\a #\space #\nul`,
+			[
+				(ty::CHAR, 'a'),
+				(ty::CHAR, ' '),
+				(ty::CHAR, '\0'),
+			]),
+		(`"\x0a;\x2014;\x2f9f4;"`,
+			[
+				(ty::STRING, "\n—嶲"),
+			]),
+		(`#\x #\x0a; #\x2014; #\x2f9f4;`,
+			[
+				(ty::CHAR, 'x'),
+				(ty::CHAR, '\n'),
+				(ty::CHAR, '—'),
+				(ty::CHAR, '嶲'),
+			]),
+	];
+
+	for (let i = 0z; i < len(cases); i += 1) {
+		const src = strings::toutf8(cases[i].0);
+		const src = memio::fixed(src); // stream over the source bytes
+		const lexer = newlexer(&src, "<string>");
+		defer close(&lexer);
+
+		for (let j = 0z; j < len(cases[i].1); j += 1) {
+			const want = cases[i].1[j];
+			const have = match (next(&lexer)) {
+			case let tok: token =>
+				yield tok;
+			case io::EOF =>
+				assert(false, "reached EOF");
+				return;
+			case let err: error =>
+				assert(false, strerror(err));
+				return;
+			};
+
+			if (!tokeq(have, want)) { // print both sides, then fail
+				fmt::printf("Expected:\n\t")!;
+				fmt::println(tokpp(want.0, want.1))!;
+				fmt::printf("Got:\n\t")!;
+				fmt::println(tokpp(have.0, have.1))!;
+				assert(false);
+			};
+		};
+
+		assert(next(&lexer) is io::EOF); // no tokens may be left over
+	};
+};
+
+// Reports whether a lexed token matches an expected (type, value) pair.
+fn tokeq(have: token, want: dummytoken) bool =
+	have.0 == want.0 && match (have.1) {
+	case void => yield want.1 is void;
+	case let s: str =>
+		yield want.1 is str && (want.1 as str) == s;
+	case let r: rune =>
+		yield want.1 is rune && (want.1 as rune) == r;
+	case let b: bool =>
+		yield want.1 is bool && (want.1 as bool) == b;
+	};
+
+// Renders a token type and value roughly as they appear in source text, for
+// use in test failure output. Values are expected to match the type (a str for
+// WORD, a rune for CHAR, ...). The result may point into a static buffer that
+// the next call overwrites.
+fn tokpp(ty: ty, value: value) const str = {
+	static let buf: [128]u8 = [0...];
+
+	switch (ty) {
+	case ty::QUOT_START => return "[";
+	case ty::QUOT_END => return "]";
+	case ty::MAP_START => return "{";
+	case ty::MAP_END => return "}";
+	case ty::WORD, ty::NUMBER =>
+		// These print exactly as their string value.
+		return value as str;
+	case ty::COMMENT =>
+		return fmt::bsprintf(buf, "({})", value as str);
+	case ty::SYMBOL =>
+		return fmt::bsprintf(buf, "\\{}", value as str);
+	case ty::KEYWORD =>
+		return fmt::bsprintf(buf, ":{}", value as str);
+	case ty::STRING =>
+		return fmt::bsprintf(buf, "\"{}\"", value as str);
+	case ty::BOOLEAN =>
+		const letter = if (value as bool) 't' else 'f';
+		return fmt::bsprintf(buf, "#{}", letter);
+	case ty::CHAR =>
+		const ch = value as rune;
+		// Prefer the long name when the table has one for this rune.
+		for (let i = 0z; i < len(longcharnames); i += 1) {
+			const entry = longcharnames[i];
+			if (entry.1 == ch) {
+				return fmt::bsprintf(buf, "#\\{}", entry.0);
+			};
+		};
+		return fmt::bsprintf(buf, "#\\{}", ch);
+	};
+};
diff --git a/parse/lex.ha b/kojote/lex/lexer.ha
similarity index 65%
rename from parse/lex.ha
rename to kojote/lex/lexer.ha
index b87b946..93cc318 100644
--- a/parse/lex.ha
+++ b/kojote/lex/lexer.ha
@@ -27,16 +27,19 @@ def longcharnames: [_](str, rune) = [
 export type lexer = struct {
 	in: io::handle,
 	strbuf: memio::stream,
+	commentbuf: memio::stream,
 	path: str,
 	loc: (uint, uint),
 	prevloc: (uint, uint),
 	unread: (rune | void),
 };
 
+// Creates a [[lexer]] reading from the given [[io::handle]]; path is borrowed.
 export fn newlexer(in: io::handle, path: str) lexer = {
 	return lexer {
 		in = in,
 		strbuf = memio::dynamic(),
+		commentbuf = memio::dynamic(),
 		path = path,
 		loc = (1, 0),
 		unread = void,
@@ -44,11 +47,14 @@ export fn newlexer(in: io::handle, path: str) lexer = {
 	};
 };
 
+// Frees resources associated with a [[lexer]].
 export fn close(lex: *lexer) void = {
 	io::close(&lex.strbuf)!;
+	io::close(&lex.commentbuf)!;
 };
 
-export fn lex(lex: *lexer) (token | io::EOF | error) = {
+// Returns the next token from the lexer.
+export fn next(lex: *lexer) (token | io::EOF | error) = {
 	const rn = match (nextrunews(lex)?) {
 	case io::EOF =>
 		return io::EOF;
@@ -58,30 +64,30 @@ export fn lex(lex: *lexer) (token | io::EOF | error) = {
 
 	switch (rn) {
 	case '(' =>
-		return comment{ v = scancomment(lex)? };
+		return mktoken(lex, ty::COMMENT, scancomment(lex)?);
 	case ')' =>
-		return lex.loc: invalid;
+		return mkerror(lex, "invalid token");
 	case '[' =>
-		return quotstart;
+		return mktoken(lex, ty::QUOT_START, void);
 	case ']' =>
-		return quotend;
+		return mktoken(lex, ty::QUOT_END, void);
 	case '{' =>
-		return mapstart;
+		return mktoken(lex, ty::MAP_START, void);
 	case '}' =>
-		return mapend;
+		return mktoken(lex, ty::MAP_END, void);
 	case '\\' =>
 		let v = scanword(lex)?;
 		if (len(v) == 0) {
-			return lex.loc: invalid;
+			return mkerror(lex, "invalid symbol literal");
 		} else {
-			return symbol{ v = v, kw = false };
+			return mktoken(lex, ty::SYMBOL, v);
 		};
 	case ':' =>
 		let v = scanword(lex)?;
 		if (len(v) == 0) {
-			return lex.loc: invalid;
+			return mkerror(lex, "invalid keyword");
 		} else {
-			return symbol{ v = v, kw = true };
+			return mktoken(lex, ty::KEYWORD, v);
 		};
 	case '#' =>
 		return scanpound(lex)?;
@@ -92,7 +98,7 @@ export fn lex(lex: *lexer) (token | io::EOF | error) = {
 	};
 
 	unget(lex, rn);
-	return word{ v = scanword(lex)? };
+	return mktoken(lex, ty::WORD, scanword(lex)?);
 };
 
 fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
@@ -124,7 +130,7 @@ fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
 	case let err: io::error =>
 		return err;
 	case utf8::invalid =>
-		return lex.loc: invalid;
+		return mkerror(lex, "invalid UTF-8 sequence");
 	};
 };
 
@@ -132,9 +138,7 @@ fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
 	for (true) {
 		match (nextrune(lex)?) {
 		case let rn: rune =>
-			if (isspace(rn)) {
-				continue;
-			};
+			if (isspace(rn)) continue;
 			return rn;
 		case io::EOF =>
 			return io::EOF;
@@ -167,36 +171,36 @@ fn scanword(lex: *lexer) (str | error) = {
 };
 
 fn scancomment(lex: *lexer) (str | error) = {
-	memio::reset(&lex.strbuf);
+	memio::reset(&lex.commentbuf);
 	for (true) {
 		const rn = match (nextrune(lex)?) {
 		case let rn: rune =>
 			yield rn;
 		case io::EOF =>
-			return ("comment", lex.loc.0, lex.loc.1): unterminated;
+			return mkerror(lex, "unterminated comment");
 		};
 
 		switch (rn) {
 		case '(' =>
-			return lex.loc: invalid;
+			return mkerror(lex, "nested comments are not allowed");
 		case ')' =>
 			break;
 		case =>
-			memio::appendrune(&lex.strbuf, rn)!;
+			memio::appendrune(&lex.commentbuf, rn)!;
 		};
 	};
 
-	return memio::string(&lex.strbuf)!;
+	return memio::string(&lex.commentbuf)!;
 };
 
-fn scanstr(lex: *lexer) (str | error) = {
+fn scanstr(lex: *lexer) (token | error) = {
 	memio::reset(&lex.strbuf);
 	for (true) {
 		const rn = match (nextrune(lex)?) {
 		case let rn: rune =>
 			yield rn;
 		case io::EOF =>
-			return ("string literal", lex.loc.0, lex.loc.1): unterminated;
+			return mkerror(lex, "unterminated string literal");
 		};
 
 		switch (rn) {
@@ -207,7 +211,7 @@ fn scanstr(lex: *lexer) (str | error) = {
 			memio::appendrune(&lex.strbuf, rn)!;
 		};
 	};
-	return memio::string(&lex.strbuf)!;
+	return mktoken(lex, ty::STRING, memio::string(&lex.strbuf)!);
 };
 
 fn scanpound(lex: *lexer) (token | error) = {
@@ -215,22 +219,22 @@ fn scanpound(lex: *lexer) (token | error) = {
 	case let rn: rune =>
 		yield rn;
 	case io::EOF =>
-		return ("pound literal", lex.loc.0, lex.loc.1): unterminated;
+		return mkerror(lex, "unterminated pound literal");
 	};
 
 	switch (rn) {
 	case 't' =>
-		return true;
+		return mktoken(lex, ty::BOOLEAN, true);
 	case 'f' =>
-		return false;
+		return mktoken(lex, ty::BOOLEAN, false);
 	case '\\' =>
-		return scanchar(lex)?;
+		return scanchar(lex);
 	case =>
-		return lex.loc: invalid;
+		return mkerror(lex, "invalid pound literal");
 	};
 };
 
-fn scanchar(lex: *lexer) (rune | error) = {
+fn scanchar(lex: *lexer) (token | error) = {
 	static let namebuf: [16]u8 = [0...];
 	let namebuf = memio::fixed(namebuf);
 
@@ -238,31 +242,35 @@ fn scanchar(lex: *lexer) (rune | error) = {
 	case let rn: rune =>
 		yield rn;
 	case io::EOF =>
-		return ("character literal", lex.loc.0, lex.loc.1): unterminated;
+		return mkerror(lex, "unterminated character literal");
 	};
 
+	let ret: rune = '\0';
+
 	match (nextrune(lex)?) {
-	case let rnn: rune =>
-		unget(lex, rnn);
-		if (isspace(rnn)) {
-			return rn;
-		} else {
-			if (rn == 'x') {
-				return scanescape2(lex);
-			} else {
-				memio::appendrune(&namebuf, rn)!;
-				memio::concat(&namebuf, scanword(lex)?)!;
-				const name = memio::string(&namebuf)!;
-					for (let i = 0z; i < len(longcharnames); i += 1) {
-					if (name == longcharnames[i].0) {
-						return longcharnames[i].1;
-					};
-				};
-				return lex.loc: invalid;
-			};
-		};
 	case io::EOF =>
-		return rn;
+		return mktoken(lex, ty::CHAR, rn);
+	case let next: rune =>
+		unget(lex, next);
+
+		if (isspace(next)) {
+			return mktoken(lex, ty::CHAR, rn);
+		};
+
+		if (rn == 'x') {
+			return mktoken(lex, ty::CHAR, scanescape2(lex)?);
+		} else {
+			memio::appendrune(&namebuf, rn)!;
+			memio::concat(&namebuf, scanword(lex)?)!;
+			const name = memio::string(&namebuf)!;
+			for (let i = 0z; i < len(longcharnames); i += 1) {
+				if (name == longcharnames[i].0) {
+					return mktoken(lex, ty::CHAR,
+						longcharnames[i].1);
+				};
+			};
+			return mkerror(lex, "invalid named character literal");
+		};
 	};
 };
 
@@ -271,7 +279,7 @@ fn scanescape(lex: *lexer) (rune | error) = {
 	case let rn: rune =>
 		yield rn;
 	case io::EOF =>
-		return ("escape sequence", lex.loc.0, lex.loc.1): unterminated;
+		return mkerror(lex, "unterminated character escape");
 	};
 
 	switch (rn) {
@@ -288,7 +296,7 @@ fn scanescape(lex: *lexer) (rune | error) = {
 	case '"' => return '"';
 	case 'x' => return scanescape2(lex)?;
 	case =>
-		return lex.loc: invalid;
+		return mkerror(lex, "invalid character escape");
 	};
 };
 
@@ -304,7 +312,7 @@ fn scanescape2(lex: *lexer) (rune | error) = {
 	case let rn: rune =>
 		yield rn;
 	case io::EOF =>
-		return ("escape sequence", lex.loc.0, lex.loc.1): unterminated;
+		return mkerror(lex, "unterminated character escape");
 	};
 
 	const buf: [6]u8 = [0...];
@@ -317,11 +325,11 @@ fn scanescape2(lex: *lexer) (rune | error) = {
 		case let rn: rune =>
 			yield rn;
 		case io::EOF =>
-			return ("escape sequence", lex.loc.0, lex.loc.1): unterminated;
+			return mkerror(lex, "unterminated escape sequence");
 		};
 
 		if (count > 6) {
-			return lex.loc: invalid;
+			return mkerror(lex, "invalid escape sequence");
 		} else if (rn == ';') {
 			break;
 		} else {
@@ -337,10 +345,16 @@ fn scanescape2(lex: *lexer) (rune | error) = {
 	case let codepoint: u32 =>
 		return codepoint: rune;
 	case =>
-		return lex.loc: invalid;
+		return mkerror(lex, "invalid escape sequence");
 	};
 };
 
+fn mktoken(lex: *lexer, ty: ty, value: value) token =
+	(ty, value, location{ path = lex.path, line = lex.loc.0, column = lex.loc.1 });
+
+fn mkerror(lex: *lexer, msg: const str) syntax =
+	(location{ path = lex.path, line = lex.loc.0, column = lex.loc.1 }, msg);
+
 fn isspace(rn: rune) bool = {
 	if (ascii::isspace(rn)) {
 		return true;
@@ -355,7 +369,7 @@ fn isspace(rn: rune) bool = {
 };
 
 fn isdelimiter(rn: rune) bool = {
-	match (strings::index(`()[]{}\:#`, rn)) {
+	match (strings::index(`()[]{}`, rn)) {
 	case size =>
 		return true;
 	case =>
diff --git a/kojote/lex/types.ha b/kojote/lex/types.ha
new file mode 100644
index 0000000..3311e72
--- /dev/null
+++ b/kojote/lex/types.ha
@@ -0,0 +1,51 @@
+use io;
+use fmt;
+
+// A syntax error: where it occurred and a message describing it.
+export type syntax = !(location, str);
+
+// Any error the lexer can return: an [[io::error]] or a [[syntax]] error.
+export type error = !(io::error | syntax);
+
+// The kind of a [[token]]. Each variant is shown with its source syntax.
+export type ty = enum uint {
+	QUOT_START, // [
+	QUOT_END, // ]
+	MAP_START, // {
+	MAP_END, // }
+	COMMENT, // ( text )
+	WORD, // any other bare word, e.g. dup
+	SYMBOL, // \name
+	KEYWORD, // :name
+	STRING, // "text"
+	CHAR, // #\a, #\space, #\x2014;
+	NUMBER, // NOTE(review): not produced by the lexer code visible here
+	BOOLEAN, // #t or #f
+};
+
+// A token's payload. It is void for tokens that carry no data, such as "[".
+export type value = (str | rune | bool | void);
+
+// A location within a source file.
+// The path is borrowed from the file name given to the lexer.
+export type location = struct {
+	path: str,
+	line: uint, // newlexer starts the lexer at line 1
+	column: uint, // newlexer starts the lexer at column 0
+};
+
+// A single lexical token: its type, its value and where the lexer produced it.
+export type token = (ty, value, location);
+
+// Returns a human-friendly string for a given error. The result may be
+// statically allocated, in which case the next call overwrites it.
+export fn strerror(err: error) const str = {
+	static let buf: [512]u8 = [0...]; // backs the formatted syntax message
+	match (err) {
+	case let err: io::error =>
+		return io::strerror(err);
+	case let s: syntax =>
+		return fmt::bsprintf(buf, "{}:{}:{}: syntax error: {}",
+			s.0.path, s.0.line, s.0.column, s.1);
+	};
+};
diff --git a/parse/+test/lexer.ha b/parse/+test/lexer.ha
deleted file mode 100644
index 5c883cf..0000000
--- a/parse/+test/lexer.ha
+++ /dev/null
@@ -1,117 +0,0 @@
-use memio;
-use fmt;
-use strings;
-use io;
-
-@test fn lex() void = {
-	const cases: [_](str, []token) = [
-		(`"hello" \greeting def`,
-			["hello", mksym("greeting"), mkword("def")]),
-		(`[dup *] (a -- a) \square def`,
-			[quotstart, mkword("dup"), mkword("*"), quotend,
-				mkcomment("a -- a"), mksym("square"),
-				mkword("def")]),
-		(`#t #f`, [true, false]),
-		(`#\a #\space #\nul`, ['a', ' ', '\0']),
-		(`"\x0a;" "\x2014;" "\x2f9f4;"`, ["\n", "—", "嶲"]),
-		(`#\x #\x0a; #\x2014; #\x2f9f4;`, ['x', '\n', '—', '嶲']),
-	];
-
-	for (let i = 0z; i < len(cases); i += 1) {
-		const src = strings::toutf8(cases[i].0);
-		const src = memio::fixed(src);
-		const lexer = newlexer(&src, "<string>");
-		defer close(&lexer);
-
-		for (let j = 0z; j < len(cases[i].1); j += 1) {
-			const want = cases[i].1[j];
-			const have = match (lex(&lexer)) {
-			case let tok: token =>
-				yield tok;
-			case io::EOF =>
-				assert(false, "reached EOF");
-				return;
-			case let err: error =>
-				assert(false, strerror(err));
-				return;
-			};
-
-			if (!tokeq(want, have)) {
-				fmt::printfln("Case {}: {}", i, cases[i].0)!;
-				fmt::print("\tExpected: ")!;
-				tokpp(want);
-				fmt::print("\tGot: ")!;
-				tokpp(have);
-				assert(false);
-			};
-		};
-
-		assert(lex(&lexer) is io::EOF);
-	};
-};
-
-fn tokeq(have: token, want: token) bool = {
-	match (want) {
-	case quotstart =>
-		return have is quotstart;
-	case quotend =>
-		return have is quotend;
-	case mapstart =>
-		return have is mapstart;
-	case mapend =>
-		return have is mapend;
-	case let w: word =>
-		return (have as word).v == w.v;
-	case let s: str =>
-		return have as str == s;
-	case let s: symbol =>
-		return (have as symbol).v == s.v;
-	case let c: comment =>
-		return (have as comment).v == c.v;
-	case let r: rune =>
-		return have as rune == r;
-	case let b: bool =>
-		return have as bool == b;
-	};
-};
-
-fn tokpp(tok: token) void = {
-	match (tok) {
-	case quotstart =>
-		fmt::println("[")!;
-	case quotend =>
-		fmt::println("]")!;
-	case mapstart =>
-		fmt::println("{")!;
-	case mapend =>
-		fmt::println("}")!;
-	case let w: word =>
-		fmt::println(w.v)!;
-	case let s: symbol =>
-		fmt::printfln("{}{}", if (s.kw) ":" else "\\", s.v)!;
-	case let s: str =>
-		fmt::printfln(`"{}"`, s)!;
-	case let c: comment =>
-		fmt::printfln("({})", c.v)!;
-	case let r: rune =>
-		for (let i = 0z; i < len(longcharnames); i += 1) {
-			if (r == longcharnames[i].1) {
-				fmt::printfln("#\\{}", longcharnames[i].0)!;
-				return;
-			};
-		};
-		fmt::printfln("#\\{}", r)!;
-	case let b: bool =>
-		fmt::println(if (b) "#t" else "#f")!;
-	};
-};
-
-fn mkword(v: const str) word =
-	word{ v = v };
-
-fn mkcomment(v: const str) comment =
-	comment{ v = v };
-
-fn mksym(v: const str, kw: bool = false) symbol =
-	symbol{ v = v, kw = kw };
-
diff --git a/parse/types.ha b/parse/types.ha
deleted file mode 100644
index 809ed7e..0000000
--- a/parse/types.ha
+++ /dev/null
@@ -1,32 +0,0 @@
-use io;
-use fmt;
-
-export type invalid      = !(uint, uint);
-export type unterminated = !(const str, uint, uint);
-export type error        = !(invalid | unterminated | io::error);
-
-export type quotstart = void;
-export type quotend   = void;
-export type mapstart  = void;
-export type mapend    = void;
-
-export type comment = struct { v: str };
-export type word    = struct { v: str };
-export type symbol  = struct { v: str, kw: bool };
-
-export type token = (quotstart | quotend | mapstart | mapend |
-	word | symbol | comment | str | rune | bool);
-
-export fn strerror(err: error) const str = {
-	static let buf: [64]u8 = [0...];
-	match (err) {
-	case let err: invalid =>
-		return fmt::bsprintf(buf,
-			"Invalid token found at {}:{}", err.0, err.1);
-	case let err: unterminated =>
-		return fmt::bsprintf(buf,
-			"Unterminated {} found at {}:{}", err.0, err.1, err.2);
-	case let err: io::error =>
-		return io::strerror(err);
-	};
-};
diff --git a/test.kj b/test.kj
index 0a65d59..958f7cf 100644
--- a/test.kj
+++ b/test.kj
@@ -1,5 +1,6 @@
-3.14159 \pi def
-[dup *] \square def
+( hello world! )
+\pi 3.14159 def
+\square [dup *] def
+\circarea [square pi *] def
 
-[square pi *] \circarea def
 20 circarea . ( => 1256.636 )