lex: rework types
parent bdfaa4f1ab
commit d7b6b380fe
6 changed files with 258 additions and 210 deletions
kojote/lex/+test.ha (new file, 131 lines)
@@ -0,0 +1,131 @@
use memio;
use fmt;
use strings;
use io;

type dummytoken = (ty, value);

@test fn next() void = {
	const cases: [_](str, []dummytoken) = [
		(`"hello" \greeting def`,
			[
				(ty::STRING, "hello"),
				(ty::SYMBOL, "greeting"),
				(ty::WORD, "def")
			]),
		(`[dup *] (a -- a) \square def`,
			[
				(ty::QUOT_START, void),
				(ty::WORD, "dup"),
				(ty::WORD, "*"),
				(ty::QUOT_END, void),
				(ty::COMMENT, "a -- a"),
				(ty::SYMBOL, "square"),
				(ty::WORD, "def"),
			]),
		(`#t #f`,
			[
				(ty::BOOLEAN, true),
				(ty::BOOLEAN, false),
			]),
		(`#\a #\space #\nul`,
			[
				(ty::CHAR, 'a'),
				(ty::CHAR, ' '),
				(ty::CHAR, '\0'),
			]),
		(`"\x0a;\x2014;\x2f9f4;"`,
			[
				(ty::STRING, "\n—嶲"),
			]),
		(`#\x #\x0a; #\x2014; #\x2f9f4;`,
			[
				(ty::CHAR, 'x'),
				(ty::CHAR, '\n'),
				(ty::CHAR, '—'),
				(ty::CHAR, '嶲'),
			]),
	];

	for (let i = 0z; i < len(cases); i += 1) {
		const src = strings::toutf8(cases[i].0);
		const src = memio::fixed(src);
		const lexer = newlexer(&src, "<string>");
		defer close(&lexer);

		for (let j = 0z; j < len(cases[i].1); j += 1) {
			const want = cases[i].1[j];
			const have = match (next(&lexer)) {
			case let tok: token =>
				yield tok;
			case io::EOF =>
				assert(false, "reached EOF");
				return;
			case let err: error =>
				assert(false, strerror(err));
				return;
			};

			if (!tokeq(have, want)) {
				fmt::printf("Expected:\n\t")!;
				fmt::println(tokpp(want.0, want.1))!;
				fmt::printf("Got:\n\t")!;
				fmt::println(tokpp(have.0, have.1))!;
				assert(false);
			};
		};

		assert(next(&lexer) is io::EOF);
	};
};

fn tokeq(have: token, want: dummytoken) bool =
	have.0 == want.0 && match (have.1) {
	case void =>
		yield true;
	case let s: str =>
		yield want.1 is str && (want.1 as str) == s;
	case let r: rune =>
		yield want.1 is rune && (want.1 as rune) == r;
	case let b: bool =>
		yield want.1 is bool && (want.1 as bool) == b;
	};

fn tokpp(ty: ty, value: value) const str = {
	static let buf: [128]u8 = [0...];

	switch (ty) {
	case ty::QUOT_START =>
		return "[";
	case ty::QUOT_END =>
		return "]";
	case ty::MAP_START =>
		return "{";
	case ty::MAP_END =>
		return "}";
	case ty::COMMENT =>
		return fmt::bsprintf(buf, "({})", value as str);
	case ty::WORD =>
		return value as str;
	case ty::SYMBOL =>
		return fmt::bsprintf(buf, "\\{}", value as str);
	case ty::KEYWORD =>
		return fmt::bsprintf(buf, ":{}", value as str);
	case ty::STRING =>
		return fmt::bsprintf(buf, "\"{}\"", value as str);
	case ty::CHAR =>
		let rn = value as rune;
		for (let i = 0z; i < len(longcharnames); i += 1) {
			if (longcharnames[i].1 == rn) {
				return fmt::bsprintf(buf, "#\\{}",
					longcharnames[i].0);
			};
		};
		return fmt::bsprintf(buf, "#\\{}", rn);
	case ty::NUMBER =>
		return value as str;
	case ty::BOOLEAN =>
		return fmt::bsprintf(buf, "#{}",
			if (value as bool) 't' else 'f');
	};
};
(modified file)
@@ -27,16 +27,19 @@ def longcharnames: [_](str, rune) = [
 export type lexer = struct {
 	in: io::handle,
 	strbuf: memio::stream,
+	commentbuf: memio::stream,
 	path: str,
 	loc: (uint, uint),
 	prevloc: (uint, uint),
 	unread: (rune | void),
 };
 
+// Creates a new [[lexer]] for the given [[io::handle]]. The path is borrowed
 export fn newlexer(in: io::handle, path: str) lexer = {
 	return lexer {
 		in = in,
 		strbuf = memio::dynamic(),
+		commentbuf = memio::dynamic(),
 		path = path,
 		loc = (1, 0),
 		unread = void,
@@ -44,11 +47,14 @@ export fn newlexer(in: io::handle, path: str) lexer = {
 	};
 };
 
+// Frees resources associated with a [[lexer]].
 export fn close(lex: *lexer) void = {
 	io::close(&lex.strbuf)!;
+	io::close(&lex.commentbuf)!;
 };
 
-export fn lex(lex: *lexer) (token | io::EOF | error) = {
+// Returns the next token from the lexer.
+export fn next(lex: *lexer) (token | io::EOF | error) = {
 	const rn = match (nextrunews(lex)?) {
 	case io::EOF =>
 		return io::EOF;
@@ -58,30 +64,30 @@ export fn lex(lex: *lexer) (token | io::EOF | error) = {
 
 	switch (rn) {
 	case '(' =>
-		return comment{ v = scancomment(lex)? };
+		return mktoken(lex, ty::COMMENT, scancomment(lex)?);
 	case ')' =>
-		return lex.loc: invalid;
+		return mkerror(lex, "invalid token");
 	case '[' =>
-		return quotstart;
+		return mktoken(lex, ty::QUOT_START, void);
 	case ']' =>
-		return quotend;
+		return mktoken(lex, ty::QUOT_END, void);
 	case '{' =>
-		return mapstart;
+		return mktoken(lex, ty::MAP_START, void);
 	case '}' =>
-		return mapend;
+		return mktoken(lex, ty::MAP_END, void);
 	case '\\' =>
 		let v = scanword(lex)?;
 		if (len(v) == 0) {
-			return lex.loc: invalid;
+			return mkerror(lex, "invalid symbol literal");
 		} else {
-			return symbol{ v = v, kw = false };
+			return mktoken(lex, ty::SYMBOL, v);
 		};
 	case ':' =>
 		let v = scanword(lex)?;
 		if (len(v) == 0) {
-			return lex.loc: invalid;
+			return mkerror(lex, "invalid keyword");
 		} else {
-			return symbol{ v = v, kw = true };
+			return mktoken(lex, ty::KEYWORD, v);
 		};
 	case '#' =>
 		return scanpound(lex)?;
@@ -92,7 +98,7 @@ export fn lex(lex: *lexer) (token | io::EOF | error) = {
 	};
 
 	unget(lex, rn);
-	return word{ v = scanword(lex)? };
+	return mktoken(lex, ty::WORD, scanword(lex)?);
 };
 
 fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
@@ -124,7 +130,7 @@ fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
 	case let err: io::error =>
 		return err;
 	case utf8::invalid =>
-		return lex.loc: invalid;
+		return mkerror(lex, "invalid UTF-8 sequence");
 	};
 };
 
@@ -132,9 +138,7 @@ fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
 	for (true) {
 		match (nextrune(lex)?) {
 		case let rn: rune =>
-			if (isspace(rn)) {
-				continue;
-			};
+			if (isspace(rn)) continue;
 			return rn;
 		case io::EOF =>
 			return io::EOF;
@@ -167,36 +171,36 @@ fn scanword(lex: *lexer) (str | error) = {
 };
 
 fn scancomment(lex: *lexer) (str | error) = {
-	memio::reset(&lex.strbuf);
+	memio::reset(&lex.commentbuf);
 	for (true) {
 		const rn = match (nextrune(lex)?) {
 		case let rn: rune =>
 			yield rn;
 		case io::EOF =>
-			return ("comment", lex.loc.0, lex.loc.1): unterminated;
+			return mkerror(lex, "unterminated comment");
 		};
 
 		switch (rn) {
 		case '(' =>
-			return lex.loc: invalid;
+			return mkerror(lex, "nested comments are not allowed");
 		case ')' =>
			break;
 		case =>
-			memio::appendrune(&lex.strbuf, rn)!;
+			memio::appendrune(&lex.commentbuf, rn)!;
 		};
 	};
 
-	return memio::string(&lex.strbuf)!;
+	return memio::string(&lex.commentbuf)!;
 };
 
-fn scanstr(lex: *lexer) (str | error) = {
+fn scanstr(lex: *lexer) (token | error) = {
 	memio::reset(&lex.strbuf);
 	for (true) {
 		const rn = match (nextrune(lex)?) {
 		case let rn: rune =>
 			yield rn;
 		case io::EOF =>
-			return ("string literal", lex.loc.0, lex.loc.1): unterminated;
+			return mkerror(lex, "unterminated string literal");
 		};
 
 		switch (rn) {
@@ -207,7 +211,7 @@ fn scanstr(lex: *lexer) (str | error) = {
 			memio::appendrune(&lex.strbuf, rn)!;
 		};
 	};
-	return memio::string(&lex.strbuf)!;
+	return mktoken(lex, ty::STRING, memio::string(&lex.strbuf)!);
 };
 
 fn scanpound(lex: *lexer) (token | error) = {
@@ -215,22 +219,22 @@ fn scanpound(lex: *lexer) (token | error) = {
 	case let rn: rune =>
 		yield rn;
 	case io::EOF =>
-		return ("pound literal", lex.loc.0, lex.loc.1): unterminated;
+		return mkerror(lex, "unterminated pound literal");
 	};
 
 	switch (rn) {
 	case 't' =>
-		return true;
+		return mktoken(lex, ty::BOOLEAN, true);
 	case 'f' =>
-		return false;
+		return mktoken(lex, ty::BOOLEAN, false);
 	case '\\' =>
-		return scanchar(lex)?;
+		return scanchar(lex);
 	case =>
-		return lex.loc: invalid;
+		return mkerror(lex, "invalid pound literal");
 	};
 };
 
-fn scanchar(lex: *lexer) (rune | error) = {
+fn scanchar(lex: *lexer) (token | error) = {
 	static let namebuf: [16]u8 = [0...];
 	let namebuf = memio::fixed(namebuf);
 
@@ -238,31 +242,35 @@ fn scanchar(lex: *lexer) (rune | error) = {
 	case let rn: rune =>
 		yield rn;
 	case io::EOF =>
-		return ("character literal", lex.loc.0, lex.loc.1): unterminated;
+		return mkerror(lex, "unterminated character literal");
 	};
 
+	let ret: rune = '\0';
+
 	match (nextrune(lex)?) {
-	case let rnn: rune =>
-		unget(lex, rnn);
-		if (isspace(rnn)) {
-			return rn;
-		} else {
-			if (rn == 'x') {
-				return scanescape2(lex);
-			} else {
-				memio::appendrune(&namebuf, rn)!;
-				memio::concat(&namebuf, scanword(lex)?)!;
-				const name = memio::string(&namebuf)!;
-				for (let i = 0z; i < len(longcharnames); i += 1) {
-					if (name == longcharnames[i].0) {
-						return longcharnames[i].1;
-					};
-				};
-				return lex.loc: invalid;
-			};
-		};
 	case io::EOF =>
-		return rn;
+		return mktoken(lex, ty::CHAR, rn);
+	case let next: rune =>
+		unget(lex, next);
+
+		if (isspace(next)) {
+			return mktoken(lex, ty::CHAR, rn);
+		};
+
+		if (rn == 'x') {
+			return mktoken(lex, ty::CHAR, scanescape2(lex)?);
+		} else {
+			memio::appendrune(&namebuf, rn)!;
+			memio::concat(&namebuf, scanword(lex)?)!;
+			const name = memio::string(&namebuf)!;
+			for (let i = 0z; i < len(longcharnames); i += 1) {
+				if (name == longcharnames[i].0) {
+					return mktoken(lex, ty::CHAR,
+						longcharnames[i].1);
+				};
+			};
+			return mkerror(lex, "invalid named character literal");
+		};
 	};
 };
 
@@ -271,7 +279,7 @@ fn scanescape(lex: *lexer) (rune | error) = {
 	case let rn: rune =>
 		yield rn;
 	case io::EOF =>
-		return ("escape sequence", lex.loc.0, lex.loc.1): unterminated;
+		return mkerror(lex, "unterminated character escape");
 	};
 
 	switch (rn) {
@@ -288,7 +296,7 @@ fn scanescape(lex: *lexer) (rune | error) = {
 	case '"' => return '"';
 	case 'x' => return scanescape2(lex)?;
 	case =>
-		return lex.loc: invalid;
+		return mkerror(lex, "invalid character escape");
 	};
 };
 
@@ -304,7 +312,7 @@ fn scanescape2(lex: *lexer) (rune | error) = {
 	case let rn: rune =>
 		yield rn;
 	case io::EOF =>
-		return ("escape sequence", lex.loc.0, lex.loc.1): unterminated;
+		return mkerror(lex, "unterminated character escape");
 	};
 
 	const buf: [6]u8 = [0...];
@@ -317,11 +325,11 @@ fn scanescape2(lex: *lexer) (rune | error) = {
 		case let rn: rune =>
 			yield rn;
 		case io::EOF =>
-			return ("escape sequence", lex.loc.0, lex.loc.1): unterminated;
+			return mkerror(lex, "unterminated escape sequence");
 		};
 
 		if (count > 6) {
-			return lex.loc: invalid;
+			return mkerror(lex, "invalid escape sequence");
 		} else if (rn == ';') {
 			break;
 		} else {
@@ -337,10 +345,16 @@ fn scanescape2(lex: *lexer) (rune | error) = {
 	case let codepoint: u32 =>
 		return codepoint: rune;
 	case =>
-		return lex.loc: invalid;
+		return mkerror(lex, "invalid escape sequence");
 	};
 };
 
+fn mktoken(lex: *lexer, ty: ty, value: value) token =
+	(ty, value, location{ path = lex.path, line = lex.loc.0, column = lex.loc.1 });
+
+fn mkerror(lex: *lexer, msg: const str) syntax =
+	(location{ path = lex.path, line = lex.loc.0, column = lex.loc.1 }, msg);
+
 fn isspace(rn: rune) bool = {
 	if (ascii::isspace(rn)) {
 		return true;
@@ -355,7 +369,7 @@ fn isspace(rn: rune) bool = {
 };
 
 fn isdelimiter(rn: rune) bool = {
-	match (strings::index(`()[]{}\:#`, rn)) {
+	match (strings::index(`()[]{}`, rn)) {
 	case size =>
 		return true;
 	case =>
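For orientation, a minimal sketch of driving the reworked lexer from a consumer follows. The import path (kojote::lex), the use of os::stdin, and the printed format are assumptions for illustration; only newlexer, next, close, token, error and strerror come from this commit.

use fmt;
use io;
use os;
// Assumed import path for this sketch; adjust to the real module layout.
use kojote::lex;

export fn main() void = {
	// The lexer borrows both the handle and the path string.
	let lx = lex::newlexer(os::stdin, "<stdin>");
	defer lex::close(&lx);

	for (true) {
		const tok = match (lex::next(&lx)) {
		case let tok: lex::token =>
			yield tok;
		case io::EOF =>
			break;
		case let err: lex::error =>
			fmt::fatal("lex: {}", lex::strerror(err));
		};
		// Each token is now a (ty, value, location) tuple.
		fmt::printfln("{}:{}:{}: token kind {}", tok.2.path,
			tok.2.line, tok.2.column, tok.0: uint)!;
	};
};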
kojote/lex/types.ha (new file, 51 lines)
@@ -0,0 +1,51 @@
use io;
use fmt;

// A syntax error.
export type syntax = !(location, str);

// All possible lexer errors
export type error = !(io::error | syntax);

// A token type
export type ty = enum uint {
	QUOT_START,
	QUOT_END,
	MAP_START,
	MAP_END,
	COMMENT,
	WORD,
	SYMBOL,
	KEYWORD,
	STRING,
	CHAR,
	NUMBER,
	BOOLEAN,
};

// A token value, used for literal tokens and comments.
export type value = (str | rune | bool | void);

// A location within a source file.
// The path is borrowed from the file name given to the lexer.
export type location = struct {
	path: str,
	line: uint,
	column: uint,
};

// A single lexical token.
export type token = (ty, value, location);

// Returns a human-friendly string for a given error. The result may be
// statically allocated.
export fn strerror(err: error) const str = {
	static let buf: [512]u8 = [0...];
	match (err) {
	case let err: io::error =>
		return io::strerror(err);
	case let s: syntax =>
		return fmt::bsprintf(buf, "{}:{}:{}: syntax error: {}",
			s.0.path, s.0.line, s.0.column, s.1);
	};
};
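A short sketch of how these tuple-based types compose, assuming it sits inside the same lex module so the unqualified names resolve; the function name and the literal values are made up for the example and are not part of the commit.

use fmt;

// Hypothetical helper, only to show the shapes of the new types.
fn types_example() void = {
	const loc = location { path = "<string>", line = 1, column = 9 };

	// A token carries its kind, its payload and where it was read.
	const v: value = "greeting";
	const tok: token = (ty::SYMBOL, v, loc);
	assert(tok.0 == ty::SYMBOL && tok.1 as str == "greeting");

	// A syntax error pairs a location with a message; strerror renders
	// either error variant as one string.
	const err: error = (loc, "invalid token"): syntax;
	fmt::println(strerror(err))!; // <string>:1:9: syntax error: invalid token
};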
(deleted file, 117 lines)
@@ -1,117 +0,0 @@
use memio;
use fmt;
use strings;
use io;

@test fn lex() void = {
	const cases: [_](str, []token) = [
		(`"hello" \greeting def`,
			["hello", mksym("greeting"), mkword("def")]),
		(`[dup *] (a -- a) \square def`,
			[quotstart, mkword("dup"), mkword("*"), quotend,
				mkcomment("a -- a"), mksym("square"),
				mkword("def")]),
		(`#t #f`, [true, false]),
		(`#\a #\space #\nul`, ['a', ' ', '\0']),
		(`"\x0a;" "\x2014;" "\x2f9f4;"`, ["\n", "—", "嶲"]),
		(`#\x #\x0a; #\x2014; #\x2f9f4;`, ['x', '\n', '—', '嶲']),
	];

	for (let i = 0z; i < len(cases); i += 1) {
		const src = strings::toutf8(cases[i].0);
		const src = memio::fixed(src);
		const lexer = newlexer(&src, "<string>");
		defer close(&lexer);

		for (let j = 0z; j < len(cases[i].1); j += 1) {
			const want = cases[i].1[j];
			const have = match (lex(&lexer)) {
			case let tok: token =>
				yield tok;
			case io::EOF =>
				assert(false, "reached EOF");
				return;
			case let err: error =>
				assert(false, strerror(err));
				return;
			};

			if (!tokeq(want, have)) {
				fmt::printfln("Case {}: {}", i, cases[i].0)!;
				fmt::print("\tExpected: ")!;
				tokpp(want);
				fmt::print("\tGot: ")!;
				tokpp(have);
				assert(false);
			};
		};

		assert(lex(&lexer) is io::EOF);
	};
};

fn tokeq(have: token, want: token) bool = {
	match (want) {
	case quotstart =>
		return have is quotstart;
	case quotend =>
		return have is quotend;
	case mapstart =>
		return have is mapstart;
	case mapend =>
		return have is mapend;
	case let w: word =>
		return (have as word).v == w.v;
	case let s: str =>
		return have as str == s;
	case let s: symbol =>
		return (have as symbol).v == s.v;
	case let c: comment =>
		return (have as comment).v == c.v;
	case let r: rune =>
		return have as rune == r;
	case let b: bool =>
		return have as bool == b;
	};
};

fn tokpp(tok: token) void = {
	match (tok) {
	case quotstart =>
		fmt::println("[")!;
	case quotend =>
		fmt::println("]")!;
	case mapstart =>
		fmt::println("{")!;
	case mapend =>
		fmt::println("}")!;
	case let w: word =>
		fmt::println(w.v)!;
	case let s: symbol =>
		fmt::printfln("{}{}", if (s.kw) ":" else "\\", s.v)!;
	case let s: str =>
		fmt::printfln(`"{}"`, s)!;
	case let c: comment =>
		fmt::printfln("({})", c.v)!;
	case let r: rune =>
		for (let i = 0z; i < len(longcharnames); i += 1) {
			if (r == longcharnames[i].1) {
				fmt::printfln("#\\{}", longcharnames[i].0)!;
				return;
			};
		};
		fmt::printfln("#\\{}", r)!;
	case let b: bool =>
		fmt::println(if (b) "#t" else "#f")!;
	};
};

fn mkword(v: const str) word =
	word{ v = v };

fn mkcomment(v: const str) comment =
	comment{ v = v };

fn mksym(v: const str, kw: bool = false) symbol =
	symbol{ v = v, kw = kw };
(deleted file, 32 lines)
@@ -1,32 +0,0 @@
use io;
use fmt;

export type invalid = !(uint, uint);
export type unterminated = !(const str, uint, uint);
export type error = !(invalid | unterminated | io::error);

export type quotstart = void;
export type quotend = void;
export type mapstart = void;
export type mapend = void;

export type comment = struct { v: str };
export type word = struct { v: str };
export type symbol = struct { v: str, kw: bool };

export type token = (quotstart | quotend | mapstart | mapend |
	word | symbol | comment | str | rune | bool);

export fn strerror(err: error) const str = {
	static let buf: [64]u8 = [0...];
	match (err) {
	case let err: invalid =>
		return fmt::bsprintf(buf,
			"Invalid token found at {}:{}", err.0, err.1);
	case let err: unterminated =>
		return fmt::bsprintf(buf,
			"Unterminated {} found at {}:{}", err.0, err.1, err.2);
	case let err: io::error =>
		return io::strerror(err);
	};
};
test.kj (7 lines changed)
@@ -1,5 +1,6 @@
-3.14159 \pi def
+( hello world! )
-[dup *] \square def
+\pi 3.14159 def
+\square [dup *] def
+\circarea [square pi *] def
 
-[square pi *] \circarea def
 20 circarea . ( => 1256.636 )