finish first implementation of lexer

2024-12-04 13:29:11 -03:00 · 2024-12-04 13:29:11 -03:00 · 57979aa6fc
commit 57979aa6fc
parent 3efdaf7ade
3 changed files with 196 additions and 24 deletions
--- a/4
+++ b/4
@@ -0,0 +1,4 @@
+# `make test` invokes `hare test`; the target is phony because it produces no
+# file named "test".
+.PHONY: test
+
+test:
+	@hare test
--- a/lex.ha
+++ b/lex.ha
@@ -1,11 +1,11 @@
+use ascii; // TODO: maybe use unicode?
 use bufio;
+use encoding::utf8;
 use fmt;
 use io;
 use memio;
 use os;
-use encoding::utf8;
 use strings;
-use ascii;

 // my cod prob sux :(

@@ -33,32 +33,90 @@ export fn close(lex: *lexer) void = {
 	io::close(&lex.strbuf)!;
 };

-fn updateloc(lex: *lexer, rn: rune) void = {
+// Reads and returns the next token from the lexer's input. Whitespace before
+// the token is skipped. Returns io::EOF when the input ends before any
+// non-whitespace rune is found; errors from the rune reader and the scan
+// helpers are propagated with `?`.
+//
+// A single punctuation rune becomes the matching [[punctuation]] value, a
+// double quote starts a string literal read by [[scanstr]], and any other rune
+// begins a [[word]] read by [[scanword]].
+export fn next(lex: *lexer) (token | io::EOF | error) = {
+	const rn = match (nextrunews(lex)?) {
+	case io::EOF =>
+		return io::EOF;
+	case let rn: rune =>
+		yield rn;
+	};
+
+	switch (rn) {
+	case '(' =>
+		return punctuation::LEFT_PAREN: token;
+	case ')' =>
+		return punctuation::RIGHT_PAREN: token;
+	case '[' =>
+		return punctuation::LEFT_SQUARE_BRACKET: token;
+	case ']' =>
+		return punctuation::RIGHT_SQUARE_BRACKET: token;
+	case '{' =>
+		return punctuation::LEFT_CURLY_BRACKET: token;
+	case '}' =>
+		return punctuation::RIGHT_CURLY_BRACKET: token;
+	case '\\' =>
+		return punctuation::BACKSLASH: token;
+	case ':' =>
+		return punctuation::COLON: token;
+	case '"' =>
+		// The opening quote is already consumed; scanstr reads up to
+		// and including the closing quote.
+		match (scanstr(lex)?) {
+		case let s: str =>
+			return s;
+		case io::EOF =>
+			return io::EOF;
+		};
+	case =>
+		yield;
+	};
+
+	// rn is the first rune of a word: push it back so scanword sees it.
+	unget(lex, rn);
+	return word{ v = scanword(lex)? };
+};
+
+// Returns the next rune of input and advances lex.loc. A rune held in
+// lex.unread (presumably stored there by unget; its full body is not shown
+// here) is returned first and the slot is cleared; otherwise one rune is read
+// from lex.in. Before advancing, the current location is saved in
+// lex.prevloc. A newline increments the first component of lex.loc and resets
+// the second to 0; any other rune increments the second component.
+//
+// Returns io::EOF at end of input, passes an io::error through unchanged, and
+// reports invalid UTF-8 as [[invalid]] tagged with the current location.
+fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
+	match (lex.unread) {
+	case let rn: rune =>
+		lex.prevloc = lex.loc;
+		lex.unread = void;
 		if (rn == '\n') {
 			lex.loc = (lex.loc.0 + 1, 0);
 		} else {
 			lex.loc.1 += 1;
 		};
-};
-
-fn nextrune(lex: *lexer) (rune | io::error | io::EOF | utf8::invalid) = {
-	match (lex.unread) {
-	case let rn: rune =>
-		lex.prevloc = lex.loc;
-		lex.unread = void;
-		updateloc(lex, rn);
 		return rn;
 	case void =>
 		yield;
 	};

-	match (bufio::read_rune(lex.in)?) {
+	// Nothing was pushed back: read a fresh rune from the input. The result
+	// is matched exhaustively (no `?`) so that utf8::invalid can be
+	// converted into this module's own error type.
+	match (bufio::read_rune(lex.in)) {
 	case let rn: rune =>
 		lex.prevloc = lex.loc;
-		updateloc(lex, rn);
+		if (rn == '\n') {
+			lex.loc = (lex.loc.0 + 1, 0);
+		} else {
+			lex.loc.1 += 1;
+		};
 		return rn;
 	case io::EOF =>
 		return io::EOF;
+	case let err: io::error =>
+		return err;
+	case utf8::invalid =>
+		return lex.loc: invalid;
+	};
+};
+
+// Like [[nextrune]], but discards every rune for which ascii::isspace is true
+// and returns the first rune that is not whitespace. Returns io::EOF if the
+// input ends first; errors from nextrune are propagated.
+fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
+	for (true) {
+		match (nextrune(lex)?) {
+		case let rn: rune =>
+			if (ascii::isspace(rn)) {
+				continue;
+			};
+			return rn;
+		case io::EOF =>
+			return io::EOF;
+		};
 	};
 };

@@ -68,22 +126,93 @@ fn unget(lex: *lexer, rn: rune) void = {
 	lex.loc = lex.prevloc;
 };

-fn skipws(lex: *lexer) (void | io::EOF | io::error | utf8::invalid) = {
+// Scans a word: lex.strbuf is reset, then runes are appended to it until
+// whitespace or end of input is reached. A terminating whitespace rune is
+// pushed back with unget. The result is the string obtained from lex.strbuf
+// via memio::string.
+//
+// NOTE(review): the returned str appears to refer to lex.strbuf, which
+// scanword and scanstr reset on entry, so it is presumably only valid until
+// the next scan - confirm against the memio documentation.
+// NOTE(review): only whitespace ends a word; punctuation runes such as ')'
+// are appended like any other rune. Confirm that this is intended.
+fn scanword(lex: *lexer) (str | error) = {
+	memio::reset(&lex.strbuf);
 	for (true) {
-		match (nextrune(lex)?) {
-		case io::EOF => return io::EOF;
+		const rn = match (nextrune(lex)?) {
 		case let rn: rune =>
-			if (!ascii::isspace(rn)) {
+			yield rn;
+		case io::EOF =>
+			break;
+		};
+		if (ascii::isspace(rn)) {
 			unget(lex, rn);
-				return;
-			};
+			break;
 		};
+		memio::appendrune(&lex.strbuf, rn)!;
 	};
+	return memio::string(&lex.strbuf)!;
 };

+// Scans the body of a string literal. The caller must already have consumed
+// the opening double quote. lex.strbuf is reset, then runes are appended
+// until an unescaped closing double quote is read (the quote itself is not
+// stored). A backslash hands control to [[scanescape]], whose result is
+// appended instead.
+//
+// If the input ends before the closing quote, [[invalid]] tagged with the
+// current location is returned. As written, no path returns io::EOF even
+// though the result type allows it.
+fn scanstr(lex: *lexer) (str | io::EOF | error) = {
+	memio::reset(&lex.strbuf);
+	for (true) {
+		const rn = match (nextrune(lex)?) {
+		case let rn: rune =>
+			yield rn;
+		case io::EOF =>
+			// Unterminated string literal.
+			return lex.loc: invalid;
+		};
+
+		switch (rn) {
+		case '"' => break;
+		case '\\' =>
+			memio::appendrune(&lex.strbuf, scanescape(lex)?)!;
+		case =>
+			memio::appendrune(&lex.strbuf, rn)!;
+		};
+	};
+	return memio::string(&lex.strbuf)!;
+};
+
+// Decodes the rune following a backslash inside a string literal; the
+// backslash itself has already been consumed by the caller. Supported
+// escapes:
+//
+//	\"	double quote
+//	\\	backslash
+//	\n	newline (a backslash followed by a literal newline is also
+//		accepted and likewise yields a newline)
+//
+// Any other rune, or end of input, is reported as [[invalid]] tagged with the
+// lexer's current location.
+fn scanescape(lex: *lexer) (rune | error) = {
+	const rn = match (nextrune(lex)?) {
+	case let rn: rune =>
+		yield rn;
+	case io::EOF =>
+		return lex.loc: invalid;
+	};
+
+	switch (rn) {
+	case '"' =>
+		return '"';
+	case '\\' =>
+		return '\\';
+	case 'n', '\n' =>
+		// 'n' is the conventional spelling of the newline escape; the
+		// literal-newline form is kept so existing input still lexes.
+		return '\n';
+	case =>
+		return lex.loc: invalid;
+	};
+};

 // Tests! :)

+// Test helper: returns the next token from lex. An error from [[next]] aborts
+// through the `!` assertion, and io::EOF fails through assert(false).
+fn tnext(lex: *lexer) token = {
+	match (next(lex)!) {
+	case let t: token =>
+		return t;
+	case =>
+		assert(false);
+		// Presumably only here to give this branch a token result.
+		return word { v = "" };
+	};
+};
+
+// Lexes the input `"hello" \greeting def` and checks the token sequence:
+// the string "hello", a BACKSLASH punctuation token, then the words
+// "greeting" and "def".
+@test fn test_next() void = {
+	let lex = newlexer(&memio::fixed(
+		strings::toutf8("\"hello\" \\greeting def")),
+		"<string>");
+	defer close(&lex);
+
+	let tk = tnext(&lex);
+	assert(tk is str && tk: str == "hello");
+	let tk = tnext(&lex);
+	assert(tk is punctuation && tk: punctuation == punctuation::BACKSLASH);
+	let tk = tnext(&lex);
+	assert(tk is word && (tk: word).v == "greeting");
+	let tk = tnext(&lex);
+	assert(tk is word && (tk: word).v == "def");
+};
+
@test fn test_nextrune() void = {
 	let lex = newlexer(&memio::fixed(strings::toutf8("a\nb")),
 		"<string>");
@@ -95,12 +224,33 @@ fn skipws(lex: *lexer) (void | io::EOF | io::error | utf8::invalid) = {
 	assert(lex.loc.0 == 2u && lex.loc.1 == 1u);
 };

-@test fn test_skipws() void = {
+// Checks that nextrunews skips the leading newline and space of "\n a",
+// returns 'a', and leaves the lexer location at (2, 2).
+@test fn test_nextrunews() void = {
 	let lex = newlexer(&memio::fixed(strings::toutf8("\n a")),
 		"<string>");
 	defer close(&lex);

-	skipws(&lex)!;
-	assert(nextrune(&lex)! == 'a');
+	assert(nextrunews(&lex)! == 'a');
 	assert(lex.loc.0 == 2u && lex.loc.1 == 2u);
 };
+
+// Checks that scanword stops at the first whitespace: for the input
+// "string->number ." it returns "string->number".
+@test fn test_scanword() void = {
+	let lex = newlexer(&memio::fixed(strings::toutf8("string->number .")),
+		"<string>");
+	defer close(&lex);
+
+	assert(scanword(&lex)! == "string->number");
+};
+
+// Checks escape handling in scanstr. The input is a quoted literal that
+// contains an escaped backslash, an escaped double quote, and a backslash
+// followed by a literal newline; the expected result is the text
+// \hello"world! followed by a newline.
+@test fn test_scanstr() void = {
+	let lex = newlexer(&memio::fixed(strings::toutf8("\"\\\\hello\\\"world!\\\n\"")),
+		"<string>");
+	defer close(&lex);
+
+	// scanstr expects the opening quote to have been consumed already.
+	assert(nextrune(&lex)! == '"');
+	match (scanstr(&lex)!) {
+	case io::EOF =>
+		assert(false);
+	case let s: str =>
+		assert(s == "\\hello\"world!\n");
+	};
+};
--- a/types.ha
+++ b/types.ha
@@ -1,7 +1,25 @@
-export type punct = enum uint {
+use io;
+use fmt;
+
+// Error raised for input that cannot be lexed. It carries the lexer location
+// (the pair kept in lex.loc) at which the problem was detected.
+export type invalid = !(uint, uint);
+// Any error the lexer can report: an [[invalid]] token or an underlying
+// io::error.
+export type error = !(invalid | io::error);
+
+// Single-rune punctuation tokens. The lexer's next function in lex.ha maps
+// the runes ( ) [ ] { } \ and : to these values, in that order.
+export type punctuation = enum uint {
 	LEFT_PAREN, RIGHT_PAREN,
 	LEFT_SQUARE_BRACKET, RIGHT_SQUARE_BRACKET,
 	LEFT_CURLY_BRACKET, RIGHT_CURLY_BRACKET,
 	BACKSLASH, COLON,
 };
-export type token = (punct | str | f64 | bool);
+// A bare word token; v holds its text. Wrapping the text in a struct keeps
+// words distinct from string literals, which are plain str tokens.
+export type word = struct { v: str };
+// A lexed token: punctuation, a bare word, or a string literal (str).
+export type token = (punctuation | word | str);
+
+// Returns a human-readable description of a lexer error. For [[invalid]] the
+// message has the form "A:B: Invalid token found", where A and B are the two
+// location components. It is formatted into a static buffer, so a later call
+// that formats another [[invalid]] error overwrites the earlier result. An
+// io::error is described by io::strerror.
+export fn strerror(err: error) const str = {
+	static let buf: [64]u8 = [0...];
+	match (err) {
+	case let err: invalid =>
+		return fmt::bsprintf(buf,
+			"{}:{}: Invalid token found", err.0, err.1);
+	case let err: io::error =>
+		return io::strerror(err);
+	};
+};