finish first implementation of lexer

2024-12-04 13:29:11 -03:00 · 2024-12-04 13:29:11 -03:00 · 57979aa6fc
commit 57979aa6fc
parent 3efdaf7ade
3 changed files with 196 additions and 24 deletions
--- a/4
+++ b/4
@ -0,0 +1,4 @@
 # Runs the test suite via `hare test`; the leading @ stops make from echoing the command.
 .PHONY: test
 test:
 	@hare test
--- a/lex.ha
+++ b/lex.ha
@ -1,11 +1,11 @@
 use ascii; // TODO: maybe use unicode?
 use bufio;
 use encoding::utf8;
 use fmt;
 use io;
 use memio;
 use os;
 use encoding::utf8;
 use strings;
 use ascii;
 // my cod prob sux :(
@ -33,32 +33,90 @@ export fn close(lex: *lexer) void = {
 	io::close(&lex.strbuf)!;
 };
-fn updateloc(lex: *lexer, rn: rune) void = {
+export fn next(lex: *lexer) (token | io::EOF | error) = {
-	if (rn == '\n') {
+	const rn = match (nextrunews(lex)?) {
-		lex.loc = (lex.loc.0 + 1, 0);
+	case io::EOF =>
-	} else {
+		return io::EOF;
-		lex.loc.1 += 1;
+	case let rn: rune =>
 		yield rn;
 	};
 	switch (rn) {
 	case '(' =>
 		return punctuation::LEFT_PAREN: token;
 	case ')' =>
 		return punctuation::RIGHT_PAREN: token;
 	case '[' =>
 		return punctuation::LEFT_SQUARE_BRACKET: token;
 	case ']' =>
 		return punctuation::RIGHT_SQUARE_BRACKET: token;
 	case '{' =>
 		return punctuation::LEFT_CURLY_BRACKET: token;
 	case '}' =>
 		return punctuation::RIGHT_CURLY_BRACKET: token;
 	case '\\' =>
 		return punctuation::BACKSLASH: token;
 	case ':' =>
 		return punctuation::COLON: token;
 	case '"' =>
 		match (scanstr(lex)?) {
 		case let s: str =>
 			return s;
 		case io::EOF =>
 			return io::EOF;
 		};
 	case =>
 		yield;
 	};
 	unget(lex, rn);
 	return word{ v = scanword(lex)? };
 };
-fn nextrune(lex: *lexer) (rune | io::error | io::EOF | utf8::invalid) = {
+fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
 	match (lex.unread) {
 	case let rn: rune =>
 		lex.prevloc = lex.loc;
 		lex.unread = void;
-		updateloc(lex, rn);
+		if (rn == '\n') {
 			lex.loc = (lex.loc.0 + 1, 0);
 		} else {
 			lex.loc.1 += 1;
 		};
 		return rn;
 	case void =>
 		yield;
 	};
-	match (bufio::read_rune(lex.in)?) {
+	match (bufio::read_rune(lex.in)) {
 	case let rn: rune =>
 		lex.prevloc = lex.loc;
-		updateloc(lex, rn);
+		if (rn == '\n') {
 			lex.loc = (lex.loc.0 + 1, 0);
 		} else {
 			lex.loc.1 += 1;
 		};
 		return rn;
 	case io::EOF =>
 		return io::EOF;
 	case let err: io::error =>
 		return err;
 	case utf8::invalid =>
 		return lex.loc: invalid;
 	};
 };
 // Pulls runes from nextrune, discarding every rune for which ascii::isspace
 // is true, and returns the first one that is not whitespace. Returns io::EOF
 // if the input ends before such a rune is found; errors from nextrune are
 // propagated to the caller.
 fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
 	for (true) {
 		const r = match (nextrune(lex)?) {
 		case io::EOF =>
 			return io::EOF;
 		case let r: rune =>
 			yield r;
 		};
 		if (!ascii::isspace(r)) {
 			return r;
 		};
 	};
 };
@ -68,22 +126,93 @@ fn unget(lex: *lexer, rn: rune) void = {
 	lex.loc = lex.prevloc;
 };
-fn skipws(lex: *lexer) (void | io::EOF | io::error | utf8::invalid) = {
+fn scanword(lex: *lexer) (str | error) = {
 	memio::reset(&lex.strbuf);
 	for (true) {
-		match (nextrune(lex)?) {
+		const rn = match (nextrune(lex)?) {
 		case io::EOF => return io::EOF;
 		case let rn: rune =>
-			if (!ascii::isspace(rn)) {
+			yield rn;
-				unget(lex, rn);
+		case io::EOF =>
-				return;
+			break;
 			};
 		};
 		if (ascii::isspace(rn)) {
 			unget(lex, rn);
 			break;
 		};
 		memio::appendrune(&lex.strbuf, rn)!;
 	};
 	return memio::string(&lex.strbuf)!;
 };
 // Scans the remainder of a string literal; the caller has already consumed
 // the opening double quote. Runes are accumulated in lex.strbuf (which is
 // reset first) until the closing double quote. A backslash hands control to
 // scanescape and the rune it returns is appended instead.
 //
 // Reaching the end of input before the closing quote yields an invalid
 // error carrying lex.loc. Although io::EOF is part of the result type, this
 // function never returns it.
 fn scanstr(lex: *lexer) (str | io::EOF | error) = {
 	memio::reset(&lex.strbuf);
 	for (true) {
 		const c = match (nextrune(lex)?) {
 		case io::EOF =>
 			// unterminated string literal
 			return lex.loc: invalid;
 		case let c: rune =>
 			yield c;
 		};
 		if (c == '"') {
 			break;
 		};
 		const out = if (c == '\\') scanescape(lex)? else c;
 		memio::appendrune(&lex.strbuf, out)!;
 	};
 	return memio::string(&lex.strbuf)!;
 };
 // Decodes the rune following a backslash inside a string literal. The
 // accepted runes are the double quote, the backslash and a line feed; each
 // one is returned unchanged. Any other rune, or the end of input, yields an
 // invalid error carrying lex.loc.
 //
 // NOTE(review): the third accepted rune is an actual line feed character,
 // not the letter 'n' -- test_scanstr relies on this; confirm it is intended.
 fn scanescape(lex: *lexer) (rune | error) = {
 	match (nextrune(lex)?) {
 	case io::EOF =>
 		return lex.loc: invalid;
 	case let esc: rune =>
 		if (esc == '"' || esc == '\\' || esc == '\n') {
 			return esc;
 		};
 		return lex.loc: invalid;
 	};
 };
 // Tests! :)
 // Test helper: calls next and returns the token it produced. Aborts if next
 // reports an error or yields anything other than a token (i.e. io::EOF).
 fn tnext(lex: *lexer) token = {
 	const res = next(lex)!;
 	assert(res is token);
 	return res as token;
 };
@test fn test_next() void = {
 	// Lexes the input `"hello" \greeting def` and checks the four tokens
 	// in order: the string "hello", a BACKSLASH punctuation, then the
 	// words "greeting" and "def".
 	let lex = newlexer(&memio::fixed(
 		strings::toutf8("\"hello\" \\greeting def")),
 		"<string>");
 	defer close(&lex);
 	let tk = tnext(&lex);
 	assert(tk is str && tk: str == "hello");
 	let tk = tnext(&lex);
 	assert(tk is punctuation && tk: punctuation == punctuation::BACKSLASH);
 	let tk = tnext(&lex);
 	assert(tk is word && (tk: word).v == "greeting");
 	let tk = tnext(&lex);
 	assert(tk is word && (tk: word).v == "def");
 };
@test fn test_nextrune() void = {
 	let lex = newlexer(&memio::fixed(strings::toutf8("a\nb")),
 		"<string>");
@ -95,12 +224,33 @@ fn skipws(lex: *lexer) (void | io::EOF | io::error | utf8::invalid) = {
 	assert(lex.loc.0 == 2u && lex.loc.1 == 1u);
 };
-@test fn test_skipws() void = {
+@test fn test_nextrunews() void = {
 	let lex = newlexer(&memio::fixed(strings::toutf8("\n a")),
 		"<string>");
 	defer close(&lex);
-	skipws(&lex)!;
+	assert(nextrunews(&lex)! == 'a');
 	assert(nextrune(&lex)! == 'a');
 	assert(lex.loc.0 == 2u && lex.loc.1 == 2u);
 };
@test fn test_scanword() void = {
 	// scanword must stop at the first whitespace: only "string->number"
 	// is returned and the trailing " ." is left unread.
 	let lex = newlexer(&memio::fixed(strings::toutf8("string->number .")),
 		"<string>");
 	defer close(&lex);
 	assert(scanword(&lex)! == "string->number");
 };
@test fn test_scanstr() void = {
 	// The raw input is a quoted literal containing three escapes:
 	// backslash-backslash, backslash-quote and backslash followed by an
 	// actual line feed. The expected result is \hello"world! followed by
 	// a line feed.
 	let lex = newlexer(&memio::fixed(strings::toutf8("\"\\\\hello\\\"world!\\\n\"")),
 		"<string>");
 	defer close(&lex);
 	// scanstr expects the opening quote to have been consumed already.
 	assert(nextrune(&lex)! == '"');
 	match (scanstr(&lex)!) {
 	case io::EOF =>
 		assert(false);
 	case let s: str =>
 		assert(s == "\\hello\"world!\n");
 	};
 };
--- a/types.ha
+++ b/types.ha
@ -1,7 +1,25 @@
-export type punct = enum uint {
+use io;
 use fmt;
 // An invalid-input error. The lexer creates it by casting its loc field, so
 // the pair holds the location at which lexing failed; strerror prints the
 // two values as "{}:{}".
 export type invalid = !(uint, uint);
 // Any error the lexer can return: either invalid input or an underlying I/O
 // error.
 export type error = !(invalid | io::error);
 // Single-rune punctuation tokens. In declaration order, next produces these
 // for the runes ( ) [ ] { } \ and :
 export type punctuation = enum uint {
 	LEFT_PAREN, RIGHT_PAREN,
 	LEFT_SQUARE_BRACKET, RIGHT_SQUARE_BRACKET,
 	LEFT_CURLY_BRACKET, RIGHT_CURLY_BRACKET,
 	BACKSLASH, COLON,
 };
-export type token = (punct | str | f64 | bool);
+export type word = struct { v: str };
 // A token returned by next: a punctuation mark, a bare word, or the decoded
 // contents of a double-quoted string literal (as a str).
 export type token = (punctuation | word | str);
 // Converts an error into a human-readable message. I/O errors are delegated
 // to io::strerror. An invalid error is formatted as
 // "<first>:<second>: Invalid token found" into a statically allocated
 // 64-byte buffer, so that message is overwritten by the next call which
 // formats an invalid error.
 export fn strerror(err: error) const str = {
 	static let buf: [64]u8 = [0...];
 	return match (err) {
 	case let ioerr: io::error =>
 		yield io::strerror(ioerr);
 	case let pos: invalid =>
 		yield fmt::bsprintf(buf,
 			"{}:{}: Invalid token found", pos.0, pos.1);
 	};
 };