fhdskjgfdsjglfdskjgfd lexer

2024-12-04 14:56:08 -03:00 · 2024-12-04 14:56:08 -03:00 · 696d4910b1
commit 696d4910b1
parent 63fe8290a5
2 changed files with 177 additions and 91 deletions
--- a/lex.ha
+++ b/lex.ha
@ -1,10 +1,12 @@
-use ascii; // TODO: maybe use unicode?
+use ascii;
 use bufio;
 use encoding::utf8;
 use fmt;
 use io;
 use memio;
-use os;
+use unicode;
 // Testing dependency
 use fmt;
 use strings;
 // my cod prob sux :(
@ -33,7 +35,7 @@ export fn close(lex: *lexer) void = {
 	io::close(&lex.strbuf)!;
 };
-export fn next(lex: *lexer) (token | io::EOF | error) = {
+export fn lex(lex: *lexer) (token | io::EOF | error) = {
 	const rn = match (nextrunews(lex)?) {
 	case io::EOF =>
 		return io::EOF;
@ -43,28 +45,35 @@ export fn next(lex: *lexer) (token | io::EOF | error) = {
 	switch (rn) {
 	case '(' =>
-		return punctuation::LEFT_PAREN: token;
+		return comment{ v = scancomment(lex)? };
 	case ')' =>
-		return punctuation::RIGHT_PAREN: token;
+		return lex.loc: invalid;
 	case '[' =>
-		return punctuation::LEFT_SQUARE_BRACKET: token;
+		return quotstart;
 	case ']' =>
-		return punctuation::RIGHT_SQUARE_BRACKET: token;
+		return quotend;
 	case '{' =>
-		return punctuation::LEFT_CURLY_BRACKET: token;
+		return mapstart;
 	case '}' =>
-		return punctuation::RIGHT_CURLY_BRACKET: token;
+		return mapend;
 	case '\\' =>
-		return punctuation::BACKSLASH: token;
+		let v = scanword(lex)?;
-	case ':' =>
+		if (len(v) == 0) {
-		return punctuation::COLON: token;
+			return lex.loc: invalid;
-	case '"' =>
+		} else {
-		match (scanstr(lex)?) {
+			return symbol{ v = v, kw = false };
 		case let s: str =>
 			return s;
 		case io::EOF =>
 			return io::EOF;
 		};
 	case ':' =>
 		let v = scanword(lex)?;
 		if (len(v) == 0) {
 			return lex.loc: invalid;
 		} else {
 			return symbol{ v = v, kw = true };
 		};
 	case '\'' =>
 		return scanchar(lex)?;
 	case '"' =>
 		return scanstr(lex)?;
 	case =>
 		yield;
 	};
@ -110,7 +119,7 @@ fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
 	for (true) {
 		match (nextrune(lex)?) {
 		case let rn: rune =>
-			if (ascii::isspace(rn)) {
+			if (isspace(rn)) {
 				continue;
 			};
 			return rn;
@ -135,7 +144,7 @@ fn scanword(lex: *lexer) (str | error) = {
 		case io::EOF =>
 			break;
 		};
-		if (ascii::isspace(rn)) {
+		if (isspace(rn) || isdelimiter(rn)) {
 			unget(lex, rn);
 			break;
 		};
@ -144,14 +153,37 @@ fn scanword(lex: *lexer) (str | error) = {
 	return memio::string(&lex.strbuf)!;
 };
-fn scanstr(lex: *lexer) (str | io::EOF | error) = {
+fn scancomment(lex: *lexer) (str | error) = {
 	// Collects the text of a parenthesised comment into lex.strbuf and
 	// returns it. lex() calls this after it has already consumed the
 	// opening '(' so scanning starts at the first rune of the body.
 	//
 	// Errors: unterminated ("comment" plus lex.loc) if the input ends
 	// before ')' is seen; invalid (lex.loc) if another '(' is found.
 	//
 	// NOTE(review): the result comes from memio::string on lex.strbuf,
 	// which every scan resets - presumably the string is only valid
 	// until the next scan; confirm before retaining it.
 	memio::reset(&lex.strbuf);
 	for (true) {
 		const rn = match (nextrune(lex)?) {
 		case let rn: rune =>
 			yield rn;
 		case io::EOF =>
 			return ("comment", lex.loc.0, lex.loc.1): unterminated;
 		};
 		switch (rn) {
 		case '(' =>
 			// A '(' inside a comment is rejected rather than nested.
 			return lex.loc: invalid;
 		case ')' =>
 			// Closing paren: stop scanning; the ')' is not stored.
 			break;
 		case =>
 			memio::appendrune(&lex.strbuf, rn)!;
 		};
 	};
 	return memio::string(&lex.strbuf)!;
 };
 fn scanstr(lex: *lexer) (str | error) = {
 	memio::reset(&lex.strbuf);
 	for (true) {
 		const rn = match (nextrune(lex)?) {
 		case let rn: rune =>
 			yield rn;
 		case io::EOF =>
 			return ("string literal", lex.loc.0, lex.loc.1): unterminated;
 		};
 		switch (rn) {
@ -165,6 +197,22 @@ fn scanstr(lex: *lexer) (str | io::EOF | error) = {
 	return memio::string(&lex.strbuf)!;
 };
 // Reads the rune of a character literal. lex() calls this after consuming
 // the leading '\''. A backslash hands off to scanescape to decode the
 // escape; any other rune is returned unchanged. This function itself never
 // reads a closing quote.
 //
 // Returns unterminated ("character literal" plus lex.loc) when the input
 // ends right after the opening quote.
 fn scanchar(lex: *lexer) (rune | error) = {
 	const rn = match (nextrune(lex)?) {
 	case let rn: rune =>
 		yield rn;
 	case io::EOF =>
 		return ("character literal", lex.loc.0, lex.loc.1): unterminated;
 	};
 	switch (rn) {
 	case '\\' =>
 		// Escape sequence: the rune(s) after the backslash decide the value.
 		return scanescape(lex)?;
 	case =>
 		return rn;
 	};
 };
 fn scanescape(lex: *lexer) (rune | error) = {
 	const rn = match (nextrune(lex)?) {
 	case let rn: rune =>
@ -178,79 +226,110 @@ fn scanescape(lex: *lexer) (rune | error) = {
 		return '"';
 	case '\\' =>
 		return '\\';
-	case '\n' =>
+	case 'n' =>
 		return '\n';
 	case 't' =>
 		return '\t';
 	case 's' =>
 		return ' ';
 	case =>
 		return lex.loc: invalid;
 	};
 };
-// Tests! :)
+fn isspace(rn: rune) bool = {
 	// Whitespace test used by the lexer: true when ascii::isspace accepts
 	// the rune, or when its Unicode general category is Zs (space
 	// separator); false for everything else.
 	if (ascii::isspace(rn)) {
 		return true;
 	} else {
 		// Not ASCII whitespace: fall back to the Unicode category lookup.
 		switch (unicode::rune_gc(rn)) {
 		case unicode::gc::Zs =>
 			return true;
 		case =>
 			return false;
 		};
 	};
 };
-fn tnext(lex: *lexer) token = {
+def delimiters = `()[]{}\:'`;
-	match (next(lex)!) {
+fn isdelimiter(rn: rune) bool = {
-	case let t: token =>
+	match (strings::index(delimiters, rn)) {
-		return t;
+	case size =>
 		return true;
 	case =>
-		assert(false);
+		return false;
 		return word { v = "" };
 	};
 };
-@test fn test_next() void = {
+@test fn lex() void = {
-	let lex = newlexer(&memio::fixed(
+	const cases: [_](str, []token) = [
-		strings::toutf8("\"hello\" \\greeting def")),
+		(
-		"<string>");
+			`"hello" \greeting def`,
-	defer close(&lex);
+			[
 				"hello",
 				mksym("greeting"),
 				mkword("def"),
 			]
 		),
 		(
 			`[dup *] (a -- a) \square def`,
 			[
 				quotstart,
 				mkword("dup"),
 				mkword("*"),
 				quotend,
 				mkcomment("a -- a"),
 				mksym("square"),
 				mkword("def"),
 			]
 		),
 		(`'\s`, [' '])
 	];
-	let tk = tnext(&lex);
+	for (let i = 0z; i < len(cases); i += 1) {
-	assert(tk is str && tk: str == "hello");
+		const src = strings::toutf8(cases[i].0);
-	let tk = tnext(&lex);
+		const src = memio::fixed(src);
-	assert(tk is punctuation && tk: punctuation == punctuation::BACKSLASH);
+		const lexer = newlexer(&src, "<string>");
-	let tk = tnext(&lex);
+		defer close(&lexer);
-	assert(tk is word && (tk: word).v == "greeting");
+
-	let tk = tnext(&lex);
+		for (let j = 0z; j < len(cases[i].1); j += 1) {
-	assert(tk is word && (tk: word).v == "def");
+			const want = cases[i].1[j];
 			const have = lex(&lexer)! as token;
 			assert(tokeq(want, have));
 		};
 		assert(lex(&lexer) is io::EOF);
 	};
 };
-@test fn test_nextrune() void = {
+fn tokeq(have: token, want: token) bool = {
-	let lex = newlexer(&memio::fixed(strings::toutf8("a\nb")),
+	match (want) {
-		"<string>");
+	case quotstart =>
-	defer close(&lex);
+		return have is quotstart;
-
+	case quotend =>
-	assert(nextrune(&lex)! == 'a');
+		return have is quotend;
-	assert(nextrune(&lex)! == '\n');
+	case mapstart =>
-	assert(nextrune(&lex)! == 'b');
+		return have is mapstart;
-	assert(lex.loc.0 == 2u && lex.loc.1 == 1u);
+	case mapend =>
-};
+		return have is mapend;
-
+	case let w: word =>
-@test fn test_nextrunews() void = {
+		return (have as word).v == w.v;
 	let lex = newlexer(&memio::fixed(strings::toutf8("\n a")),
 		"<string>");
 	defer close(&lex);
 	assert(nextrunews(&lex)! == 'a');
 	assert(lex.loc.0 == 2u && lex.loc.1 == 2u);
 };
@test fn test_scanword() void = {
 	let lex = newlexer(&memio::fixed(strings::toutf8("string->number .")),
 		"<string>");
 	defer close(&lex);
 	assert(scanword(&lex)! == "string->number");
 };
@test fn test_scanstr() void = {
 	let lex = newlexer(&memio::fixed(strings::toutf8("\"\\\\hello\\\"world!\\\n\"")),
 		"<string>");
 	defer close(&lex);
 	assert(nextrune(&lex)! == '"');
 	match (scanstr(&lex)!) {
 	case io::EOF =>
 		assert(false);
 	case let s: str =>
-		assert(s == "\\hello\"world!\n");
+		return have as str == s;
 	case let s: symbol =>
 		return (have as symbol).v == s.v;
 	case let c: comment =>
 		return (have as comment).v == c.v;
 	case let r: rune =>
 		return have as rune == r;
 	};
 };
 // Builds a word token holding v; used to write expected tokens in the
 // test table.
 fn mkword(v: const str) word =
 	word{ v = v };
 // Builds a comment token holding v; used to write expected tokens in the
 // test table.
 fn mkcomment(v: const str) comment =
 	comment{ v = v };
 // Builds a symbol token holding v. kw defaults to false; pass true for a
 // symbol that lex() would produce from a ':' prefix.
 fn mksym(v: const str, kw: bool = false) symbol =
 	symbol{ v = v, kw = kw };
--- a/types.ha
+++ b/types.ha
@ -1,17 +1,21 @@
 use io;
 use fmt;
-export type invalid = !(uint, uint);
+export type invalid      = !(uint, uint);
-export type error = !(invalid | io::error);
+export type unterminated = !(const str, uint, uint);
 export type error        = !(invalid | unterminated | io::error);
-export type punctuation = enum uint {
+export type quotstart = void;
-	LEFT_PAREN, RIGHT_PAREN,
+export type quotend   = void;
-	LEFT_SQUARE_BRACKET, RIGHT_SQUARE_BRACKET,
+export type mapstart  = void;
-	LEFT_CURLY_BRACKET, RIGHT_CURLY_BRACKET,
+export type mapend    = void;
-	BACKSLASH, COLON,
+
-};
+export type comment = struct { v: str };
-export type word = struct { v: str };
+export type word    = struct { v: str };
-export type token = (punctuation | word | str);
+export type symbol  = struct { v: str, kw: bool };
 export type token = (quotstart | quotend | mapstart | mapend |
 	word | symbol | comment | str | rune);
 export fn strerror(err: error) const str = {
 	static let buf: [64]u8 = [0...];
@ -19,6 +23,9 @@ export fn strerror(err: error) const str = {
 	case let err: invalid =>
 		return fmt::bsprintf(buf,
 			"{}:{}: Invalid token found", err.0, err.1);
 	case let err: unterminated =>
 		return fmt::bsprintf(buf,
 			"{}:{}: Unterminated {} found", err.1, err.2, err.0);
 	case let err: io::error =>
 		return io::strerror(err);
 	};