finish first implementation of lexer

This commit is contained in:
Lobo Torres 2024-12-04 13:29:11 -03:00
parent 3efdaf7ade
commit 57979aa6fc
3 changed files with 196 additions and 24 deletions

4
Makefile Normal file
View file

@ -0,0 +1,4 @@
# Runs the project's tests by invoking `hare test`. The target is declared
# phony because it does not produce a file named "test"; the leading @ keeps
# make from echoing the command.
.PHONY: test
test:
@hare test

194
lex.ha
View file

@ -1,11 +1,11 @@
use ascii; // TODO: maybe use unicode?
use bufio; use bufio;
use encoding::utf8;
use fmt; use fmt;
use io; use io;
use memio; use memio;
use os; use os;
use encoding::utf8;
use strings; use strings;
use ascii;
// my cod prob sux :( // my cod prob sux :(
@ -33,32 +33,90 @@ export fn close(lex: *lexer) void = {
io::close(&lex.strbuf)!; io::close(&lex.strbuf)!;
}; };
fn updateloc(lex: *lexer, rn: rune) void = { export fn next(lex: *lexer) (token | io::EOF | error) = {
if (rn == '\n') { const rn = match (nextrunews(lex)?) {
lex.loc = (lex.loc.0 + 1, 0); case io::EOF =>
} else { return io::EOF;
lex.loc.1 += 1; case let rn: rune =>
yield rn;
}; };
switch (rn) {
case '(' =>
return punctuation::LEFT_PAREN: token;
case ')' =>
return punctuation::RIGHT_PAREN: token;
case '[' =>
return punctuation::LEFT_SQUARE_BRACKET: token;
case ']' =>
return punctuation::RIGHT_SQUARE_BRACKET: token;
case '{' =>
return punctuation::LEFT_CURLY_BRACKET: token;
case '}' =>
return punctuation::RIGHT_CURLY_BRACKET: token;
case '\\' =>
return punctuation::BACKSLASH: token;
case ':' =>
return punctuation::COLON: token;
case '"' =>
match (scanstr(lex)?) {
case let s: str =>
return s;
case io::EOF =>
return io::EOF;
};
case =>
yield;
};
unget(lex, rn);
return word{ v = scanword(lex)? };
}; };
fn nextrune(lex: *lexer) (rune | io::error | io::EOF | utf8::invalid) = { fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
match (lex.unread) { match (lex.unread) {
case let rn: rune => case let rn: rune =>
lex.prevloc = lex.loc; lex.prevloc = lex.loc;
lex.unread = void; lex.unread = void;
updateloc(lex, rn); if (rn == '\n') {
lex.loc = (lex.loc.0 + 1, 0);
} else {
lex.loc.1 += 1;
};
return rn; return rn;
case void => case void =>
yield; yield;
}; };
match (bufio::read_rune(lex.in)?) { match (bufio::read_rune(lex.in)) {
case let rn: rune => case let rn: rune =>
lex.prevloc = lex.loc; lex.prevloc = lex.loc;
updateloc(lex, rn); if (rn == '\n') {
lex.loc = (lex.loc.0 + 1, 0);
} else {
lex.loc.1 += 1;
};
return rn; return rn;
case io::EOF => case io::EOF =>
return io::EOF; return io::EOF;
case let err: io::error =>
return err;
case utf8::invalid =>
return lex.loc: invalid;
};
};
// Returns the next rune from the lexer that is not whitespace, discarding
// every whitespace rune read before it. Returns io::EOF if the input ends
// first; an error from nextrune is propagated by the ? operator.
fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
for (true) {
match (nextrune(lex)?) {
case let rn: rune =>
// Whitespace is decided by ascii::isspace. NOTE(review): presumably
// non-ASCII space characters are therefore not skipped; confirm against
// the ascii module.
if (ascii::isspace(rn)) {
continue;
};
return rn;
case io::EOF =>
return io::EOF;
};
}; };
}; };
@ -68,22 +126,93 @@ fn unget(lex: *lexer, rn: rune) void = {
lex.loc = lex.prevloc; lex.loc = lex.prevloc;
}; };
fn skipws(lex: *lexer) (void | io::EOF | io::error | utf8::invalid) = { fn scanword(lex: *lexer) (str | error) = {
memio::reset(&lex.strbuf);
for (true) { for (true) {
match (nextrune(lex)?) { const rn = match (nextrune(lex)?) {
case io::EOF => return io::EOF;
case let rn: rune => case let rn: rune =>
if (!ascii::isspace(rn)) { yield rn;
unget(lex, rn); case io::EOF =>
return; break;
};
}; };
if (ascii::isspace(rn)) {
unget(lex, rn);
break;
};
memio::appendrune(&lex.strbuf, rn)!;
};
return memio::string(&lex.strbuf)!;
};
// Scans the remainder of a string literal and returns its decoded contents.
// The opening quote must already have been consumed by the caller; scanning
// stops at the first unescaped closing quote, which is consumed but not
// included in the result. A backslash hands the following rune to
// scanescape.
//
// Reaching the end of input before the closing quote is reported as
// [[invalid]] at the current lexer location. io::EOF is part of the return
// type, but no path in this function returns it.
//
// The result is built in lex.strbuf, which is reset at the start of every
// call. NOTE(review): the returned string is presumably borrowed from that
// buffer, so it would not survive the next reset; confirm against memio.
fn scanstr(lex: *lexer) (str | io::EOF | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const c = match (nextrune(lex)?) {
		case io::EOF =>
			return lex.loc: invalid;
		case let c: rune =>
			yield c;
		};
		if (c == '"') {
			break;
		};
		if (c == '\\') {
			memio::appendrune(&lex.strbuf, scanescape(lex)?)!;
		} else {
			memio::appendrune(&lex.strbuf, c)!;
		};
	};
	return memio::string(&lex.strbuf)!;
};
// Decodes the rune that follows a backslash inside a string literal and
// returns the rune it stands for. The backslash itself must already have
// been consumed; scanstr calls this right after reading it.
//
// Accepted escapes: backslash-quote yields '"', backslash-backslash yields a
// single backslash, and a backslash followed by a literal newline yields a
// newline. Any other rune, and end of input, is reported as [[invalid]] at
// the current lexer location.
fn scanescape(lex: *lexer) (rune | error) = {
const rn = match (nextrune(lex)?) {
case let rn: rune =>
yield rn;
case io::EOF =>
return lex.loc: invalid;
};
switch (rn) {
case '"' =>
return '"';
case '\\' =>
return '\\';
// NOTE(review): this matches an actual newline rune, not the letter 'n', so
// the two-character escape "backslash n" falls through to the invalid case.
// test_scanstr exercises backslash-newline; confirm whether 'n' should be
// accepted as well.
case '\n' =>
return '\n';
case =>
return lex.loc: invalid;
}; };
}; };
// Tests! :) // Tests! :)
// Test helper: returns the next token from lex. The test is aborted if the
// lexer reports an error (through the ! on next) or if there is no token
// left to read.
fn tnext(lex: *lexer) token = {
	match (next(lex)!) {
	case let t: token =>
		return t;
	case =>
		// abort never returns, so no placeholder token has to be
		// fabricated to satisfy the return type, and the failure
		// says what went wrong.
		abort("tnext: expected a token but reached end of input");
	};
};
// Checks that next() yields, in order, a string literal, a backslash
// punctuation token and two whitespace-separated words.
@test fn test_next() void = {
	let lex = newlexer(
		&memio::fixed(strings::toutf8("\"hello\" \\greeting def")),
		"<string>");
	defer close(&lex);

	const first = tnext(&lex);
	assert(first is str);
	assert(first: str == "hello");

	const second = tnext(&lex);
	assert(second is punctuation);
	assert(second: punctuation == punctuation::BACKSLASH);

	const third = tnext(&lex);
	assert(third is word);
	assert((third: word).v == "greeting");

	const fourth = tnext(&lex);
	assert(fourth is word);
	assert((fourth: word).v == "def");
};
@test fn test_nextrune() void = { @test fn test_nextrune() void = {
let lex = newlexer(&memio::fixed(strings::toutf8("a\nb")), let lex = newlexer(&memio::fixed(strings::toutf8("a\nb")),
"<string>"); "<string>");
@ -95,12 +224,33 @@ fn skipws(lex: *lexer) (void | io::EOF | io::error | utf8::invalid) = {
assert(lex.loc.0 == 2u && lex.loc.1 == 1u); assert(lex.loc.0 == 2u && lex.loc.1 == 1u);
}; };
@test fn test_skipws() void = { @test fn test_nextrunews() void = {
let lex = newlexer(&memio::fixed(strings::toutf8("\n a")), let lex = newlexer(&memio::fixed(strings::toutf8("\n a")),
"<string>"); "<string>");
defer close(&lex); defer close(&lex);
skipws(&lex)!; assert(nextrunews(&lex)! == 'a');
assert(nextrune(&lex)! == 'a');
assert(lex.loc.0 == 2u && lex.loc.1 == 2u); assert(lex.loc.0 == 2u && lex.loc.1 == 2u);
}; };
// scanword must stop at the first whitespace rune, so only the leading
// "string->number" is returned from this input.
@test fn test_scanword() void = {
	const input = "string->number .";
	let lex = newlexer(&memio::fixed(strings::toutf8(input)), "<string>");
	defer close(&lex);

	const got = scanword(&lex)!;
	assert(got == "string->number");
};
// Checks that scanstr decodes an escaped backslash, an escaped quote and a
// backslash followed by a literal newline.
@test fn test_scanstr() void = {
	let lex = newlexer(
		&memio::fixed(strings::toutf8("\"\\\\hello\\\"world!\\\n\"")),
		"<string>");
	defer close(&lex);

	// scanstr starts after the opening quote, so consume it here first.
	assert(nextrune(&lex)! == '"');

	match (scanstr(&lex)!) {
	case let got: str =>
		assert(got == "\\hello\"world!\n");
	case io::EOF =>
		assert(false);
	};
};

View file

@ -1,7 +1,25 @@
export type punct = enum uint { use io;
use fmt;
// A lexing error carrying the lexer location at which it was detected. The
// lexer increments the first field on every newline (resetting the second to
// zero) and the second field on every other rune, so these are presumably
// the line and column; [[strerror]] prints them as "first:second".
export type invalid = !(uint, uint);
// Any error the lexer can return: an [[invalid]] token or an underlying
// [[io::error]].
export type error = !(invalid | io::error);
export type punctuation = enum uint {
LEFT_PAREN, RIGHT_PAREN, LEFT_PAREN, RIGHT_PAREN,
LEFT_SQUARE_BRACKET, RIGHT_SQUARE_BRACKET, LEFT_SQUARE_BRACKET, RIGHT_SQUARE_BRACKET,
LEFT_CURLY_BRACKET, RIGHT_CURLY_BRACKET, LEFT_CURLY_BRACKET, RIGHT_CURLY_BRACKET,
BACKSLASH, COLON, BACKSLASH, COLON,
}; };
export type token = (punct | str | f64 | bool); export type word = struct { v: str };
// A single lexical token: a punctuation mark, a bare word, or the decoded
// contents of a string literal (the plain str member).
export type token = (punctuation | word | str);
// Returns a human-readable description of a lexer [[error]]. I/O errors are
// delegated to [[io::strerror]]; an [[invalid]] error is rendered as
// "first:second: Invalid token found" from its two location fields.
//
// The message for an [[invalid]] error is formatted into a statically
// allocated buffer. NOTE(review): the returned string presumably aliases
// that buffer and is overwritten by the next call; confirm against
// fmt::bsprintf.
export fn strerror(err: error) const str = {
	static let msgbuf: [64]u8 = [0...];
	match (err) {
	case let ioerr: io::error =>
		return io::strerror(ioerr);
	case let pos: invalid =>
		return fmt::bsprintf(msgbuf,
			"{}:{}: Invalid token found", pos.0, pos.1);
	};
};