fhdskjgfdsjglfdskjgfd lexer

This commit is contained in:
Lobo Torres 2024-12-04 14:56:08 -03:00
parent 63fe8290a5
commit 696d4910b1
2 changed files with 177 additions and 91 deletions

241
lex.ha
View file

@ -1,10 +1,12 @@
use ascii; // TODO: maybe use unicode? use ascii;
use bufio; use bufio;
use encoding::utf8; use encoding::utf8;
use fmt;
use io; use io;
use memio; use memio;
use os; use unicode;
// Testing dependency
use fmt;
use strings; use strings;
// my cod prob sux :( // my cod prob sux :(
@ -33,7 +35,7 @@ export fn close(lex: *lexer) void = {
io::close(&lex.strbuf)!; io::close(&lex.strbuf)!;
}; };
export fn next(lex: *lexer) (token | io::EOF | error) = { export fn lex(lex: *lexer) (token | io::EOF | error) = {
const rn = match (nextrunews(lex)?) { const rn = match (nextrunews(lex)?) {
case io::EOF => case io::EOF =>
return io::EOF; return io::EOF;
@ -43,28 +45,35 @@ export fn next(lex: *lexer) (token | io::EOF | error) = {
switch (rn) { switch (rn) {
case '(' => case '(' =>
return punctuation::LEFT_PAREN: token; return comment{ v = scancomment(lex)? };
case ')' => case ')' =>
return punctuation::RIGHT_PAREN: token; return lex.loc: invalid;
case '[' => case '[' =>
return punctuation::LEFT_SQUARE_BRACKET: token; return quotstart;
case ']' => case ']' =>
return punctuation::RIGHT_SQUARE_BRACKET: token; return quotend;
case '{' => case '{' =>
return punctuation::LEFT_CURLY_BRACKET: token; return mapstart;
case '}' => case '}' =>
return punctuation::RIGHT_CURLY_BRACKET: token; return mapend;
case '\\' => case '\\' =>
return punctuation::BACKSLASH: token; let v = scanword(lex)?;
case ':' => if (len(v) == 0) {
return punctuation::COLON: token; return lex.loc: invalid;
case '"' => } else {
match (scanstr(lex)?) { return symbol{ v = v, kw = false };
case let s: str =>
return s;
case io::EOF =>
return io::EOF;
}; };
case ':' =>
let v = scanword(lex)?;
if (len(v) == 0) {
return lex.loc: invalid;
} else {
return symbol{ v = v, kw = true };
};
case '\'' =>
return scanchar(lex)?;
case '"' =>
return scanstr(lex)?;
case => case =>
yield; yield;
}; };
@ -110,7 +119,7 @@ fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
for (true) { for (true) {
match (nextrune(lex)?) { match (nextrune(lex)?) {
case let rn: rune => case let rn: rune =>
if (ascii::isspace(rn)) { if (isspace(rn)) {
continue; continue;
}; };
return rn; return rn;
@ -135,7 +144,7 @@ fn scanword(lex: *lexer) (str | error) = {
case io::EOF => case io::EOF =>
break; break;
}; };
if (ascii::isspace(rn)) { if (isspace(rn) || isdelimiter(rn)) {
unget(lex, rn); unget(lex, rn);
break; break;
}; };
@ -144,14 +153,37 @@ fn scanword(lex: *lexer) (str | error) = {
return memio::string(&lex.strbuf)!; return memio::string(&lex.strbuf)!;
}; };
fn scanstr(lex: *lexer) (str | io::EOF | error) = { fn scancomment(lex: *lexer) (str | error) = {
memio::reset(&lex.strbuf); memio::reset(&lex.strbuf);
for (true) { for (true) {
const rn = match (nextrune(lex)?) { const rn = match (nextrune(lex)?) {
case let rn: rune => case let rn: rune =>
yield rn; yield rn;
case io::EOF => case io::EOF =>
return ("comment", lex.loc.0, lex.loc.1): unterminated;
};
switch (rn) {
case '(' =>
return lex.loc: invalid; return lex.loc: invalid;
case ')' =>
break;
case =>
memio::appendrune(&lex.strbuf, rn)!;
};
};
return memio::string(&lex.strbuf)!;
};
fn scanstr(lex: *lexer) (str | error) = {
memio::reset(&lex.strbuf);
for (true) {
const rn = match (nextrune(lex)?) {
case let rn: rune =>
yield rn;
case io::EOF =>
return ("string literal", lex.loc.0, lex.loc.1): unterminated;
}; };
switch (rn) { switch (rn) {
@ -165,6 +197,22 @@ fn scanstr(lex: *lexer) (str | io::EOF | error) = {
return memio::string(&lex.strbuf)!; return memio::string(&lex.strbuf)!;
}; };
// Scans a character literal. Reads the next rune from the lexer: a backslash
// hands off to scanescape to decode the escape sequence, any other rune is
// returned unchanged. This function does not look for a closing apostrophe.
//
// Reaching end of input before any rune is read is reported as an
// unterminated "character literal" carrying the lexer's current location.
fn scanchar(lex: *lexer) (rune | error) = {
	match (nextrune(lex)?) {
	case io::EOF =>
		return ("character literal", lex.loc.0, lex.loc.1): unterminated;
	case let first: rune =>
		if (first == '\\') {
			return scanescape(lex)?;
		};
		return first;
	};
};
fn scanescape(lex: *lexer) (rune | error) = { fn scanescape(lex: *lexer) (rune | error) = {
const rn = match (nextrune(lex)?) { const rn = match (nextrune(lex)?) {
case let rn: rune => case let rn: rune =>
@ -178,79 +226,110 @@ fn scanescape(lex: *lexer) (rune | error) = {
return '"'; return '"';
case '\\' => case '\\' =>
return '\\'; return '\\';
case '\n' => case 'n' =>
return '\n'; return '\n';
case 't' =>
return '\t';
case 's' =>
return ' ';
case => case =>
return lex.loc: invalid; return lex.loc: invalid;
}; };
}; };
// Tests! :) fn isspace(rn: rune) bool = {
if (ascii::isspace(rn)) {
return true;
} else {
switch (unicode::rune_gc(rn)) {
case unicode::gc::Zs =>
return true;
case =>
return false;
};
};
};
fn tnext(lex: *lexer) token = { def delimiters = `()[]{}\:'`;
match (next(lex)!) { fn isdelimiter(rn: rune) bool = {
case let t: token => match (strings::index(delimiters, rn)) {
return t; case size =>
return true;
case => case =>
assert(false); return false;
return word { v = "" };
}; };
}; };
@test fn test_next() void = { @test fn lex() void = {
let lex = newlexer(&memio::fixed( const cases: [_](str, []token) = [
strings::toutf8("\"hello\" \\greeting def")), (
"<string>"); `"hello" \greeting def`,
defer close(&lex); [
"hello",
mksym("greeting"),
mkword("def"),
]
),
(
`[dup *] (a -- a) \square def`,
[
quotstart,
mkword("dup"),
mkword("*"),
quotend,
mkcomment("a -- a"),
mksym("square"),
mkword("def"),
]
),
(`'\s`, [' '])
];
let tk = tnext(&lex); for (let i = 0z; i < len(cases); i += 1) {
assert(tk is str && tk: str == "hello"); const src = strings::toutf8(cases[i].0);
let tk = tnext(&lex); const src = memio::fixed(src);
assert(tk is punctuation && tk: punctuation == punctuation::BACKSLASH); const lexer = newlexer(&src, "<string>");
let tk = tnext(&lex); defer close(&lexer);
assert(tk is word && (tk: word).v == "greeting");
let tk = tnext(&lex); for (let j = 0z; j < len(cases[i].1); j += 1) {
assert(tk is word && (tk: word).v == "def"); const want = cases[i].1[j];
const have = lex(&lexer)! as token;
assert(tokeq(want, have));
};
assert(lex(&lexer) is io::EOF);
};
}; };
@test fn test_nextrune() void = { fn tokeq(have: token, want: token) bool = {
let lex = newlexer(&memio::fixed(strings::toutf8("a\nb")), match (want) {
"<string>"); case quotstart =>
defer close(&lex); return have is quotstart;
case quotend =>
assert(nextrune(&lex)! == 'a'); return have is quotend;
assert(nextrune(&lex)! == '\n'); case mapstart =>
assert(nextrune(&lex)! == 'b'); return have is mapstart;
assert(lex.loc.0 == 2u && lex.loc.1 == 1u); case mapend =>
}; return have is mapend;
case let w: word =>
@test fn test_nextrunews() void = { return (have as word).v == w.v;
let lex = newlexer(&memio::fixed(strings::toutf8("\n a")),
"<string>");
defer close(&lex);
assert(nextrunews(&lex)! == 'a');
assert(lex.loc.0 == 2u && lex.loc.1 == 2u);
};
@test fn test_scanword() void = {
let lex = newlexer(&memio::fixed(strings::toutf8("string->number .")),
"<string>");
defer close(&lex);
assert(scanword(&lex)! == "string->number");
};
@test fn test_scanstr() void = {
let lex = newlexer(&memio::fixed(strings::toutf8("\"\\\\hello\\\"world!\\\n\"")),
"<string>");
defer close(&lex);
assert(nextrune(&lex)! == '"');
match (scanstr(&lex)!) {
case io::EOF =>
assert(false);
case let s: str => case let s: str =>
assert(s == "\\hello\"world!\n"); return have as str == s;
case let s: symbol =>
return (have as symbol).v == s.v;
case let c: comment =>
return (have as comment).v == c.v;
case let r: rune =>
return have as rune == r;
}; };
}; };
// Builds a word token whose value is v.
fn mkword(v: const str) word = {
	return word { v = v };
};
// Builds a comment token whose text is v.
fn mkcomment(v: const str) comment = {
	return comment { v = v };
};
// Builds a symbol token named v. The kw flag is stored as given and defaults
// to false when the caller omits it.
fn mksym(v: const str, kw: bool = false) symbol = {
	return symbol {
		v = v,
		kw = kw,
	};
};

View file

@ -1,17 +1,21 @@
use io; use io;
use fmt; use fmt;
export type invalid = !(uint, uint); export type invalid = !(uint, uint);
export type error = !(invalid | io::error); export type unterminated = !(const str, uint, uint);
export type error = !(invalid | unterminated | io::error);
export type punctuation = enum uint { export type quotstart = void;
LEFT_PAREN, RIGHT_PAREN, export type quotend = void;
LEFT_SQUARE_BRACKET, RIGHT_SQUARE_BRACKET, export type mapstart = void;
LEFT_CURLY_BRACKET, RIGHT_CURLY_BRACKET, export type mapend = void;
BACKSLASH, COLON,
}; export type comment = struct { v: str };
export type word = struct { v: str }; export type word = struct { v: str };
export type token = (punctuation | word | str); export type symbol = struct { v: str, kw: bool };
export type token = (quotstart | quotend | mapstart | mapend |
word | symbol | comment | str | rune);
export fn strerror(err: error) const str = { export fn strerror(err: error) const str = {
static let buf: [64]u8 = [0...]; static let buf: [64]u8 = [0...];
@ -19,6 +23,9 @@ export fn strerror(err: error) const str = {
case let err: invalid => case let err: invalid =>
return fmt::bsprintf(buf, return fmt::bsprintf(buf,
"{}:{}: Invalid token found", err.0, err.1); "{}:{}: Invalid token found", err.0, err.1);
case let err: unterminated =>
return fmt::bsprintf(buf,
"{}:{}: Unterminated {} found", err.1, err.2, err.0);
case let err: io::error => case let err: io::error =>
return io::strerror(err); return io::strerror(err);
}; };