kojote/lex.ha

use ascii;
use bufio;
use encoding::utf8;
use io;
use memio;
use unicode;

// Originally pulled in for testing; strings is also used by isdelimiter.
use fmt;
use strings;

// my cod prob sux :(

// State for tokenizing a single input stream.
export type lexer = struct {
	// Source of runes; read via bufio::read_rune in nextrune.
	in: io::handle,
	// Scratch buffer that the scan functions reset and fill; the strings
	// they return are taken from this buffer.
	strbuf: memio::stream,
	// Path label supplied to newlexer.
	path: str,
	// Current (line, column). newlexer starts it at (1, 0); nextrune bumps
	// the line and zeroes the column on '\n', otherwise bumps the column.
	loc: (uint, uint),
	// Value of loc before the most recent nextrune; restored by unget.
	prevloc: (uint, uint),
	// A single pushed-back rune, or void when nothing is pending.
	unread: (rune | void),
};

// Creates a lexer that reads runes from "in". The location starts at line 1,
// column 0, and no rune is pending. The internal string buffer is allocated
// with memio::dynamic; pass the lexer to [[close]] to release it.
export fn newlexer(in: io::handle, path: str) lexer = lexer {
	in = in,
	path = path,
	strbuf = memio::dynamic(),
	loc = (1, 0),
	prevloc = (0, 0),
	unread = void,
};

// Closes the lexer's internal string buffer. Aborts if closing fails. The
// input handle is not touched.
export fn close(lex: *lexer) void =
	io::close(&lex.strbuf)!;

// Reads the next token from the lexer's input, skipping leading whitespace.
// Returns io::EOF when the input is exhausted before a token starts.
//
// String payloads (word, symbol, comment, string literal) come from the
// lexer's shared string buffer, which every scan function resets before use.
export fn lex(lex: *lexer) (token | io::EOF | error) = {
	const first = match (nextrunews(lex)?) {
	case let r: rune =>
		yield r;
	case io::EOF =>
		return io::EOF;
	};

	switch (first) {
	case '[' =>
		return quotstart;
	case ']' =>
		return quotend;
	case '{' =>
		return mapstart;
	case '}' =>
		return mapend;
	case '(' =>
		return comment{ v = scancomment(lex)? };
	case ')' =>
		// A closing paren with no open comment.
		return lex.loc: invalid;
	case '\'' =>
		return scanchar(lex)?;
	case '"' =>
		return scanstr(lex)?;
	case '\\', ':' =>
		// Both sigils introduce a symbol; ':' marks it as a keyword.
		const name = scanword(lex)?;
		if (len(name) == 0) {
			return lex.loc: invalid;
		};
		return symbol{ v = name, kw = first == ':' };
	case =>
		// Anything else starts a word; push the rune back so scanword
		// sees it as the word's first rune.
		unget(lex, first);
		return word{ v = scanword(lex)? };
	};
};

// Returns the next rune, preferring a rune pushed back by unget over reading
// from the input. On success the previous location is saved and the current
// location advances: a '\n' moves to the next line at column 0, any other
// rune moves one column forward. EOF, I/O errors and invalid UTF-8 leave the
// location untouched.
fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
	const rn = match (lex.unread) {
	case let pending: rune =>
		lex.unread = void;
		yield pending;
	case void =>
		yield match (bufio::read_rune(lex.in)) {
		case let fresh: rune =>
			yield fresh;
		case io::EOF =>
			return io::EOF;
		case let err: io::error =>
			return err;
		case utf8::invalid =>
			return lex.loc: invalid;
		};
	};

	lex.prevloc = lex.loc;
	if (rn == '\n') {
		lex.loc = (lex.loc.0 + 1, 0);
	} else {
		lex.loc.1 += 1;
	};
	return rn;
};

// Returns the next rune for which isspace is false, or io::EOF if the input
// ends first.
fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
	for (true) {
		const rn = match (nextrune(lex)?) {
		case io::EOF =>
			return io::EOF;
		case let r: rune =>
			yield r;
		};
		if (!isspace(rn)) {
			return rn;
		};
	};
};

// Pushes a rune back so the next nextrune call returns it again, and rewinds
// the location to where it was before that rune was read. Only one rune may
// be pending at a time; a second unget without an intervening nextrune fails
// the assertion.
fn unget(lex: *lexer, rn: rune) void = {
	assert(lex.unread is void);
	lex.loc = lex.prevloc;
	lex.unread = rn;
};

// Collects runes into the lexer's string buffer until whitespace, a delimiter
// or EOF is reached. The terminating rune is pushed back, not consumed. The
// result may be empty, and it is taken from the shared buffer that the next
// scan resets.
fn scanword(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		match (nextrune(lex)?) {
		case io::EOF =>
			break;
		case let rn: rune =>
			if (isspace(rn) || isdelimiter(rn)) {
				unget(lex, rn);
				break;
			};
			memio::appendrune(&lex.strbuf, rn)!;
		};
	};
	return memio::string(&lex.strbuf)!;
};

// Scans the body of a comment, assuming the opening '(' has already been
// consumed. Returns the text up to, but not including, the closing ')'.
// A nested '(' is reported as invalid; EOF before ')' is reported as an
// unterminated comment.
fn scancomment(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		match (nextrune(lex)?) {
		case io::EOF =>
			return ("comment", lex.loc.0, lex.loc.1): unterminated;
		case let rn: rune =>
			if (rn == ')') {
				break;
			};
			if (rn == '(') {
				return lex.loc: invalid;
			};
			memio::appendrune(&lex.strbuf, rn)!;
		};
	};

	return memio::string(&lex.strbuf)!;
};

// Scans the body of a string literal, assuming the opening '"' has already
// been consumed. Backslash escapes are decoded via scanescape. Returns the
// decoded text without the closing '"'; EOF before the closing quote is
// reported as an unterminated string literal.
fn scanstr(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		match (nextrune(lex)?) {
		case io::EOF =>
			return ("string literal", lex.loc.0, lex.loc.1): unterminated;
		case let rn: rune =>
			if (rn == '"') {
				break;
			};
			const decoded = if (rn == '\\') scanescape(lex)? else rn;
			memio::appendrune(&lex.strbuf, decoded)!;
		};
	};
	return memio::string(&lex.strbuf)!;
};

// Scans a character literal, assuming the leading "'" has already been
// consumed. The literal is the single rune that follows, or a backslash
// escape decoded via scanescape; no closing quote is read. EOF is reported as
// an unterminated character literal.
fn scanchar(lex: *lexer) (rune | error) = {
	match (nextrune(lex)?) {
	case io::EOF =>
		return ("character literal", lex.loc.0, lex.loc.1): unterminated;
	case let rn: rune =>
		if (rn == '\\') {
			return scanescape(lex)?;
		};
		return rn;
	};
};

// Decodes the rune following a backslash. Recognized escapes are \" (double
// quote), \\ (backslash), \n (newline), \t (tab) and \s (space). Any other
// rune, or EOF, is reported as invalid at the current location.
fn scanescape(lex: *lexer) (rune | error) = {
	const code = match (nextrune(lex)?) {
	case io::EOF =>
		return lex.loc: invalid;
	case let r: rune =>
		yield r;
	};

	const decoded: rune = switch (code) {
	case '"', '\\' =>
		// These two escapes stand for themselves.
		yield code;
	case 'n' =>
		yield '\n';
	case 't' =>
		yield '\t';
	case 's' =>
		yield ' ';
	case =>
		return lex.loc: invalid;
	};
	return decoded;
};

// Reports whether a rune counts as whitespace for the lexer: either ASCII
// whitespace, or a rune whose Unicode general category is Zs.
fn isspace(rn: rune) bool =
	ascii::isspace(rn) || unicode::rune_gc(rn) == unicode::gc::Zs;

// Runes that terminate a word even without surrounding whitespace.
def delimiters = `()[]{}\:'`;

// Reports whether a rune is one of the runes listed in delimiters.
fn isdelimiter(rn: rune) bool =
	strings::index(delimiters, rn) is size;

// Table-driven lexer test: each case pairs a source string with the exact
// sequence of tokens it must produce, after which the lexer must report EOF.
@test fn lex() void = {
	const cases: [_](str, []token) = [
		(
			`"hello" \greeting def`,
			[
				"hello",
				mksym("greeting"),
				mkword("def"),
			]
		),
		(
			`[dup *] (a -- a) \square def`,
			[
				quotstart,
				mkword("dup"),
				mkword("*"),
				quotend,
				mkcomment("a -- a"),
				mksym("square"),
				mkword("def"),
			]
		),
		(`'\s`, [' ']),
		// Map delimiters, a keyword symbol and an unescaped character.
		(
			`{ :name 'x }`,
			[
				mapstart,
				mksym("name", true),
				'x',
				mapend,
			]
		),
	];

	for (let i = 0z; i < len(cases); i += 1) {
		const src = strings::toutf8(cases[i].0);
		const src = memio::fixed(src);
		const lexer = newlexer(&src, "<string>");
		defer close(&lexer);

		for (let j = 0z; j < len(cases[i].1); j += 1) {
			const want = cases[i].1[j];
			// Tokens borrow the lexer's string buffer, so compare before
			// the next call to lex.
			const have = lex(&lexer)! as token;
			assert(tokeq(have, want));
		};

		assert(lex(&lexer) is io::EOF);
	};
};

// Reports whether two tokens are of the same kind and carry equal payloads.
// A kind mismatch yields false rather than aborting on a failed type
// assertion, and symbols must also agree on their keyword flag.
fn tokeq(have: token, want: token) bool = {
	match (want) {
	case quotstart =>
		return have is quotstart;
	case quotend =>
		return have is quotend;
	case mapstart =>
		return have is mapstart;
	case mapend =>
		return have is mapend;
	case let w: word =>
		return have is word && (have as word).v == w.v;
	case let s: str =>
		return have is str && (have as str) == s;
	case let s: symbol =>
		match (have) {
		case let h: symbol =>
			return h.v == s.v && h.kw == s.kw;
		case =>
			return false;
		};
	case let c: comment =>
		return have is comment && (have as comment).v == c.v;
	case let r: rune =>
		return have is rune && (have as rune) == r;
	};
};

// Test helper: builds a word token with the given text.
fn mkword(v: const str) word = {
	return word{ v = v };
};

// Test helper: builds a comment token with the given text.
fn mkcomment(v: const str) comment = {
	return comment{ v = v };
};

// Test helper: builds a symbol token with the given name. kw defaults to
// false and is stored in the symbol's keyword flag.
fn mksym(v: const str, kw: bool = false) symbol = {
	return symbol{ v = v, kw = kw };
};