kojote/lex.ha

use ascii; // TODO: maybe use unicode?
use bufio;
use encoding::utf8;
use fmt;
use io;
use memio;
use os;
use strings;

// my cod prob sux :(

// State of a lexer that reads runes from an input handle and groups them into
// tokens. Create one with newlexer and dispose of it with close.
export type lexer = struct {
	// Handle the runes are read from (via bufio::read_rune).
	in: io::handle,
	// Scratch buffer in which scanword and scanstr assemble the text of the
	// token being scanned; each of them resets it before scanning.
	strbuf: memio::stream,
	// Label for the input, stored verbatim by newlexer.
	path: str,
	// Current location as (line, column). A '\n' moves to column 0 of the
	// next line; any other rune advances the column by one.
	loc: (uint, uint),
	// Value of loc before the most recently read rune; unget restores it.
	prevloc: (uint, uint),
	// The single rune pushed back by unget, or void when none is pending.
	unread: (rune | void),
};

// Creates a lexer that reads from in and is labelled with path. The location
// starts at line 1, column 0, the saved previous location is zeroed, and no
// rune is pending. A fresh dynamic memio stream is created as the token
// buffer; hand the lexer to close when done with it.
export fn newlexer(in: io::handle, path: str) lexer = lexer {
	in = in,
	strbuf = memio::dynamic(),
	path = path,
	loc = (1, 0),
	prevloc = (0, 0),
	unread = void,
};

// Closes the lexer's internal token buffer, aborting if that reports an error.
// The input handle is not closed here.
export fn close(lex: *lexer) void = io::close(&lex.strbuf)!;

// Reads the next token, skipping any leading ASCII whitespace. Returns io::EOF
// if the input ends before a token starts.
//
// A '"' starts a string literal, one of ( ) [ ] { } \ : is returned as
// punctuation, and any other rune starts a word that runs up to the next
// whitespace rune or the end of input. Punctuation is therefore only
// recognised as the first rune of a token.
//
// The text of string and word tokens comes from the lexer's shared buffer,
// which the next string or word scan resets; use or copy it before calling
// next again.
export fn next(lex: *lexer) (token | io::EOF | error) = {
	const first = match (nextrunews(lex)?) {
	case let r: rune =>
		yield r;
	case io::EOF =>
		return io::EOF;
	};

	// String literals are handled up front; the opening quote is consumed.
	if (first == '"') {
		match (scanstr(lex)?) {
		case let lit: str =>
			return lit;
		case io::EOF =>
			return io::EOF;
		};
	};

	// Classify single-rune punctuation; void means "not punctuation".
	const punct: (punctuation | void) = switch (first) {
	case '(' =>
		yield punctuation::LEFT_PAREN;
	case ')' =>
		yield punctuation::RIGHT_PAREN;
	case '[' =>
		yield punctuation::LEFT_SQUARE_BRACKET;
	case ']' =>
		yield punctuation::RIGHT_SQUARE_BRACKET;
	case '{' =>
		yield punctuation::LEFT_CURLY_BRACKET;
	case '}' =>
		yield punctuation::RIGHT_CURLY_BRACKET;
	case '\\' =>
		yield punctuation::BACKSLASH;
	case ':' =>
		yield punctuation::COLON;
	case =>
		yield void;
	};

	match (punct) {
	case let p: punctuation =>
		return p: token;
	case void =>
		// Start of a word: push the rune back so scanword sees it too.
		unget(lex, first);
		return word { v = scanword(lex)? };
	};
};

// Reads a single rune, taking a rune pushed back by unget in preference to the
// input handle. On success the current location is first saved in prevloc and
// then advanced: a '\n' moves to column 0 of the next line, any other rune
// advances the column by one.
//
// Returns io::EOF at the end of input, passes I/O errors through, and reports
// malformed UTF-8 as an invalid error carrying the current location; in all
// three cases the location is left unchanged.
fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
	const rn = match (lex.unread) {
	case let pending: rune =>
		lex.unread = void;
		yield pending;
	case void =>
		yield match (bufio::read_rune(lex.in)) {
		case let fresh: rune =>
			yield fresh;
		case io::EOF =>
			return io::EOF;
		case let err: io::error =>
			return err;
		case utf8::invalid =>
			return lex.loc: invalid;
		};
	};

	// Location bookkeeping is the same for pushed-back and fresh runes.
	lex.prevloc = lex.loc;
	if (rn == '\n') {
		lex.loc = (lex.loc.0 + 1, 0);
	} else {
		lex.loc.1 += 1;
	};
	return rn;
};

// Reads runes until one that is not ASCII whitespace turns up and returns it.
// Returns io::EOF if the input ends first; errors from nextrune are propagated.
fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
	for (true) {
		const rn = match (nextrune(lex)?) {
		case io::EOF =>
			return io::EOF;
		case let r: rune =>
			yield r;
		};
		if (!ascii::isspace(rn)) {
			return rn;
		};
	};
};

// Pushes rn back so that the next nextrune call returns it again, and restores
// the location saved by the most recent nextrune. Only one rune can be pending
// at a time: the assertion fails if a rune is already pushed back.
fn unget(lex: *lexer, rn: rune) void = {
	assert(lex.unread is void);
	lex.loc = lex.prevloc;
	lex.unread = rn;
};

// Scans a word: every rune up to, but not including, the next ASCII whitespace
// rune or the end of input. The terminating whitespace rune is pushed back.
//
// The result is the content of the lexer's shared buffer, which is reset at the
// start of every word or string scan.
fn scanword(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		match (nextrune(lex)?) {
		case io::EOF =>
			break;
		case let r: rune =>
			if (ascii::isspace(r)) {
				unget(lex, r);
				break;
			};
			memio::appendrune(&lex.strbuf, r)!;
		};
	};
	return memio::string(&lex.strbuf)!;
};

// Scans the body of a string literal whose opening '"' has already been
// consumed. Runes are collected until an unescaped '"'; a backslash hands the
// following rune to scanescape and the decoded rune is stored instead.
//
// Reaching the end of input before the closing quote is reported as an invalid
// error at the current location. No path in this function returns io::EOF,
// even though the result type allows it.
//
// The result is the content of the lexer's shared buffer, which is reset at the
// start of every word or string scan.
fn scanstr(lex: *lexer) (str | io::EOF | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const rn = match (nextrune(lex)?) {
		case io::EOF =>
			return lex.loc: invalid;
		case let r: rune =>
			yield r;
		};

		if (rn == '"') {
			break;
		};
		const decoded = if (rn == '\\') scanescape(lex)? else rn;
		memio::appendrune(&lex.strbuf, decoded)!;
	};
	return memio::string(&lex.strbuf)!;
};

// Decodes the rune that follows a backslash inside a string literal; the
// backslash itself must already have been consumed.
//
// Recognised escapes:
//	\"  double quote
//	\\  backslash
//	\n  newline (backslash followed by the letter 'n')
//
// A backslash followed by a literal line feed also decodes to a newline; this
// form is kept so that input accepted before 'n' was recognised still lexes
// the same way.
//
// Any other rune, or the end of input, is reported as an invalid error at the
// current location.
fn scanescape(lex: *lexer) (rune | error) = {
	const rn = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return lex.loc: invalid;
	};

	switch (rn) {
	case '"' =>
		return '"';
	case '\\' =>
		return '\\';
	case 'n', '\n' =>
		return '\n';
	case =>
		return lex.loc: invalid;
	};
};

// Tests! :)

// Test helper: returns the next token. Aborts if the lexer reports an error,
// and fails an assertion if anything other than a token (i.e. the end of
// input) comes back.
fn tnext(lex: *lexer) token = {
	const res = next(lex)!;
	if (res is token) {
		return res as token;
	};
	assert(false);
	// Not reached; only present to satisfy the return type.
	return word { v = "" };
};

// Checks that next yields a string, a backslash and two words in order.
@test fn test_next() void = {
	const src = strings::toutf8("\"hello\" \\greeting def");
	let lx = newlexer(&memio::fixed(src), "<string>");
	defer close(&lx);

	// Each token is inspected before the next one is fetched, because
	// string and word text lives in the lexer's shared buffer.
	const strtok = tnext(&lx);
	assert(strtok is str);
	assert((strtok as str) == "hello");

	const bslash = tnext(&lx);
	assert(bslash is punctuation);
	assert((bslash as punctuation) == punctuation::BACKSLASH);

	const word1 = tnext(&lx);
	assert(word1 is word);
	assert((word1 as word).v == "greeting");

	const word2 = tnext(&lx);
	assert(word2 is word);
	assert((word2 as word).v == "def");
};

// Checks that nextrune returns runes in order and tracks line and column.
@test fn test_nextrune() void = {
	const src = strings::toutf8("a\nb");
	let lx = newlexer(&memio::fixed(src), "<string>");
	defer close(&lx);

	assert(nextrune(&lx)! == 'a');
	assert(nextrune(&lx)! == '\n');
	assert(nextrune(&lx)! == 'b');

	// 'b' is the first rune on the second line.
	assert(lx.loc.0 == 2u);
	assert(lx.loc.1 == 1u);
};

// Checks that nextrunews skips leading whitespace while still advancing the
// location over the skipped runes.
@test fn test_nextrunews() void = {
	const src = strings::toutf8("\n a");
	let lx = newlexer(&memio::fixed(src), "<string>");
	defer close(&lx);

	assert(nextrunews(&lx)! == 'a');

	// Line feed, then a space and 'a' on line 2.
	assert(lx.loc.0 == 2u);
	assert(lx.loc.1 == 2u);
};

// Checks that scanword stops at the first whitespace rune and that punctuation
// runes inside a word are kept.
@test fn test_scanword() void = {
	const src = strings::toutf8("string->number .");
	let lx = newlexer(&memio::fixed(src), "<string>");
	defer close(&lx);

	const got = scanword(&lx)!;
	assert(got == "string->number");
};

// Checks that scanstr decodes an escaped backslash, an escaped quote and a
// backslash followed by a literal line feed.
@test fn test_scanstr() void = {
	const src = strings::toutf8("\"\\\\hello\\\"world!\\\n\"");
	let lx = newlexer(&memio::fixed(src), "<string>");
	defer close(&lx);

	// scanstr expects the opening quote to be consumed already.
	assert(nextrune(&lx)! == '"');
	match (scanstr(&lx)!) {
	case let got: str =>
		assert(got == "\\hello\"world!\n");
	case io::EOF =>
		assert(false);
	};
};