kojote/parse/lex.ha

use ascii;
use bufio;
use encoding::utf8;
use io;
use memio;
use unicode;
use strconv;

use fmt;
use strings;

// Table of the multi-character names recognized in `#\<name>` character
// literals, each paired with the rune it stands for. [[scanchar]] walks this
// table linearly looking for an exact name match. Note that "newline" and
// "linefeed" are two spellings of the same rune (U+000A).
def longcharnames: [_](str, rune) = [
	("nul", '\u0000'),
	("alarm", '\u0007'),
	("backspace", '\u0008'),
	("tab", '\u0009'),
	("newline", '\u000a'),
	("linefeed", '\u000a'),
	("vtab", '\u000b'),
	("page", '\u000c'),
	("return", '\u000d'),
	("esc", '\u001b'),
	("space", '\u0020'),
	("delete", '\u007f'),
];

// State of a lexer reading runes from an [[io::handle]]. Create one with
// [[newlexer]] and release it with [[close]].
export type lexer = struct {
	// Input the runes are read from (see nextrune).
	in: io::handle,
	// Scratch buffer the scan functions reset and then accumulate the text
	// of the current token into.
	strbuf: memio::stream,
	// Path given to [[newlexer]]; stored as-is, not read in this file section.
	path: str,
	// Current position. The first component is bumped on every '\n' and the
	// second is reset to 0 at that point and bumped on every other rune;
	// [[newlexer]] starts it at (1, 0).
	loc: (uint, uint),
	// Value of loc before the most recently consumed rune; unget restores
	// loc from it.
	prevloc: (uint, uint),
	// At most one pushed-back rune, handed out again by the next nextrune.
	unread: (rune | void),
};

// Creates a lexer that reads from 'in'. 'path' is recorded in the lexer
// unchanged. The position starts at (1, 0), no rune is pending, and a fresh
// dynamic scratch buffer is allocated; pass the result to [[close]] when done
// so that buffer is released.
export fn newlexer(in: io::handle, path: str) lexer = {
	const fresh = lexer {
		in = in,
		path = path,
		strbuf = memio::dynamic(),
		loc = (1, 0),
		// prevloc is left to the zero default provided by `...`.
		unread = void,
		...
	};
	return fresh;
};

// Releases the lexer's internal scratch buffer. The input handle passed to
// [[newlexer]] is not closed here. Aborts if closing the buffer fails.
export fn close(lex: *lexer) void = {
	io::close(&lex.strbuf)!;
};

// Reads the next token, skipping any whitespace in front of it. Returns
// [[io::EOF]] when the input ends before a token starts.
//
// The first non-space rune selects the token kind: '(' opens a comment,
// '[' ']' '{' '}' are the quotation and map brackets, '\' and ':' introduce
// a symbol (':' marks it as a keyword), '#' a pound literal and '"' a string.
// Anything else begins a plain word. A stray ')' or a symbol prefix with no
// name after it is reported as invalid at the current location.
//
// String payloads come from memio::string on the lexer's scratch buffer,
// which every scan function resets — presumably they are only valid until the
// next call; verify against callers.
export fn lex(lex: *lexer) (token | io::EOF | error) = {
	const first = match (nextrunews(lex)?) {
	case io::EOF =>
		return io::EOF;
	case let r: rune =>
		yield r;
	};

	switch (first) {
	case '[' =>
		return quotstart;
	case ']' =>
		return quotend;
	case '{' =>
		return mapstart;
	case '}' =>
		return mapend;
	case '(' =>
		return comment{ v = scancomment(lex)? };
	case ')' =>
		// Closing parenthesis with no comment open.
		return lex.loc: invalid;
	case '\\', ':' =>
		// Both prefixes share the same shape; only the keyword flag
		// differs.
		const name = scanword(lex)?;
		if (len(name) == 0) {
			return lex.loc: invalid;
		};
		return symbol{ v = name, kw = (first == ':') };
	case '#' =>
		return scanpound(lex)?;
	case '"' =>
		return scanstr(lex)?;
	case =>
		// Not a special prefix: push the rune back so that it becomes
		// the first rune of the word.
		unget(lex, first);
		return word{ v = scanword(lex)? };
	};
};

// Returns the next rune, preferring a rune pushed back with unget over the
// input handle, and advances the location past it. The location before the
// rune is remembered in prevloc so that one unget can restore it.
//
// Returns [[io::EOF]] at end of input, forwards I/O errors, and reports
// malformed UTF-8 as invalid at the current location. None of these three
// outcomes touch the location.
fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
	const rn: rune = match (lex.unread) {
	case let pending: rune =>
		lex.unread = void;
		yield pending;
	case void =>
		yield match (bufio::read_rune(lex.in)) {
		case let fresh: rune =>
			yield fresh;
		case io::EOF =>
			return io::EOF;
		case let err: io::error =>
			return err;
		case utf8::invalid =>
			return lex.loc: invalid;
		};
	};

	// Single place where the position is advanced, whichever source the
	// rune came from.
	lex.prevloc = lex.loc;
	if (rn != '\n') {
		lex.loc.1 += 1;
	} else {
		lex.loc = (lex.loc.0 + 1, 0);
	};
	return rn;
};

// Like nextrune, but discards runes for which isspace holds and returns the
// first rune that is not whitespace (or [[io::EOF]] if none is left).
fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
	for (true) {
		const candidate = match (nextrune(lex)?) {
		case io::EOF =>
			return io::EOF;
		case let r: rune =>
			yield r;
		};
		if (!isspace(candidate)) {
			return candidate;
		};
	};
};

// Pushes 'rn' back so that the next nextrune call returns it again, and rewinds
// the location to what it was before the rune was consumed. Only one rune can
// be pending at a time; a second unget without an intervening read trips the
// assertion.
fn unget(lex: *lexer, rn: rune) void = {
	assert(lex.unread is void);
	lex.loc = lex.prevloc;
	lex.unread = rn;
};

// Collects runes into the scratch buffer until whitespace, a delimiter or end
// of input is reached, and returns the collected text (possibly empty). The
// terminating rune is pushed back, not consumed.
fn scanword(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		match (nextrune(lex)?) {
		case io::EOF =>
			break;
		case let r: rune =>
			if (!isspace(r) && !isdelimiter(r)) {
				memio::appendrune(&lex.strbuf, r)!;
				continue;
			};
			// End of the word: leave the terminator for the caller.
			unget(lex, r);
			break;
		};
	};
	return memio::string(&lex.strbuf)!;
};

// Scans the body of a comment; the opening '(' is expected to have been
// consumed already. Returns the text up to, but not including, the closing
// ')'. A nested '(' is rejected as invalid, and hitting end of input first is
// reported as an unterminated comment.
fn scancomment(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		match (nextrune(lex)?) {
		case io::EOF =>
			return ("comment", lex.loc.0, lex.loc.1): unterminated;
		case let r: rune =>
			if (r == ')') {
				break;
			};
			if (r == '(') {
				return lex.loc: invalid;
			};
			memio::appendrune(&lex.strbuf, r)!;
		};
	};

	return memio::string(&lex.strbuf)!;
};

// Scans the body of a string literal; the opening '"' is expected to have been
// consumed already. Backslash escapes are decoded through scanescape. Returns
// the decoded contents without the closing '"', or an unterminated error if
// the input ends first.
fn scanstr(lex: *lexer) (str | error) = {
	memio::reset(&lex.strbuf);
	for (true) {
		const r = match (nextrune(lex)?) {
		case io::EOF =>
			return ("string literal", lex.loc.0, lex.loc.1): unterminated;
		case let r: rune =>
			yield r;
		};

		if (r == '"') {
			break;
		};
		// Either the decoded escape or the rune itself.
		const decoded = if (r == '\\') scanescape(lex)? else r;
		memio::appendrune(&lex.strbuf, decoded)!;
	};
	return memio::string(&lex.strbuf)!;
};

// Scans what follows a '#': 't' and 'f' give the booleans true and false, and
// a backslash starts a character literal handled by scanchar. Any other rune
// is invalid; end of input is an unterminated pound literal.
fn scanpound(lex: *lexer) (token | error) = {
	match (nextrune(lex)?) {
	case io::EOF =>
		return ("pound literal", lex.loc.0, lex.loc.1): unterminated;
	case let selector: rune =>
		if (selector == 't') {
			return true;
		};
		if (selector == 'f') {
			return false;
		};
		if (selector == '\\') {
			return scanchar(lex)?;
		};
		return lex.loc: invalid;
	};
};

// Scans a character literal; the leading `#\` is expected to have been
// consumed already. Three forms are accepted:
//
// - a single rune followed by whitespace or end of input, returned as-is;
// - 'x' followed by anything but whitespace, handed to [[scanescape2]];
// - a name listed in longcharnames.
//
// Anything else is reported as invalid at the current location, including
// names too long to fit the internal name buffer (such input used to abort the
// program). End of input right after `#\` is an unterminated character
// literal.
fn scanchar(lex: *lexer) (rune | error) = {
	static let storage: [16]u8 = [0...];
	let namebuf = memio::fixed(storage);

	const rn = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return ("character literal", lex.loc.0, lex.loc.1): unterminated;
	};

	// Peek at the following rune: whitespace or end of input means the
	// literal is just the single rune read above.
	match (nextrune(lex)?) {
	case let rnn: rune =>
		unget(lex, rnn);
		if (isspace(rnn)) {
			return rn;
		};
	case io::EOF =>
		return rn;
	};

	if (rn == 'x') {
		return scanescape2(lex);
	};

	// Rebuild the full name: the rune already consumed plus the rest of
	// the word.
	memio::appendrune(&namebuf, rn)!;
	const used = len(memio::string(&namebuf)!);
	const rest = scanword(lex)?;
	if (len(rest) > len(storage) - used) {
		// Longer than the buffer, hence longer than any known name.
		// Reject it here instead of letting the write below fail.
		return lex.loc: invalid;
	};
	memio::concat(&namebuf, rest)!;

	const name = memio::string(&namebuf)!;
	for (let i = 0z; i < len(longcharnames); i += 1) {
		if (name == longcharnames[i].0) {
			return longcharnames[i].1;
		};
	};
	return lex.loc: invalid;
};

// Decodes one escape sequence inside a string literal; the backslash is
// expected to have been consumed already. Single-letter escapes map directly
// to their rune, 'x' defers to [[scanescape2]], and any other rune is invalid.
// End of input is an unterminated escape sequence.
fn scanescape(lex: *lexer) (rune | error) = {
	const selector = match (nextrune(lex)?) {
	case io::EOF =>
		return ("escape sequence", lex.loc.0, lex.loc.1): unterminated;
	case let r: rune =>
		yield r;
	};

	const decoded: rune = switch (selector) {
	case '0' =>
		yield '\0';
	case 'a' =>
		yield '\a';
	case 'b' =>
		yield '\b';
	case 'e' =>
		yield '\x1b';
	case 'f' =>
		yield '\f';
	case 'n' =>
		yield '\n';
	case 'r' =>
		yield '\r';
	case 't' =>
		yield '\t';
	case 'v' =>
		yield '\v';
	case '\\' =>
		yield '\\';
	case '"' =>
		yield '"';
	case 'x' =>
		yield scanescape2(lex)?;
	case =>
		return lex.loc: invalid;
	};
	return decoded;
};

// Handles the `\xhh...;` family of escapes: one to six hexadecimal digits
// terminated by a semicolon, naming a Unicode codepoint.
//
// It lives in a separate function since both [[scanescape]] and [[scanchar]]
// make use of it. Much like how [[scanescape]] assumes that the backslash has
// already been consumed, this one assumes that the leading 'x' has been
// consumed prior to entering this function.
//
// Returns invalid at the current location when there are no digits, more than
// six digits, a rune that is not a hexadecimal digit, or a value that is not a
// Unicode scalar value (above U+10FFFF or in the surrogate range). End of
// input before the ';' is an unterminated escape sequence.
fn scanescape2(lex: *lexer) (rune | error) = {
	let storage: [6]u8 = [0...];
	let buf = memio::fixed(storage);

	let ndigits = 0z;
	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			return ("escape sequence", lex.loc.0, lex.loc.1): unterminated;
		};

		// Check the terminator before the length limit, so that a full
		// six-digit escape such as `\x10FFFF;` is accepted.
		if (rn == ';') {
			break;
		};

		// Only ASCII hex digits are stored, one byte each, so the
		// append below cannot overflow the six-byte buffer.
		if (ndigits == len(storage) || !ascii::isxdigit(rn)) {
			return lex.loc: invalid;
		};
		memio::appendrune(&buf, rn)!;
		ndigits += 1;
	};

	if (ndigits == 0) {
		return lex.loc: invalid;
	};

	const digits = memio::string(&buf)!;
	match (strconv::stou32(digits, strconv::base::HEX)) {
	case let codepoint: u32 =>
		// Six hex digits can exceed the Unicode range, and surrogates
		// are not valid runes either.
		if (codepoint > 0x10FFFF
				|| (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
			return lex.loc: invalid;
		};
		return codepoint: rune;
	case =>
		return lex.loc: invalid;
	};
};

// Reports whether 'rn' counts as whitespace for the lexer: either ASCII
// whitespace or any rune in the Unicode general category Zs (space
// separators).
fn isspace(rn: rune) bool = {
	// The Unicode lookup only runs when the ASCII test fails.
	return ascii::isspace(rn) || unicode::rune_gc(rn) == unicode::gc::Zs;
};

// Reports whether 'rn' is one of the runes that end a word: the bracket
// characters ( ) [ ] { }, the backslash, the colon and the pound sign.
fn isdelimiter(rn: rune) bool = {
	return strings::index(`()[]{}\:#`, rn) is size;
};