linebreak: return bytewise position as well

Signed-off-by: Drew DeVault <sir@cmpwn.com>
2024-04-17 11:31:28 +02:00 · 2024-04-17 11:31:28 +02:00 · 1488c26f46
commit 1488c26f46
parent 9c73c17238
2 changed files with 34 additions and 23 deletions
--- a/cmd/linebreak/main.ha
+++ b/cmd/linebreak/main.ha
@ -1,15 +1,19 @@
+use encoding::hex;
 use fmt;
 use os;
+use strings;
 use unicode;

 export fn main() void = {
 	const input = os::args[1];
+	const data = strings::toutf8(input);
+	hex::dump(os::stdout, data)!;

 	fmt::println(input)!;

 	let ix = 0u;
 	const lb = unicode::new_line_breaker(input);
-	for (const (pos, mand) => unicode::next_line_break(&lb)) {
+	for (const (pos, _, mand) => unicode::next_line_break(&lb)) {
 		for (ix < pos; ix += 1) {
 			fmt::print(' ')!;
 		};
@ -27,7 +31,10 @@ export fn main() void = {

 	fmt::println("Line break opportunities:")!;
 	const lb = unicode::new_line_breaker(input);
-	for (const (pos, mand) => unicode::next_line_break(&lb)) {
-		fmt::printfln("- {} {}", pos, if (mand) "(mandatory)" else "")!;
+	for (const (pos, bpos, mand) => unicode::next_line_break(&lb)) {
+		fmt::printfln("- {}:{} {} (before '{}'/0x{:x})", pos, bpos,
+			if (mand) "(mandatory)" else "",
+			strings::sub(input, pos, pos+1),
+			data[bpos])!;
 	};
 };
--- a/unicode/linebreak.ha
+++ b/unicode/linebreak.ha
@ -1,12 +1,13 @@
+use encoding::utf8;
 use strings;

 export type line_breaker = struct {
 	input: str,
 	iter: strings::iterator,
 	// Current position
-	pos: uint,
-	// Previous position
-	ppos: uint,
+	pos: size,
+	// Current position, bytes
+	bpos: size,
 	// Current line break class
 	cur: line_break,
 	// Next line break class
@ -26,14 +27,14 @@ export fn new_line_breaker(input: str) line_breaker = {
 	};
 };

-// Returns the next line break opportunity as a tuple of the rune-wise index of
-// the opportunity in the input string and a boolean indicating if the line
-// break is mandatory at this location. The line break opportunity directly
-// precedes the index returned from this function.
+// Returns the next line break opportunity as a tuple of its rune-wise index,
+// its byte-wise index, and a boolean indicating whether or not the break is
+// mandatory at this location. The line break opportunity directly precedes
+// the position identified by the two indices returned from this function.
 //
 // 	Hello world!
 // 	      ^ Line break opportunity at index 6
-export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
+export fn next_line_break(lb: *line_breaker) ((size, size, bool) | done) = {
 	if (lb.pos == 0) {
 		if (len(lb.input) == 0) {
 			return done; // special case
@ -41,25 +42,28 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {

 		lb.iter = strings::iter(lb.input);

-		let class = next_lb1_class(lb) as line_break;
+		const (class, rn) = next_lb1_class(lb) as (line_break, rune);
 		class = resolve_lb2_class(class);
 		lb.cur = class;
 		lb.next = class;
 		lb.lb8a = class == line_break::ZWJ;
 	};

-	for (const next => next_lb1_class(lb)) {
+	for (const (next, rn) => next_lb1_class(lb)) {
 		const prev = lb.next;
 		lb.next = next;
-		lb.ppos = lb.pos;
-		defer lb.pos += 1;
+		const rnsz = utf8::runesz(rn);
+		defer {
+			lb.pos += 1;
+			lb.bpos += rnsz;
+		};

 		const mandatory = lb.cur == line_break::BK
 			|| (lb.cur == line_break::CR
 				&& lb.next != line_break::LF);
 		if (mandatory) {
 			lb.cur = resolve_lb2_class(next);
-			return (lb.pos + 1, true);
+			return (lb.pos + 1, lb.bpos + rnsz, true);
 		};

 		lb.lb8a = next == line_break::ZWJ;
@ -74,7 +78,7 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
 		assert(can_break is bool);
 		const can_break = can_break as bool;
 		if (can_break) {
-			return (lb.pos + 1, false);
+			return (lb.pos + 1, lb.bpos + rnsz, false);
 		};
 	};

@ -82,7 +86,7 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
 };

 // Applies LB1 suggested rules for resolving context-dependent classes.
-fn next_lb1_class(lb: *line_breaker) (line_break | done) = {
+fn next_lb1_class(lb: *line_breaker) ((line_break, rune) | done) = {
 	const rn = match (strings::next(&lb.iter)) {
 	case let rn: rune =>
 		yield rn;
@ -93,18 +97,18 @@ fn next_lb1_class(lb: *line_breaker) (line_break | done) = {
 	const class = rune_line_break(rn);
 	switch (class) {
 	case line_break::AI, line_break::SG, line_break::XX =>
-		return line_break::AL;
+		return (line_break::AL, rn);
 	case line_break::SA =>
 		switch (rune_gc(rn)) {
 		case gc::Mn, gc::Mc =>
-			return line_break::CM;
+			return (line_break::CM, rn);
 		case =>
-			return line_break::AL;
+			return (line_break::AL, rn);
 		};
 	case line_break::CJ =>
-		return line_break::NS;
+		return (line_break::NS, rn);
 	case =>
-		return class;
+		return (class, rn);
 	};
 };