From 1488c26f46f7f8568235eaee6224983ac46e78ff Mon Sep 17 00:00:00 2001 From: Drew DeVault Date: Wed, 17 Apr 2024 11:31:28 +0200 Subject: [PATCH] linebreak: return bytewise position as well Signed-off-by: Drew DeVault --- cmd/linebreak/main.ha | 13 ++++++++++--- unicode/linebreak.ha | 44 +++++++++++++++++++++++-------------------- 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/cmd/linebreak/main.ha b/cmd/linebreak/main.ha index 90a80ab..1a2d897 100644 --- a/cmd/linebreak/main.ha +++ b/cmd/linebreak/main.ha @@ -1,15 +1,19 @@ +use encoding::hex; use fmt; use os; +use strings; use unicode; export fn main() void = { const input = os::args[1]; + const data = strings::toutf8(input); + hex::dump(os::stdout, data)!; fmt::println(input)!; let ix = 0u; const lb = unicode::new_line_breaker(input); - for (const (pos, mand) => unicode::next_line_break(&lb)) { + for (const (pos, _, mand) => unicode::next_line_break(&lb)) { for (ix < pos; ix += 1) { fmt::print(' ')!; }; @@ -27,7 +31,10 @@ export fn main() void = { fmt::println("Line break opportunities:")!; const lb = unicode::new_line_breaker(input); - for (const (pos, mand) => unicode::next_line_break(&lb)) { - fmt::printfln("- {} {}", pos, if (mand) "(mandatory)" else "")!; + for (const (pos, bpos, mand) => unicode::next_line_break(&lb)) { + fmt::printfln("- {}:{} {} (before '{}'/0x{:x})", pos, bpos, + if (mand) "(mandatory)" else "", + strings::sub(input, pos, pos+1), + data[bpos])!; }; }; diff --git a/unicode/linebreak.ha b/unicode/linebreak.ha index 6710138..bc88087 100644 --- a/unicode/linebreak.ha +++ b/unicode/linebreak.ha @@ -1,12 +1,13 @@ +use encoding::utf8; use strings; export type line_breaker = struct { input: str, iter: strings::iterator, // Current position - pos: uint, - // Previous position - ppos: uint, + pos: size, + // Current position, bytes + bpos: size, // Current line break class cur: line_break, // Next line break class @@ -26,14 +27,14 @@ export fn new_line_breaker(input: str) line_breaker = { }; }; -// Returns the next line break opportunity as a tuple of the rune-wise index of -// the opportunity in the input string and a boolean indicating if the line -// break is mandatory at this location. The line break opportunity directly -// precedes the index returned from this function. +// Returns the next line break opportunity as a tuple of the rune-wise index, +// byte-wise index, and a boolean indicating whether or not the break is +// mandatory at this location. The line break opportunity directly precedes the +// index returned from this function. // // Hello world! // ^ Line break opportunity at index 6 -export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = { +export fn next_line_break(lb: *line_breaker) ((size, size, bool) | done) = { if (lb.pos == 0) { if (len(lb.input) == 0) { return done; // special case @@ -41,25 +42,28 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = { lb.iter = strings::iter(lb.input); - let class = next_lb1_class(lb) as line_break; + const (class, rn) = next_lb1_class(lb) as (line_break, rune); class = resolve_lb2_class(class); lb.cur = class; lb.next = class; lb.lb8a = class == line_break::ZWJ; }; - for (const next => next_lb1_class(lb)) { + for (const (next, rn) => next_lb1_class(lb)) { const prev = lb.next; lb.next = next; - lb.ppos = lb.pos; - defer lb.pos += 1; + const rnsz = utf8::runesz(rn); + defer { + lb.pos += 1; + lb.bpos += rnsz; + }; const mandatory = lb.cur == line_break::BK || (lb.cur == line_break::CR && lb.next != line_break::LF); if (mandatory) { lb.cur = resolve_lb2_class(next); - return (lb.pos + 1, true); + return (lb.pos + 1, lb.bpos + rnsz, true); }; lb.lb8a = next == line_break::ZWJ; @@ -74,7 +78,7 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = { assert(can_break is bool); const can_break = can_break as bool; if (can_break) { - return (lb.pos + 1, false); + return (lb.pos + 1, lb.bpos + rnsz, false); }; }; @@ -82,7 +86,7 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = { }; // Applies LB1 suggested rules for resolving context-dependent classes. -fn next_lb1_class(lb: *line_breaker) (line_break | done) = { +fn next_lb1_class(lb: *line_breaker) ((line_break, rune) | done) = { const rn = match (strings::next(&lb.iter)) { case let rn: rune => yield rn; @@ -93,18 +97,18 @@ fn next_lb1_class(lb: *line_breaker) (line_break | done) = { const class = rune_line_break(rn); switch (class) { case line_break::AI, line_break::SG, line_break::XX => - return line_break::AL; + return (line_break::AL, rn); case line_break::SA => switch (rune_gc(rn)) { case gc::Mn, gc::Mc => - return line_break::CM; + return (line_break::CM, rn); case => - return line_break::AL; + return (line_break::AL, rn); }; case line_break::CJ => - return line_break::NS; + return (line_break::NS, rn); case => - return class; + return (class, rn); }; };