linebreak: return bytewise position as well

Signed-off-by: Drew DeVault <sir@cmpwn.com>
This commit is contained in:
Drew DeVault 2024-04-17 11:31:28 +02:00
parent 9c73c17238
commit 1488c26f46
2 changed files with 34 additions and 23 deletions

View file

@ -1,15 +1,19 @@
use encoding::hex;
use fmt; use fmt;
use os; use os;
use strings;
use unicode; use unicode;
export fn main() void = { export fn main() void = {
const input = os::args[1]; const input = os::args[1];
const data = strings::toutf8(input);
hex::dump(os::stdout, data)!;
fmt::println(input)!; fmt::println(input)!;
let ix = 0u; let ix = 0u;
const lb = unicode::new_line_breaker(input); const lb = unicode::new_line_breaker(input);
for (const (pos, mand) => unicode::next_line_break(&lb)) { for (const (pos, _, mand) => unicode::next_line_break(&lb)) {
for (ix < pos; ix += 1) { for (ix < pos; ix += 1) {
fmt::print(' ')!; fmt::print(' ')!;
}; };
@ -27,7 +31,10 @@ export fn main() void = {
fmt::println("Line break opportunities:")!; fmt::println("Line break opportunities:")!;
const lb = unicode::new_line_breaker(input); const lb = unicode::new_line_breaker(input);
for (const (pos, mand) => unicode::next_line_break(&lb)) { for (const (pos, bpos, mand) => unicode::next_line_break(&lb)) {
fmt::printfln("- {} {}", pos, if (mand) "(mandatory)" else "")!; fmt::printfln("- {}:{} {} (before '{}'/0x{:x})", pos, bpos,
if (mand) "(mandatory)" else "",
strings::sub(input, pos, pos+1),
data[bpos])!;
}; };
}; };

View file

@ -1,12 +1,13 @@
use encoding::utf8;
use strings; use strings;
export type line_breaker = struct { export type line_breaker = struct {
input: str, input: str,
iter: strings::iterator, iter: strings::iterator,
// Current position // Current position
pos: uint, pos: size,
// Previous position // Current position, bytes
ppos: uint, bpos: size,
// Current line break class // Current line break class
cur: line_break, cur: line_break,
// Next line break class // Next line break class
@ -26,14 +27,14 @@ export fn new_line_breaker(input: str) line_breaker = {
}; };
}; };
// Returns the next line break opportunity as a tuple of the rune-wise index of // Returns the next line break opportunity as a tuple of the rune-wise index,
// the opportunity in the input string and a boolean indicating if the line // byte-wise index, and a boolean indicating whether or not the break is
// break is mandatory at this location. The line break opportunity directly // mandatory at this location. The line break opportunity directly precedes the
// precedes the index returned from this function. // index returned from this function.
// //
// Hello world! // Hello world!
// ^ Line break opportunity at index 6 // ^ Line break opportunity at index 6
export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = { export fn next_line_break(lb: *line_breaker) ((size, size, bool) | done) = {
if (lb.pos == 0) { if (lb.pos == 0) {
if (len(lb.input) == 0) { if (len(lb.input) == 0) {
return done; // special case return done; // special case
@ -41,25 +42,28 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
lb.iter = strings::iter(lb.input); lb.iter = strings::iter(lb.input);
let class = next_lb1_class(lb) as line_break; const (class, rn) = next_lb1_class(lb) as (line_break, rune);
class = resolve_lb2_class(class); class = resolve_lb2_class(class);
lb.cur = class; lb.cur = class;
lb.next = class; lb.next = class;
lb.lb8a = class == line_break::ZWJ; lb.lb8a = class == line_break::ZWJ;
}; };
for (const next => next_lb1_class(lb)) { for (const (next, rn) => next_lb1_class(lb)) {
const prev = lb.next; const prev = lb.next;
lb.next = next; lb.next = next;
lb.ppos = lb.pos; const rnsz = utf8::runesz(rn);
defer lb.pos += 1; defer {
lb.pos += 1;
lb.bpos += rnsz;
};
const mandatory = lb.cur == line_break::BK const mandatory = lb.cur == line_break::BK
|| (lb.cur == line_break::CR || (lb.cur == line_break::CR
&& lb.next != line_break::LF); && lb.next != line_break::LF);
if (mandatory) { if (mandatory) {
lb.cur = resolve_lb2_class(next); lb.cur = resolve_lb2_class(next);
return (lb.pos + 1, true); return (lb.pos + 1, lb.bpos + rnsz, true);
}; };
lb.lb8a = next == line_break::ZWJ; lb.lb8a = next == line_break::ZWJ;
@ -74,7 +78,7 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
assert(can_break is bool); assert(can_break is bool);
const can_break = can_break as bool; const can_break = can_break as bool;
if (can_break) { if (can_break) {
return (lb.pos + 1, false); return (lb.pos + 1, lb.bpos + rnsz, false);
}; };
}; };
@ -82,7 +86,7 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
}; };
// Applies LB1 suggested rules for resolving context-dependent classes. // Applies LB1 suggested rules for resolving context-dependent classes.
fn next_lb1_class(lb: *line_breaker) (line_break | done) = { fn next_lb1_class(lb: *line_breaker) ((line_break, rune) | done) = {
const rn = match (strings::next(&lb.iter)) { const rn = match (strings::next(&lb.iter)) {
case let rn: rune => case let rn: rune =>
yield rn; yield rn;
@ -93,18 +97,18 @@ fn next_lb1_class(lb: *line_breaker) (line_break | done) = {
const class = rune_line_break(rn); const class = rune_line_break(rn);
switch (class) { switch (class) {
case line_break::AI, line_break::SG, line_break::XX => case line_break::AI, line_break::SG, line_break::XX =>
return line_break::AL; return (line_break::AL, rn);
case line_break::SA => case line_break::SA =>
switch (rune_gc(rn)) { switch (rune_gc(rn)) {
case gc::Mn, gc::Mc => case gc::Mn, gc::Mc =>
return line_break::CM; return (line_break::CM, rn);
case => case =>
return line_break::AL; return (line_break::AL, rn);
}; };
case line_break::CJ => case line_break::CJ =>
return line_break::NS; return (line_break::NS, rn);
case => case =>
return class; return (class, rn);
}; };
}; };