linebreak: return bytewise position as well

Signed-off-by: Drew DeVault <sir@cmpwn.com>
This commit is contained in:
Drew DeVault 2024-04-17 11:31:28 +02:00
parent 9c73c17238
commit 1488c26f46
2 changed files with 34 additions and 23 deletions

View file

@ -1,15 +1,19 @@
use encoding::hex;
use fmt;
use os;
use strings;
use unicode;
export fn main() void = {
const input = os::args[1];
const data = strings::toutf8(input);
hex::dump(os::stdout, data)!;
fmt::println(input)!;
let ix = 0u;
const lb = unicode::new_line_breaker(input);
for (const (pos, mand) => unicode::next_line_break(&lb)) {
for (const (pos, _, mand) => unicode::next_line_break(&lb)) {
for (ix < pos; ix += 1) {
fmt::print(' ')!;
};
@ -27,7 +31,10 @@ export fn main() void = {
fmt::println("Line break opportunities:")!;
const lb = unicode::new_line_breaker(input);
for (const (pos, mand) => unicode::next_line_break(&lb)) {
fmt::printfln("- {} {}", pos, if (mand) "(mandatory)" else "")!;
for (const (pos, bpos, mand) => unicode::next_line_break(&lb)) {
fmt::printfln("- {}:{} {} (before '{}'/0x{:x})", pos, bpos,
if (mand) "(mandatory)" else "",
strings::sub(input, pos, pos+1),
data[bpos])!;
};
};

View file

@ -1,12 +1,13 @@
use encoding::utf8;
use strings;
export type line_breaker = struct {
input: str,
iter: strings::iterator,
// Current position
pos: uint,
// Previous position
ppos: uint,
pos: size,
// Current position, bytes
bpos: size,
// Current line break class
cur: line_break,
// Next line break class
@ -26,14 +27,14 @@ export fn new_line_breaker(input: str) line_breaker = {
};
};
// Returns the next line break opportunity as a tuple of the rune-wise index of
// the opportunity in the input string and a boolean indicating if the line
// break is mandatory at this location. The line break opportunity directly
// precedes the index returned from this function.
// Returns the next line break opportunity as a tuple of its rune-wise index,
// its byte-wise index, and a boolean indicating whether or not the break is
// mandatory at this location. The line break opportunity directly precedes the
// position identified by both of the indices returned from this function.
//
// Hello world!
// ^ Line break opportunity at index 6
export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
export fn next_line_break(lb: *line_breaker) ((size, size, bool) | done) = {
if (lb.pos == 0) {
if (len(lb.input) == 0) {
return done; // special case
@ -41,25 +42,28 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
lb.iter = strings::iter(lb.input);
let class = next_lb1_class(lb) as line_break;
const (class, rn) = next_lb1_class(lb) as (line_break, rune);
class = resolve_lb2_class(class);
lb.cur = class;
lb.next = class;
lb.lb8a = class == line_break::ZWJ;
};
for (const next => next_lb1_class(lb)) {
for (const (next, rn) => next_lb1_class(lb)) {
const prev = lb.next;
lb.next = next;
lb.ppos = lb.pos;
defer lb.pos += 1;
const rnsz = utf8::runesz(rn);
defer {
lb.pos += 1;
lb.bpos += rnsz;
};
const mandatory = lb.cur == line_break::BK
|| (lb.cur == line_break::CR
&& lb.next != line_break::LF);
if (mandatory) {
lb.cur = resolve_lb2_class(next);
return (lb.pos + 1, true);
return (lb.pos + 1, lb.bpos + rnsz, true);
};
lb.lb8a = next == line_break::ZWJ;
@ -74,7 +78,7 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
assert(can_break is bool);
const can_break = can_break as bool;
if (can_break) {
return (lb.pos + 1, false);
return (lb.pos + 1, lb.bpos + rnsz, false);
};
};
@ -82,7 +86,7 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
};
// Applies LB1 suggested rules for resolving context-dependent classes.
fn next_lb1_class(lb: *line_breaker) (line_break | done) = {
fn next_lb1_class(lb: *line_breaker) ((line_break, rune) | done) = {
const rn = match (strings::next(&lb.iter)) {
case let rn: rune =>
yield rn;
@ -93,18 +97,18 @@ fn next_lb1_class(lb: *line_breaker) (line_break | done) = {
const class = rune_line_break(rn);
switch (class) {
case line_break::AI, line_break::SG, line_break::XX =>
return line_break::AL;
return (line_break::AL, rn);
case line_break::SA =>
switch (rune_gc(rn)) {
case gc::Mn, gc::Mc =>
return line_break::CM;
return (line_break::CM, rn);
case =>
return line_break::AL;
return (line_break::AL, rn);
};
case line_break::CJ =>
return line_break::NS;
return (line_break::NS, rn);
case =>
return class;
return (class, rn);
};
};