kojote/unicode/linebreak.ha
Drew DeVault 9c73c17238 Implement Unicode line breaking algorithm
Signed-off-by: Drew DeVault <sir@cmpwn.com>
2024-04-17 11:13:25 +02:00

192 lines
4.4 KiB
Hare

use strings;
// State of the line breaking state machine. Create one with
// [[new_line_breaker]] and advance it with [[next_line_break]].
export type line_breaker = struct {
	// The string being scanned for line break opportunities
	input: str,
	// Rune iterator over input; set up by next_line_break while pos is 0
	iter: strings::iterator,
	// Current position; incremented once per rune read by the main loop of
	// next_line_break
	pos: uint,
	// Previous position; the value pos had when the latest rune was read
	ppos: uint,
	// Current line break class (the left-hand side of the pair being
	// evaluated)
	cur: line_break,
	// Next line break class (the class of the most recently read rune)
	next: line_break,
	// State for specific rules
	// LB8a: whether the most recently read rune has class ZWJ
	lb8a: bool,
	// LB21a: whether cur was HL when the previous pair was evaluated by
	// lb_complex_case
	lb21a: bool,
	// LB30a: counter of consecutive RI classes seen in cur; reset to zero by
	// lb_complex_case
	lb30a: uint,
};
// Prepares a line breaking state machine for the given input string. Only the
// input is recorded here; every other field starts out zeroed. Pass the result
// to [[next_line_break]] repeatedly to walk through the line break
// opportunities of the input.
export fn new_line_breaker(input: str) line_breaker = {
	const state = line_breaker {
		input = input,
		...
	};
	return state;
};
// Returns the next line break opportunity as a tuple of the rune-wise index of
// the opportunity in the input string and a boolean indicating if the line
// break is mandatory at this location. The line break opportunity directly
// precedes the index returned from this function. Returns done once the input
// is exhausted; the end of the text itself is not reported as an opportunity.
//
// 	Hello world!
// 	      ^ Line break opportunity at index 6
export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
	if (lb.pos == 0) {
		// Nothing has been consumed yet: prime the state machine with
		// the class of the first rune.
		if (len(lb.input) == 0) {
			return done; // special case
		};
		lb.iter = strings::iter(lb.input);
		// The input is not empty, so the first read cannot yield done.
		let first = next_lb1_class(lb) as line_break;
		first = resolve_lb2_class(first);
		lb.cur = first;
		lb.next = first;
		lb.lb8a = first == line_break::ZWJ;
	};

	for (const next => next_lb1_class(lb)) {
		// Class of the rune read just before this one. lb.cur may lag
		// behind it (e.g. across spaces), which is how indirect breaks
		// are detected.
		const prev = lb.next;
		lb.next = next;
		lb.ppos = lb.pos;
		defer lb.pos += 1;
		// Rule LB8a forbids a break *after* a ZWJ, so while this pair
		// is evaluated the flag must still describe the previous rune.
		// Update it only once this iteration is over, on every exit
		// path (mandatory break, opportunity found, or no break).
		defer lb.lb8a = next == line_break::ZWJ;

		// Hard breaks: after BK, and after CR unless it is followed by
		// LF (CR LF is kept together).
		const mandatory = lb.cur == line_break::BK
			|| (lb.cur == line_break::CR
				&& lb.next != line_break::LF);
		if (mandatory) {
			// The rune after a mandatory break starts a new line
			// and is resolved like the first rune of the text.
			lb.cur = resolve_lb2_class(next);
			return (lb.pos + 1, true);
		};

		// Try the rules that only depend on the next class first and
		// fall back to the pair table and stateful rules otherwise.
		const can_break = match (lb_simple_case(lb)) {
		case let simple: bool =>
			yield simple;
		case void =>
			yield lb_complex_case(lb, prev);
		};
		if (can_break) {
			return (lb.pos + 1, false);
		};
	};
	return done;
};
// Reads the next rune from the breaker's iterator and returns its line break
// class after applying the LB1 suggested resolutions for context-dependent
// classes: AI, SG and XX become AL; CJ becomes NS; SA becomes CM for runes
// whose general category is Mn or Mc and AL otherwise. Returns done when the
// iterator is exhausted.
fn next_lb1_class(lb: *line_breaker) (line_break | done) = {
	const rn = match (strings::next(&lb.iter)) {
	case done =>
		return done;
	case let r: rune =>
		yield r;
	};

	const raw = rune_line_break(rn);
	if (raw == line_break::AI
			|| raw == line_break::SG
			|| raw == line_break::XX) {
		return line_break::AL;
	};
	if (raw == line_break::CJ) {
		return line_break::NS;
	};
	if (raw != line_break::SA) {
		// Every other class is used as-is.
		return raw;
	};

	// SA: combining marks are treated as CM, everything else as AL.
	const category = rune_gc(rn);
	if (category == gc::Mn || category == gc::Mc) {
		return line_break::CM;
	};
	return line_break::AL;
};
// Adjusts the line break class of a rune that starts the text, following the
// LB2 suggested rules: LF and NL are turned into BK, SP is turned into WJ, and
// any other class is returned unchanged.
fn resolve_lb2_class(lb: line_break) line_break = {
	if (lb == line_break::LF || lb == line_break::NL) {
		return line_break::BK;
	};
	if (lb == line_break::SP) {
		return line_break::WJ;
	};
	return lb;
};
// Handles the cases that can be decided from the next class alone. Returns
// false (no break opportunity) when the next class is SP, BK, LF, NL or CR;
// for the latter four, lb.cur is also replaced (BK for BK/LF/NL, CR for CR).
// Returns void for every other class, meaning the caller has to consult the
// more complex rules.
fn lb_simple_case(lb: *line_breaker) (bool | void) = {
	const upcoming = lb.next;
	if (upcoming == line_break::SP) {
		// lb.cur is deliberately left untouched for spaces.
		return false;
	};
	if (upcoming == line_break::CR) {
		lb.cur = line_break::CR;
		return false;
	};
	if (upcoming == line_break::BK
			|| upcoming == line_break::LF
			|| upcoming == line_break::NL) {
		lb.cur = line_break::BK;
		return false;
	};
	return;
};
// Handles more complex rules, including pair table lookups via
// linebreak_table.ha.
//
// Decides whether a break is permitted between lb.cur and lb.next. prev is the
// class of the rune that was read immediately before lb.next; it is compared
// against SP to detect an intervening space. Besides returning the decision,
// this function updates the LB21a and LB30a state and advances lb.cur to
// lb.next, except on the early returns noted below, which leave all of that
// state untouched.
fn lb_complex_case(lb: *line_breaker, prev: line_break) bool = {
	let can_break = false;
	// The pair table is indexed by class values relative to line_break::OP.
	const ucur = lb.cur: uint - line_break::OP: uint;
	const unext = lb.next: uint - line_break::OP: uint;
	// Classes that fall outside the table bounds skip the lookup, leaving
	// can_break false.
	if (ucur < len(lb_pairs) && unext < len(lb_pairs[0])) {
		switch (lb_pairs[ucur][unext]) {
		case bo::DI => // Direct break
			can_break = true;
		case bo::IN => // Indirect break opportunity
			// Only a break if a space was read just before lb.next
			can_break = prev == line_break::SP;
		case bo::CI => // Indirect opportunity for combining marks
			can_break = prev == line_break::SP;
			if (!can_break) {
				// Early return: lb.cur and the rule state below are
				// not updated
				return false;
			};
		case bo::CP => // Prohibited for combining marks
			if (prev != line_break::SP) {
				// Early return: lb.cur and the rule state below are
				// not updated
				return false;
			};
			// Otherwise falls through with can_break still false
		case bo::PR => void; // can_break stays false
		};
	};
	// Rule LB8a
	// The flag is maintained by the caller; when set it vetoes the break.
	if (lb.lb8a) {
		can_break = false;
	};
	// Rule LB21a
	// If the flag was set by the previous evaluation (lb.cur was HL then)
	// and lb.cur is now HY or BA, the break is vetoed and the flag cleared.
	// Otherwise the flag records whether lb.cur is HL.
	if (lb.lb21a && (lb.cur == line_break::HY || lb.cur == line_break::BA)) {
		can_break = false;
		lb.lb21a = false;
	} else {
		lb.lb21a = lb.cur == line_break::HL;
	};
	// Rule LB30a
	// Counts consecutive evaluations in which lb.cur is RI. When the count
	// reaches two and lb.next is also RI, a break is allowed and the count
	// restarts. Any non-RI lb.cur resets the count.
	if (lb.cur == line_break::RI) {
		lb.lb30a += 1;
		if (lb.lb30a == 2 && lb.next == line_break::RI) {
			can_break = true;
			lb.lb30a = 0;
		};
	} else {
		lb.lb30a = 0;
	};
	// Advance: the next class becomes the left-hand side of the next pair.
	lb.cur = lb.next;
	return can_break;
};