Lobo Torres
c70ec9f648
git-subtree-dir: vendor/hare-unicode git-subtree-mainline:57979aa6fc
git-subtree-split:1488c26f46
196 lines
4.6 KiB
Hare
196 lines
4.6 KiB
Hare
use encoding::utf8;
|
|
use strings;
|
|
|
|
// State of the line breaking state machine. Obtain one from
// [[new_line_breaker]] and advance it with [[next_line_break]].
export type line_breaker = struct {
	// Text being scanned for line break opportunities
	input: str,
	// Rune iterator over input, set up by next_line_break while pos is 0
	iter: strings::iterator,

	// Rune-wise position
	pos: size,
	// Byte-wise position, advanced alongside pos
	bpos: size,

	// Line break class used as the left-hand side of pair table lookups
	cur: line_break,
	// Line break class of the rune most recently read from iter
	next: line_break,

	// Per-rule bookkeeping:
	// lb8a records that a ZWJ class was seen
	lb8a: bool,
	// lb21a records that cur was HL during the last pair table lookup
	lb21a: bool,
	// lb30a counts RI classes seen in a row
	lb30a: uint,
};
|
|
|
|
// Prepares a line breaking state machine for the given input string. Every
// field other than the input starts out at its default value. Pass the result
// to [[next_line_break]] to enumerate the line break opportunities in the
// input string.
export fn new_line_breaker(input: str) line_breaker = {
	const state = line_breaker {
		input = input,
		...
	};
	return state;
};
|
|
|
|
// Returns the next line break opportunity as a tuple of the rune-wise index,
// byte-wise index, and a boolean indicating whether or not the break is
// mandatory at this location. The line break opportunity directly precedes the
// index returned from this function. Returns done once the input has been
// consumed, or immediately if the input is empty.
//
// 	Hello world!
// 	      ^ Line break opportunity at index 6
export fn next_line_break(lb: *line_breaker) ((size, size, bool) | done) = {
	if (lb.pos == 0) {
		if (len(lb.input) == 0) {
			return done; // special case
		};

		lb.iter = strings::iter(lb.input);

		// The input is non-empty, so the first read cannot yield done.
		const (first, rn) = next_lb1_class(lb) as (line_break, rune);
		const class = resolve_lb2_class(first);
		lb.cur = class;
		lb.next = class;
		lb.lb8a = class == line_break::ZWJ;

		// From here on lb.bpos is the byte offset just past the rune at
		// index lb.pos, i.e. the offset at which the following rune
		// starts. Without accounting for the first rune here, byte
		// indices are wrong whenever the first rune and the rune at the
		// break differ in encoded size.
		lb.bpos = utf8::runesz(rn);
	};

	for (const (next, rn) => next_lb1_class(lb)) {
		const prev = lb.next;
		lb.next = next;

		// rn starts at byte offset "start"; after advancing, lb.pos is
		// the rune index of rn. A break found in this iteration sits
		// directly before rn, so (lb.pos, start) is what gets reported.
		const start = lb.bpos;
		lb.pos += 1;
		lb.bpos += utf8::runesz(rn);

		const mandatory = lb.cur == line_break::BK
			|| (lb.cur == line_break::CR
				&& lb.next != line_break::LF);
		if (mandatory) {
			// rn opens a new line; treat it like start of text.
			lb.cur = resolve_lb2_class(next);
			lb.lb8a = next == line_break::ZWJ;
			return (lb.pos, start, true);
		};

		// While the break before rn is being decided, lb.lb8a must still
		// describe the previous rune (rule LB8a: no break after ZWJ).
		const can_break = match (lb_simple_case(lb)) {
		case let simple: bool =>
			yield simple;
		case void =>
			yield lb_complex_case(lb, prev);
		};

		// Only now record whether rn itself is a ZWJ, for the benefit of
		// the next iteration.
		lb.lb8a = next == line_break::ZWJ;

		if (can_break) {
			return (lb.pos, start, false);
		};
	};

	return done;
};
|
|
|
|
// Reads the next rune from the iterator and returns it together with its line
// break class, after applying the LB1 suggested resolution of context-dependent
// classes: AI, SG and XX become AL; SA becomes CM for runes whose general
// category is Mn or Mc and AL otherwise; CJ becomes NS. Returns done when the
// iterator is exhausted.
fn next_lb1_class(lb: *line_breaker) ((line_break, rune) | done) = {
	const rn = match (strings::next(&lb.iter)) {
	case done =>
		return done;
	case let r: rune =>
		yield r;
	};

	const raw = rune_line_break(rn);
	const resolved = switch (raw) {
	case line_break::AI, line_break::SG, line_break::XX =>
		yield line_break::AL;
	case line_break::SA =>
		const category = rune_gc(rn);
		yield if (category == gc::Mn || category == gc::Mc)
			line_break::CM
		else
			line_break::AL;
	case line_break::CJ =>
		yield line_break::NS;
	case =>
		yield raw;
	};

	return (resolved, rn);
};
|
|
|
|
// Applies LB2 suggested rules for resolving the line-break class of the first
// rune of a line: LF and NL are treated as BK, SP is treated as WJ, and every
// other class is returned unchanged.
fn resolve_lb2_class(lb: line_break) line_break = {
	return switch (lb) {
	case line_break::LF, line_break::NL =>
		yield line_break::BK;
	case line_break::SP =>
		yield line_break::WJ;
	case =>
		yield lb;
	};
};
|
|
|
|
// Handles the classes that can be decided without a pair table lookup. When
// the upcoming class is SP, BK, LF, NL or CR there is never a break opportunity
// here, so false is returned; for the line terminators the current class is
// also updated (BK for BK, LF and NL; CR for CR). Returns void for every other
// class, which the caller must resolve by other means.
fn lb_simple_case(lb: *line_breaker) (bool | void) = {
	const upcoming = lb.next;

	if (upcoming == line_break::SP) {
		return false;
	};

	if (upcoming == line_break::BK
			|| upcoming == line_break::LF
			|| upcoming == line_break::NL) {
		lb.cur = line_break::BK;
		return false;
	};

	if (upcoming == line_break::CR) {
		lb.cur = line_break::CR;
		return false;
	};

	return;
};
|
|
|
|
// Decides whether a break is allowed between lb.cur and lb.next for the cases
// that need the pair table (see linebreak_table.ha) and the stateful rules
// LB8a, LB21a and LB30a. "prev" is the class of the rune read immediately
// before lb.next. On a normal return lb.cur is advanced to lb.next; the two
// combining-mark early returns deliberately leave lb.cur and the rule state
// untouched.
fn lb_complex_case(lb: *line_breaker, prev: line_break) bool = {
	let allowed = false;

	// Pair table indices are relative to OP. Classes ordered before OP wrap
	// around to large unsigned values and are rejected by the bounds check.
	const row = (lb.cur: uint) - (line_break::OP: uint);
	const col = (lb.next: uint) - (line_break::OP: uint);
	if (row < len(lb_pairs) && col < len(lb_pairs[0])) {
		const after_space = prev == line_break::SP;
		switch (lb_pairs[row][col]) {
		case bo::DI => // Direct break
			allowed = true;
		case bo::IN => // Indirect break opportunity
			allowed = after_space;
		case bo::CI => // Indirect opportunity for combining marks
			if (!after_space) {
				return false;
			};
			allowed = true;
		case bo::CP => // Prohibited for combining marks
			if (!after_space) {
				return false;
			};
		case bo::PR => void;
		};
	};

	// Rule LB8a
	if (lb.lb8a) {
		allowed = false;
	};

	// Rule LB21a
	const suppress = lb.lb21a
		&& (lb.cur == line_break::HY || lb.cur == line_break::BA);
	if (suppress) {
		allowed = false;
	};
	lb.lb21a = !suppress && lb.cur == line_break::HL;

	// Rule LB30a
	if (lb.cur != line_break::RI) {
		lb.lb30a = 0;
	} else {
		lb.lb30a += 1;
		if (lb.lb30a == 2 && lb.next == line_break::RI) {
			allowed = true;
			lb.lb30a = 0;
		};
	};

	lb.cur = lb.next;
	return allowed;
};
|