linebreak: return bytewise position as well
Signed-off-by: Drew DeVault <sir@cmpwn.com>
This commit is contained in:
parent
9c73c17238
commit
1488c26f46
2 changed files with 34 additions and 23 deletions
|
@ -1,15 +1,19 @@
|
|||
use encoding::hex;
|
||||
use fmt;
|
||||
use os;
|
||||
use strings;
|
||||
use unicode;
|
||||
|
||||
export fn main() void = {
|
||||
const input = os::args[1];
|
||||
const data = strings::toutf8(input);
|
||||
hex::dump(os::stdout, data)!;
|
||||
|
||||
fmt::println(input)!;
|
||||
|
||||
let ix = 0u;
|
||||
const lb = unicode::new_line_breaker(input);
|
||||
for (const (pos, mand) => unicode::next_line_break(&lb)) {
|
||||
for (const (pos, _, mand) => unicode::next_line_break(&lb)) {
|
||||
for (ix < pos; ix += 1) {
|
||||
fmt::print(' ')!;
|
||||
};
|
||||
|
@ -27,7 +31,10 @@ export fn main() void = {
|
|||
|
||||
fmt::println("Line break opportunities:")!;
|
||||
const lb = unicode::new_line_breaker(input);
|
||||
for (const (pos, mand) => unicode::next_line_break(&lb)) {
|
||||
fmt::printfln("- {} {}", pos, if (mand) "(mandatory)" else "")!;
|
||||
for (const (pos, bpos, mand) => unicode::next_line_break(&lb)) {
|
||||
fmt::printfln("- {}:{} {} (before '{}'/0x{:x})", pos, bpos,
|
||||
if (mand) "(mandatory)" else "",
|
||||
strings::sub(input, pos, pos+1),
|
||||
data[bpos])!;
|
||||
};
|
||||
};
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
use encoding::utf8;
|
||||
use strings;
|
||||
|
||||
export type line_breaker = struct {
|
||||
input: str,
|
||||
iter: strings::iterator,
|
||||
// Current position, runes
|
||||
pos: uint,
|
||||
// Previous position
|
||||
ppos: uint,
|
||||
pos: size,
|
||||
// Current position, bytes
|
||||
bpos: size,
|
||||
// Current line break class
|
||||
cur: line_break,
|
||||
// Next line break class
|
||||
|
@ -26,14 +27,14 @@ export fn new_line_breaker(input: str) line_breaker = {
|
|||
};
|
||||
};
|
||||
|
||||
// Returns the next line break opportunity as a tuple of the rune-wise index of
|
||||
// the opportunity in the input string and a boolean indicating if the line
|
||||
// break is mandatory at this location. The line break opportunity directly
|
||||
// precedes the index returned from this function.
|
||||
// Returns the next line break opportunity as a tuple of the rune-wise index,
|
||||
// byte-wise index, and a boolean indicating whether or not the break is
|
||||
// mandatory at this location. The line break opportunity directly precedes the
|
||||
// indices returned from this function.
|
||||
//
|
||||
// Hello world!
|
||||
// ^ Line break opportunity at index 6
|
||||
export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
|
||||
export fn next_line_break(lb: *line_breaker) ((size, size, bool) | done) = {
|
||||
if (lb.pos == 0) {
|
||||
if (len(lb.input) == 0) {
|
||||
return done; // special case
|
||||
|
@ -41,25 +42,28 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
|
|||
|
||||
lb.iter = strings::iter(lb.input);
|
||||
|
||||
let class = next_lb1_class(lb) as line_break;
|
||||
const (class, rn) = next_lb1_class(lb) as (line_break, rune);
|
||||
class = resolve_lb2_class(class);
|
||||
lb.cur = class;
|
||||
lb.next = class;
|
||||
lb.lb8a = class == line_break::ZWJ;
|
||||
};
|
||||
|
||||
for (const next => next_lb1_class(lb)) {
|
||||
for (const (next, rn) => next_lb1_class(lb)) {
|
||||
const prev = lb.next;
|
||||
lb.next = next;
|
||||
lb.ppos = lb.pos;
|
||||
defer lb.pos += 1;
|
||||
const rnsz = utf8::runesz(rn);
|
||||
defer {
|
||||
lb.pos += 1;
|
||||
lb.bpos += rnsz;
|
||||
};
|
||||
|
||||
const mandatory = lb.cur == line_break::BK
|
||||
|| (lb.cur == line_break::CR
|
||||
&& lb.next != line_break::LF);
|
||||
if (mandatory) {
|
||||
lb.cur = resolve_lb2_class(next);
|
||||
return (lb.pos + 1, true);
|
||||
return (lb.pos + 1, lb.bpos + rnsz, true);
|
||||
};
|
||||
|
||||
lb.lb8a = next == line_break::ZWJ;
|
||||
|
@ -74,7 +78,7 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
|
|||
assert(can_break is bool);
|
||||
const can_break = can_break as bool;
|
||||
if (can_break) {
|
||||
return (lb.pos + 1, false);
|
||||
return (lb.pos + 1, lb.bpos + rnsz, false);
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -82,7 +86,7 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
|
|||
};
|
||||
|
||||
// Applies LB1 suggested rules for resolving context-dependent classes.
|
||||
fn next_lb1_class(lb: *line_breaker) (line_break | done) = {
|
||||
fn next_lb1_class(lb: *line_breaker) ((line_break, rune) | done) = {
|
||||
const rn = match (strings::next(&lb.iter)) {
|
||||
case let rn: rune =>
|
||||
yield rn;
|
||||
|
@ -93,18 +97,18 @@ fn next_lb1_class(lb: *line_breaker) (line_break | done) = {
|
|||
const class = rune_line_break(rn);
|
||||
switch (class) {
|
||||
case line_break::AI, line_break::SG, line_break::XX =>
|
||||
return line_break::AL;
|
||||
return (line_break::AL, rn);
|
||||
case line_break::SA =>
|
||||
switch (rune_gc(rn)) {
|
||||
case gc::Mn, gc::Mc =>
|
||||
return line_break::CM;
|
||||
return (line_break::CM, rn);
|
||||
case =>
|
||||
return line_break::AL;
|
||||
return (line_break::AL, rn);
|
||||
};
|
||||
case line_break::CJ =>
|
||||
return line_break::NS;
|
||||
return (line_break::NS, rn);
|
||||
case =>
|
||||
return class;
|
||||
return (class, rn);
|
||||
};
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in a new issue