linebreak: return bytewise position as well
Signed-off-by: Drew DeVault <sir@cmpwn.com>
This commit is contained in:
parent
9c73c17238
commit
1488c26f46
2 changed files with 34 additions and 23 deletions
|
@ -1,15 +1,19 @@
|
||||||
|
use encoding::hex;
|
||||||
use fmt;
|
use fmt;
|
||||||
use os;
|
use os;
|
||||||
|
use strings;
|
||||||
use unicode;
|
use unicode;
|
||||||
|
|
||||||
export fn main() void = {
|
export fn main() void = {
|
||||||
const input = os::args[1];
|
const input = os::args[1];
|
||||||
|
const data = strings::toutf8(input);
|
||||||
|
hex::dump(os::stdout, data)!;
|
||||||
|
|
||||||
fmt::println(input)!;
|
fmt::println(input)!;
|
||||||
|
|
||||||
let ix = 0u;
|
let ix = 0u;
|
||||||
const lb = unicode::new_line_breaker(input);
|
const lb = unicode::new_line_breaker(input);
|
||||||
for (const (pos, mand) => unicode::next_line_break(&lb)) {
|
for (const (pos, _, mand) => unicode::next_line_break(&lb)) {
|
||||||
for (ix < pos; ix += 1) {
|
for (ix < pos; ix += 1) {
|
||||||
fmt::print(' ')!;
|
fmt::print(' ')!;
|
||||||
};
|
};
|
||||||
|
@ -27,7 +31,10 @@ export fn main() void = {
|
||||||
|
|
||||||
fmt::println("Line break opportunities:")!;
|
fmt::println("Line break opportunities:")!;
|
||||||
const lb = unicode::new_line_breaker(input);
|
const lb = unicode::new_line_breaker(input);
|
||||||
for (const (pos, mand) => unicode::next_line_break(&lb)) {
|
for (const (pos, bpos, mand) => unicode::next_line_break(&lb)) {
|
||||||
fmt::printfln("- {} {}", pos, if (mand) "(mandatory)" else "")!;
|
fmt::printfln("- {}:{} {} (before '{}'/0x{:x})", pos, bpos,
|
||||||
|
if (mand) "(mandatory)" else "",
|
||||||
|
strings::sub(input, pos, pos+1),
|
||||||
|
data[bpos])!;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
|
@ -1,12 +1,13 @@
|
||||||
|
use encoding::utf8;
|
||||||
use strings;
|
use strings;
|
||||||
|
|
||||||
export type line_breaker = struct {
|
export type line_breaker = struct {
|
||||||
input: str,
|
input: str,
|
||||||
iter: strings::iterator,
|
iter: strings::iterator,
|
||||||
// Current position
|
// Current position
|
||||||
pos: uint,
|
pos: size,
|
||||||
// Previous position
|
// Current position, bytes
|
||||||
ppos: uint,
|
bpos: size,
|
||||||
// Current line break class
|
// Current line break class
|
||||||
cur: line_break,
|
cur: line_break,
|
||||||
// Next line break class
|
// Next line break class
|
||||||
|
@ -26,14 +27,14 @@ export fn new_line_breaker(input: str) line_breaker = {
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
// Returns the next line break opportunity as a tuple of the rune-wise index of
|
// Returns the next line break opportunity as a tuple of the rune-wise index,
|
||||||
// the opportunity in the input string and a boolean indicating if the line
|
// byte-wise index, and a boolean indicating whether or not the break is
|
||||||
// break is mandatory at this location. The line break opportunity directly
|
// mandatory at this location. The line break opportunity directly precedes the
|
||||||
// precedes the index returned from this function.
|
// index returned from this function.
|
||||||
//
|
//
|
||||||
// Hello world!
|
// Hello world!
|
||||||
// ^ Line break opportunity at index 6
|
// ^ Line break opportunity at index 6
|
||||||
export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
|
export fn next_line_break(lb: *line_breaker) ((size, size, bool) | done) = {
|
||||||
if (lb.pos == 0) {
|
if (lb.pos == 0) {
|
||||||
if (len(lb.input) == 0) {
|
if (len(lb.input) == 0) {
|
||||||
return done; // special case
|
return done; // special case
|
||||||
|
@ -41,25 +42,28 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
|
||||||
|
|
||||||
lb.iter = strings::iter(lb.input);
|
lb.iter = strings::iter(lb.input);
|
||||||
|
|
||||||
let class = next_lb1_class(lb) as line_break;
|
const (class, rn) = next_lb1_class(lb) as (line_break, rune);
|
||||||
class = resolve_lb2_class(class);
|
class = resolve_lb2_class(class);
|
||||||
lb.cur = class;
|
lb.cur = class;
|
||||||
lb.next = class;
|
lb.next = class;
|
||||||
lb.lb8a = class == line_break::ZWJ;
|
lb.lb8a = class == line_break::ZWJ;
|
||||||
};
|
};
|
||||||
|
|
||||||
for (const next => next_lb1_class(lb)) {
|
for (const (next, rn) => next_lb1_class(lb)) {
|
||||||
const prev = lb.next;
|
const prev = lb.next;
|
||||||
lb.next = next;
|
lb.next = next;
|
||||||
lb.ppos = lb.pos;
|
const rnsz = utf8::runesz(rn);
|
||||||
defer lb.pos += 1;
|
defer {
|
||||||
|
lb.pos += 1;
|
||||||
|
lb.bpos += rnsz;
|
||||||
|
};
|
||||||
|
|
||||||
const mandatory = lb.cur == line_break::BK
|
const mandatory = lb.cur == line_break::BK
|
||||||
|| (lb.cur == line_break::CR
|
|| (lb.cur == line_break::CR
|
||||||
&& lb.next != line_break::LF);
|
&& lb.next != line_break::LF);
|
||||||
if (mandatory) {
|
if (mandatory) {
|
||||||
lb.cur = resolve_lb2_class(next);
|
lb.cur = resolve_lb2_class(next);
|
||||||
return (lb.pos + 1, true);
|
return (lb.pos + 1, lb.bpos + rnsz, true);
|
||||||
};
|
};
|
||||||
|
|
||||||
lb.lb8a = next == line_break::ZWJ;
|
lb.lb8a = next == line_break::ZWJ;
|
||||||
|
@ -74,7 +78,7 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
|
||||||
assert(can_break is bool);
|
assert(can_break is bool);
|
||||||
const can_break = can_break as bool;
|
const can_break = can_break as bool;
|
||||||
if (can_break) {
|
if (can_break) {
|
||||||
return (lb.pos + 1, false);
|
return (lb.pos + 1, lb.bpos + rnsz, false);
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -82,7 +86,7 @@ export fn next_line_break(lb: *line_breaker) ((uint, bool) | done) = {
|
||||||
};
|
};
|
||||||
|
|
||||||
// Applies LB1 suggested rules for resolving context-dependent classes.
|
// Applies LB1 suggested rules for resolving context-dependent classes.
|
||||||
fn next_lb1_class(lb: *line_breaker) (line_break | done) = {
|
fn next_lb1_class(lb: *line_breaker) ((line_break, rune) | done) = {
|
||||||
const rn = match (strings::next(&lb.iter)) {
|
const rn = match (strings::next(&lb.iter)) {
|
||||||
case let rn: rune =>
|
case let rn: rune =>
|
||||||
yield rn;
|
yield rn;
|
||||||
|
@ -93,18 +97,18 @@ fn next_lb1_class(lb: *line_breaker) (line_break | done) = {
|
||||||
const class = rune_line_break(rn);
|
const class = rune_line_break(rn);
|
||||||
switch (class) {
|
switch (class) {
|
||||||
case line_break::AI, line_break::SG, line_break::XX =>
|
case line_break::AI, line_break::SG, line_break::XX =>
|
||||||
return line_break::AL;
|
return (line_break::AL, rn);
|
||||||
case line_break::SA =>
|
case line_break::SA =>
|
||||||
switch (rune_gc(rn)) {
|
switch (rune_gc(rn)) {
|
||||||
case gc::Mn, gc::Mc =>
|
case gc::Mn, gc::Mc =>
|
||||||
return line_break::CM;
|
return (line_break::CM, rn);
|
||||||
case =>
|
case =>
|
||||||
return line_break::AL;
|
return (line_break::AL, rn);
|
||||||
};
|
};
|
||||||
case line_break::CJ =>
|
case line_break::CJ =>
|
||||||
return line_break::NS;
|
return (line_break::NS, rn);
|
||||||
case =>
|
case =>
|
||||||
return class;
|
return (class, rn);
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue