Add line break properties to UCD
Signed-off-by: Drew DeVault <sir@cmpwn.com>
This commit is contained in:
parent
771e9850fc
commit
8183289d6f
4 changed files with 4130 additions and 3655 deletions
|
@ -6,6 +6,7 @@ use unicode;
|
||||||
export fn main() void = {
|
export fn main() void = {
|
||||||
const in = os::args[1];
|
const in = os::args[1];
|
||||||
const iter = strings::iter(in);
|
const iter = strings::iter(in);
|
||||||
|
|
||||||
for (true) {
|
for (true) {
|
||||||
const rn = match (strings::next(&iter)) {
|
const rn = match (strings::next(&iter)) {
|
||||||
case let rn: rune =>
|
case let rn: rune =>
|
||||||
|
@ -14,9 +15,11 @@ export fn main() void = {
|
||||||
};
|
};
|
||||||
const gc = unicode::rune_gc(rn);
|
const gc = unicode::rune_gc(rn);
|
||||||
const sc = unicode::rune_script(rn);
|
const sc = unicode::rune_script(rn);
|
||||||
fmt::printfln("'{}'/{:x}: {} : {}",
|
const lb = unicode::rune_line_break(rn);
|
||||||
|
fmt::printfln("'{}'/{:x}: {} : {} : {}",
|
||||||
rn, rn: u32,
|
rn, rn: u32,
|
||||||
unicode::gc_code(gc),
|
unicode::gc_code(gc),
|
||||||
unicode::script_code(sc))!;
|
unicode::script_code(sc),
|
||||||
|
unicode::line_break_code(lb))!;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
|
@ -89,6 +89,14 @@ EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
|
||||||
|
|
||||||
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
|
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
|
||||||
|
|
||||||
|
LINE_BREAKS = [
|
||||||
|
"XX",
|
||||||
|
"AI", "AL", "B2", "BA", "BB", "BK", "CB", "CJ", "CL", "CM", "CP", "CR",
|
||||||
|
"EB", "EM", "EX", "GL", "H2", "H3", "HL", "HY", "ID", "IN", "IS", "JL",
|
||||||
|
"JT", "JV", "LF", "NL", "NS", "NU", "OP", "PO", "PR", "QU", "RI", "SA",
|
||||||
|
"SG", "SP", "SY", "WJ", "ZW", "ZWJ",
|
||||||
|
]
|
||||||
|
|
||||||
SCRIPT_NAMES = [
|
SCRIPT_NAMES = [
|
||||||
"Common",
|
"Common",
|
||||||
"Inherited",
|
"Inherited",
|
||||||
|
@ -301,7 +309,7 @@ def maketables(trace=0):
|
||||||
|
|
||||||
def makeunicodedata(unicode, trace):
|
def makeunicodedata(unicode, trace):
|
||||||
|
|
||||||
dummy = (0, 0, 0, 0, 0, 0)
|
dummy = (0, 0, 0, 0, 0, 0, 0)
|
||||||
table = [dummy]
|
table = [dummy]
|
||||||
cache = {0: dummy}
|
cache = {0: dummy}
|
||||||
index = [0] * len(unicode.chars)
|
index = [0] * len(unicode.chars)
|
||||||
|
@ -320,8 +328,11 @@ def makeunicodedata(unicode, trace):
|
||||||
mirrored = record.bidi_mirrored == "Y"
|
mirrored = record.bidi_mirrored == "Y"
|
||||||
eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
|
eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
|
||||||
script = SCRIPT_NAMES.index(record.script or "Unknown")
|
script = SCRIPT_NAMES.index(record.script or "Unknown")
|
||||||
|
line_break = LINE_BREAKS.index(record.line_break)
|
||||||
item = (
|
item = (
|
||||||
category, combining, bidirectional, mirrored, eastasianwidth, script,
|
category, combining, bidirectional,
|
||||||
|
mirrored, eastasianwidth, script,
|
||||||
|
line_break,
|
||||||
)
|
)
|
||||||
# add entry to index and item tables
|
# add entry to index and item tables
|
||||||
i = cache.get(item)
|
i = cache.get(item)
|
||||||
|
@ -345,7 +356,7 @@ def makeunicodedata(unicode, trace):
|
||||||
fprint("// List of unique database records")
|
fprint("// List of unique database records")
|
||||||
fprint("const ucd_records: [_]ucd_encodedrec = [")
|
fprint("const ucd_records: [_]ucd_encodedrec = [")
|
||||||
for item in table:
|
for item in table:
|
||||||
fprint(" (%d, %d, %d, %d, %d, %d)," % item)
|
fprint(" (%d, %d, %d, %d, %d, %d, %d)," % item)
|
||||||
fprint("];")
|
fprint("];")
|
||||||
fprint()
|
fprint()
|
||||||
|
|
||||||
|
@ -460,9 +471,12 @@ class UcdRecord:
|
||||||
# From Script.txt
|
# From Script.txt
|
||||||
script: str
|
script: str
|
||||||
|
|
||||||
|
# From LineBreak.txt
|
||||||
|
line_break: str
|
||||||
|
|
||||||
|
|
||||||
def from_row(row: List[str]) -> UcdRecord:
|
def from_row(row: List[str]) -> UcdRecord:
|
||||||
return UcdRecord(*row, None, set(), 0, "Unknown")
|
return UcdRecord(*row, None, set(), 0, "Unknown", "XX")
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------
|
# --------------------------------------------------------------------
|
||||||
|
@ -573,10 +587,12 @@ class UnicodeData:
|
||||||
table[char].binary_properties.add(p)
|
table[char].binary_properties.add(p)
|
||||||
|
|
||||||
for char_range, value in UcdFile(LINE_BREAK, version):
|
for char_range, value in UcdFile(LINE_BREAK, version):
|
||||||
if value not in MANDATORY_LINE_BREAKS:
|
|
||||||
continue
|
|
||||||
for char in expand_range(char_range):
|
for char in expand_range(char_range):
|
||||||
table[char].binary_properties.add('Line_Break')
|
if not table[char]:
|
||||||
|
continue
|
||||||
|
if value in MANDATORY_LINE_BREAKS:
|
||||||
|
table[char].binary_properties.add('Line_Break')
|
||||||
|
table[char].line_break = value
|
||||||
|
|
||||||
# We only want the quickcheck properties
|
# We only want the quickcheck properties
|
||||||
# Format: NF?_QC; Y(es)/N(o)/M(aybe)
|
# Format: NF?_QC; Y(es)/N(o)/M(aybe)
|
||||||
|
|
149
unicode/ucd.ha
149
unicode/ucd.ha
|
@ -1,4 +1,4 @@
|
||||||
type ucd_encodedrec = (u8, u8, u8, u8, u8, u16);
|
type ucd_encodedrec = (u8, u8, u8, u8, u8, u16, u8);
|
||||||
|
|
||||||
type ucd_record = struct {
|
type ucd_record = struct {
|
||||||
category: u8,
|
category: u8,
|
||||||
|
@ -7,6 +7,7 @@ type ucd_record = struct {
|
||||||
mirrored: u8,
|
mirrored: u8,
|
||||||
east_asian_width: u8,
|
east_asian_width: u8,
|
||||||
script: u16,
|
script: u16,
|
||||||
|
line_break: u8,
|
||||||
};
|
};
|
||||||
|
|
||||||
fn get_ucdrecord(rn: rune) *ucd_record = {
|
fn get_ucdrecord(rn: rune) *ucd_record = {
|
||||||
|
@ -296,7 +297,7 @@ export type script = enum u16 {
|
||||||
MATH, // Zmth
|
MATH, // Zmth
|
||||||
};
|
};
|
||||||
|
|
||||||
// Returns the [[general_category]] corresponding to this rune.
|
// Returns the [[script]] corresponding to this rune.
|
||||||
export fn rune_script(rn: rune) script = {
|
export fn rune_script(rn: rune) script = {
|
||||||
return get_ucdrecord(rn).script: script;
|
return get_ucdrecord(rn).script: script;
|
||||||
};
|
};
|
||||||
|
@ -471,3 +472,147 @@ export fn script_code(sc: script) const str = {
|
||||||
case script::NAG_MUNDARI => return "Nagm";
|
case script::NAG_MUNDARI => return "Nagm";
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Line break classification.
|
||||||
|
export type line_break = enum u8 {
|
||||||
|
XX,
|
||||||
|
AI,
|
||||||
|
AL,
|
||||||
|
B2,
|
||||||
|
BA,
|
||||||
|
BB,
|
||||||
|
BK,
|
||||||
|
CB,
|
||||||
|
CJ,
|
||||||
|
CL,
|
||||||
|
CM,
|
||||||
|
CP,
|
||||||
|
CR,
|
||||||
|
EB,
|
||||||
|
EM,
|
||||||
|
EX,
|
||||||
|
GL,
|
||||||
|
H2,
|
||||||
|
H3,
|
||||||
|
HL,
|
||||||
|
HY,
|
||||||
|
ID,
|
||||||
|
IN,
|
||||||
|
IS,
|
||||||
|
JL,
|
||||||
|
JT,
|
||||||
|
JV,
|
||||||
|
LF,
|
||||||
|
NL,
|
||||||
|
NS,
|
||||||
|
NU,
|
||||||
|
OP,
|
||||||
|
PO,
|
||||||
|
PR,
|
||||||
|
QU,
|
||||||
|
RI,
|
||||||
|
SA,
|
||||||
|
SG,
|
||||||
|
SP,
|
||||||
|
SY,
|
||||||
|
WJ,
|
||||||
|
ZW,
|
||||||
|
ZWJ,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Returns the [[line_break]] classification corresponding to this rune.
|
||||||
|
export fn rune_line_break(rn: rune) line_break = {
|
||||||
|
return get_ucdrecord(rn).line_break: line_break;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Returns the short code (two or three characters) for a [[line_break]] value.
|
||||||
|
export fn line_break_code(lb: line_break) const str = {
|
||||||
|
switch (lb) {
|
||||||
|
case line_break::XX =>
|
||||||
|
return "XX";
|
||||||
|
case line_break::AI =>
|
||||||
|
return "AI";
|
||||||
|
case line_break::AL =>
|
||||||
|
return "AL";
|
||||||
|
case line_break::B2 =>
|
||||||
|
return "B2";
|
||||||
|
case line_break::BA =>
|
||||||
|
return "BA";
|
||||||
|
case line_break::BB =>
|
||||||
|
return "BB";
|
||||||
|
case line_break::BK =>
|
||||||
|
return "BK";
|
||||||
|
case line_break::CB =>
|
||||||
|
return "CB";
|
||||||
|
case line_break::CJ =>
|
||||||
|
return "CJ";
|
||||||
|
case line_break::CL =>
|
||||||
|
return "CL";
|
||||||
|
case line_break::CM =>
|
||||||
|
return "CM";
|
||||||
|
case line_break::CP =>
|
||||||
|
return "CP";
|
||||||
|
case line_break::CR =>
|
||||||
|
return "CR";
|
||||||
|
case line_break::EB =>
|
||||||
|
return "EB";
|
||||||
|
case line_break::EM =>
|
||||||
|
return "EM";
|
||||||
|
case line_break::EX =>
|
||||||
|
return "EX";
|
||||||
|
case line_break::GL =>
|
||||||
|
return "GL";
|
||||||
|
case line_break::H2 =>
|
||||||
|
return "H2";
|
||||||
|
case line_break::H3 =>
|
||||||
|
return "H3";
|
||||||
|
case line_break::HL =>
|
||||||
|
return "HL";
|
||||||
|
case line_break::HY =>
|
||||||
|
return "HY";
|
||||||
|
case line_break::ID =>
|
||||||
|
return "ID";
|
||||||
|
case line_break::IN =>
|
||||||
|
return "IN";
|
||||||
|
case line_break::IS =>
|
||||||
|
return "IS";
|
||||||
|
case line_break::JL =>
|
||||||
|
return "JL";
|
||||||
|
case line_break::JT =>
|
||||||
|
return "JT";
|
||||||
|
case line_break::JV =>
|
||||||
|
return "JV";
|
||||||
|
case line_break::LF =>
|
||||||
|
return "LF";
|
||||||
|
case line_break::NL =>
|
||||||
|
return "NL";
|
||||||
|
case line_break::NS =>
|
||||||
|
return "NS";
|
||||||
|
case line_break::NU =>
|
||||||
|
return "NU";
|
||||||
|
case line_break::OP =>
|
||||||
|
return "OP";
|
||||||
|
case line_break::PO =>
|
||||||
|
return "PO";
|
||||||
|
case line_break::PR =>
|
||||||
|
return "PR";
|
||||||
|
case line_break::QU =>
|
||||||
|
return "QU";
|
||||||
|
case line_break::RI =>
|
||||||
|
return "RI";
|
||||||
|
case line_break::SA =>
|
||||||
|
return "SA";
|
||||||
|
case line_break::SG =>
|
||||||
|
return "SG";
|
||||||
|
case line_break::SP =>
|
||||||
|
return "SP";
|
||||||
|
case line_break::SY =>
|
||||||
|
return "SY";
|
||||||
|
case line_break::WJ =>
|
||||||
|
return "WJ";
|
||||||
|
case line_break::ZW =>
|
||||||
|
return "ZW";
|
||||||
|
case line_break::ZWJ =>
|
||||||
|
return "ZWJ";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
7599
unicode/ucd_gen.ha
7599
unicode/ucd_gen.ha
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue