Add line break properties to UCD
Signed-off-by: Drew DeVault <sir@cmpwn.com>
This commit is contained in:
parent
771e9850fc
commit
8183289d6f
4 changed files with 4130 additions and 3655 deletions
|
@ -6,6 +6,7 @@ use unicode;
|
|||
export fn main() void = {
|
||||
const in = os::args[1];
|
||||
const iter = strings::iter(in);
|
||||
|
||||
for (true) {
|
||||
const rn = match (strings::next(&iter)) {
|
||||
case let rn: rune =>
|
||||
|
@ -14,9 +15,11 @@ export fn main() void = {
|
|||
};
|
||||
const gc = unicode::rune_gc(rn);
|
||||
const sc = unicode::rune_script(rn);
|
||||
fmt::printfln("'{}'/{:x}: {} : {}",
|
||||
const lb = unicode::rune_line_break(rn);
|
||||
fmt::printfln("'{}'/{:x}: {} : {} : {}",
|
||||
rn, rn: u32,
|
||||
unicode::gc_code(gc),
|
||||
unicode::script_code(sc))!;
|
||||
unicode::script_code(sc),
|
||||
unicode::line_break_code(lb))!;
|
||||
};
|
||||
};
|
||||
|
|
|
@ -89,6 +89,14 @@ EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
|
|||
|
||||
# Line_Break values for which the LineBreak.txt loader also records the
# binary 'Line_Break' property on the character.
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
|
||||
|
||||
# Line_Break values as they appear in LineBreak.txt.
#
# makeunicodedata() stores LINE_BREAKS.index(value) in every generated record,
# and the Hare side casts that number to the line_break enum in
# unicode/ucd.ha, so the order of this list must match the order of the enum
# members. "XX" is first, so index 0 is the default that from_row() assigns.
LINE_BREAKS = [
|
||||
"XX",
|
||||
"AI", "AL", "B2", "BA", "BB", "BK", "CB", "CJ", "CL", "CM", "CP", "CR",
|
||||
"EB", "EM", "EX", "GL", "H2", "H3", "HL", "HY", "ID", "IN", "IS", "JL",
|
||||
"JT", "JV", "LF", "NL", "NS", "NU", "OP", "PO", "PR", "QU", "RI", "SA",
|
||||
"SG", "SP", "SY", "WJ", "ZW", "ZWJ",
|
||||
]
|
||||
|
||||
SCRIPT_NAMES = [
|
||||
"Common",
|
||||
"Inherited",
|
||||
|
@ -301,7 +309,7 @@ def maketables(trace=0):
|
|||
|
||||
def makeunicodedata(unicode, trace):
|
||||
|
||||
dummy = (0, 0, 0, 0, 0, 0)
|
||||
dummy = (0, 0, 0, 0, 0, 0, 0)
|
||||
table = [dummy]
|
||||
cache = {0: dummy}
|
||||
index = [0] * len(unicode.chars)
|
||||
|
@ -320,8 +328,11 @@ def makeunicodedata(unicode, trace):
|
|||
mirrored = record.bidi_mirrored == "Y"
|
||||
eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
|
||||
script = SCRIPT_NAMES.index(record.script or "Unknown")
|
||||
line_break = LINE_BREAKS.index(record.line_break)
|
||||
item = (
|
||||
category, combining, bidirectional, mirrored, eastasianwidth, script,
|
||||
category, combining, bidirectional,
|
||||
mirrored, eastasianwidth, script,
|
||||
line_break,
|
||||
)
|
||||
# add entry to index and item tables
|
||||
i = cache.get(item)
|
||||
|
@ -345,7 +356,7 @@ def makeunicodedata(unicode, trace):
|
|||
fprint("// List of unique database records")
|
||||
fprint("const ucd_records: [_]ucd_encodedrec = [")
|
||||
for item in table:
|
||||
fprint(" (%d, %d, %d, %d, %d, %d)," % item)
|
||||
fprint(" (%d, %d, %d, %d, %d, %d, %d)," % item)
|
||||
fprint("];")
|
||||
fprint()
|
||||
|
||||
|
@ -460,9 +471,12 @@ class UcdRecord:
|
|||
# From Script.txt
|
||||
script: str
|
||||
|
||||
# From LineBreak.txt
|
||||
line_break: str
|
||||
|
||||
|
||||
def from_row(row: List[str]) -> UcdRecord:
|
||||
return UcdRecord(*row, None, set(), 0, "Unknown")
|
||||
return UcdRecord(*row, None, set(), 0, "Unknown", "XX")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
|
@ -573,10 +587,12 @@ class UnicodeData:
|
|||
table[char].binary_properties.add(p)
|
||||
|
||||
for char_range, value in UcdFile(LINE_BREAK, version):
|
||||
if value not in MANDATORY_LINE_BREAKS:
|
||||
continue
|
||||
for char in expand_range(char_range):
|
||||
table[char].binary_properties.add('Line_Break')
|
||||
if not table[char]:
|
||||
continue
|
||||
if value in MANDATORY_LINE_BREAKS:
|
||||
table[char].binary_properties.add('Line_Break')
|
||||
table[char].line_break = value
|
||||
|
||||
# We only want the quickcheck properties
|
||||
# Format: NF?_QC; Y(es)/N(o)/M(aybe)
|
||||
|
|
149
unicode/ucd.ha
149
unicode/ucd.ha
|
@ -1,4 +1,4 @@
|
|||
type ucd_encodedrec = (u8, u8, u8, u8, u8, u16);
|
||||
type ucd_encodedrec = (u8, u8, u8, u8, u8, u16, u8);
|
||||
|
||||
type ucd_record = struct {
|
||||
category: u8,
|
||||
|
@ -7,6 +7,7 @@ type ucd_record = struct {
|
|||
mirrored: u8,
|
||||
east_asian_width: u8,
|
||||
script: u16,
|
||||
line_break: u8,
|
||||
};
|
||||
|
||||
fn get_ucdrecord(rn: rune) *ucd_record = {
|
||||
|
@ -296,7 +297,7 @@ export type script = enum u16 {
|
|||
MATH, // Zmth
|
||||
};
|
||||
|
||||
// Returns the [[general_category]] corresponding to this rune.
|
||||
// Returns the [[script]] corresponding to this rune.
|
||||
export fn rune_script(rn: rune) script = {
	// The record keeps the script as a raw u16; convert it back to the enum.
	const rec = get_ucdrecord(rn);
	return rec.script: script;
};
|
||||
|
@ -471,3 +472,147 @@ export fn script_code(sc: script) const str = {
|
|||
case script::NAG_MUNDARI => return "Nagm";
|
||||
};
|
||||
};
|
||||
|
||||
// Line break classification of a code point. The members are the short
// aliases of the Unicode Line_Break property (LineBreak.txt, UAX #14).
//
// The numeric values are written by the table generator as indices into its
// LINE_BREAKS list, so the member order here must match that list.
|
||||
export type line_break = enum u8 {
|
||||
XX, // Unknown
|
||||
AI, // Ambiguous (alphabetic or ideographic)
|
||||
AL, // Alphabetic
|
||||
B2, // Break opportunity before and after
|
||||
BA, // Break after
|
||||
BB, // Break before
|
||||
BK, // Mandatory break
|
||||
CB, // Contingent break opportunity
|
||||
CJ, // Conditional Japanese starter
|
||||
CL, // Close punctuation
|
||||
CM, // Combining mark
|
||||
CP, // Close parenthesis
|
||||
CR, // Carriage return
|
||||
EB, // Emoji base
|
||||
EM, // Emoji modifier
|
||||
EX, // Exclamation/interrogation
|
||||
GL, // Non-breaking ("glue")
|
||||
H2, // Hangul LV syllable
|
||||
H3, // Hangul LVT syllable
|
||||
HL, // Hebrew letter
|
||||
HY, // Hyphen
|
||||
ID, // Ideographic
|
||||
IN, // Inseparable
|
||||
IS, // Infix numeric separator
|
||||
JL, // Hangul L jamo
|
||||
JT, // Hangul T jamo
|
||||
JV, // Hangul V jamo
|
||||
LF, // Line feed
|
||||
NL, // Next line
|
||||
NS, // Nonstarter
|
||||
NU, // Numeric
|
||||
OP, // Open punctuation
|
||||
PO, // Postfix numeric
|
||||
PR, // Prefix numeric
|
||||
QU, // Quotation
|
||||
RI, // Regional indicator
|
||||
SA, // Complex context dependent (South East Asian)
|
||||
SG, // Surrogate
|
||||
SP, // Space
|
||||
SY, // Symbols allowing break after
|
||||
WJ, // Word joiner
|
||||
ZW, // Zero width space
|
||||
ZWJ, // Zero width joiner
|
||||
};
|
||||
|
||||
// Looks up the [[line_break]] class stored for this rune in its Unicode
// character database record.
export fn rune_line_break(rn: rune) line_break = {
	// The record keeps the class as a raw u8; convert it back to the enum.
	const rec = get_ucdrecord(rn);
	return rec.line_break: line_break;
};
|
||||
|
||||
// Returns the short code associated with a [[line_break]] value, spelled
// exactly like the enum member. Codes are two characters long, except for
// "ZWJ", which is three.
export fn line_break_code(lb: line_break) const str = {
	switch (lb) {
	case line_break::XX => return "XX";
	case line_break::AI => return "AI";
	case line_break::AL => return "AL";
	case line_break::B2 => return "B2";
	case line_break::BA => return "BA";
	case line_break::BB => return "BB";
	case line_break::BK => return "BK";
	case line_break::CB => return "CB";
	case line_break::CJ => return "CJ";
	case line_break::CL => return "CL";
	case line_break::CM => return "CM";
	case line_break::CP => return "CP";
	case line_break::CR => return "CR";
	case line_break::EB => return "EB";
	case line_break::EM => return "EM";
	case line_break::EX => return "EX";
	case line_break::GL => return "GL";
	case line_break::H2 => return "H2";
	case line_break::H3 => return "H3";
	case line_break::HL => return "HL";
	case line_break::HY => return "HY";
	case line_break::ID => return "ID";
	case line_break::IN => return "IN";
	case line_break::IS => return "IS";
	case line_break::JL => return "JL";
	case line_break::JT => return "JT";
	case line_break::JV => return "JV";
	case line_break::LF => return "LF";
	case line_break::NL => return "NL";
	case line_break::NS => return "NS";
	case line_break::NU => return "NU";
	case line_break::OP => return "OP";
	case line_break::PO => return "PO";
	case line_break::PR => return "PR";
	case line_break::QU => return "QU";
	case line_break::RI => return "RI";
	case line_break::SA => return "SA";
	case line_break::SG => return "SG";
	case line_break::SP => return "SP";
	case line_break::SY => return "SY";
	case line_break::WJ => return "WJ";
	case line_break::ZW => return "ZW";
	case line_break::ZWJ => return "ZWJ";
	};
};
|
||||
|
|
7599
unicode/ucd_gen.ha
7599
unicode/ucd_gen.ha
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue