Add line break properties to UCD

Signed-off-by: Drew DeVault <sir@cmpwn.com>
This commit is contained in:
Drew DeVault 2024-04-16 13:17:50 +02:00
parent 771e9850fc
commit 8183289d6f
4 changed files with 4130 additions and 3655 deletions

View file

@ -6,6 +6,7 @@ use unicode;
export fn main() void = { export fn main() void = {
const in = os::args[1]; const in = os::args[1];
const iter = strings::iter(in); const iter = strings::iter(in);
for (true) { for (true) {
const rn = match (strings::next(&iter)) { const rn = match (strings::next(&iter)) {
case let rn: rune => case let rn: rune =>
@ -14,9 +15,11 @@ export fn main() void = {
}; };
const gc = unicode::rune_gc(rn); const gc = unicode::rune_gc(rn);
const sc = unicode::rune_script(rn); const sc = unicode::rune_script(rn);
fmt::printfln("'{}'/{:x}: {} : {}", const lb = unicode::rune_line_break(rn);
fmt::printfln("'{}'/{:x}: {} : {} : {}",
rn, rn: u32, rn, rn: u32,
unicode::gc_code(gc), unicode::gc_code(gc),
unicode::script_code(sc))!; unicode::script_code(sc),
unicode::line_break_code(lb))!;
}; };
}; };

View file

@ -89,6 +89,14 @@ EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ] MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
# Line break class codes, as they are spelled in LineBreak.txt.
#
# makeunicodedata() stores a character's class as its index in this list, and
# the Hare side casts that number directly to its line_break enum, so the
# order here must match the enum declaration exactly.  "XX" stays at index 0:
# it is the default given to records that LineBreak.txt does not override.
LINE_BREAKS = [
    "XX",
    "AI", "AL",
    "B2", "BA", "BB", "BK",
    "CB", "CJ", "CL", "CM", "CP", "CR",
    "EB", "EM", "EX",
    "GL",
    "H2", "H3", "HL", "HY",
    "ID", "IN", "IS",
    "JL", "JT", "JV",
    "LF",
    "NL", "NS", "NU",
    "OP",
    "PO", "PR",
    "QU",
    "RI",
    "SA", "SG", "SP", "SY",
    "WJ",
    "ZW", "ZWJ",
]
SCRIPT_NAMES = [ SCRIPT_NAMES = [
"Common", "Common",
"Inherited", "Inherited",
@ -301,7 +309,7 @@ def maketables(trace=0):
def makeunicodedata(unicode, trace): def makeunicodedata(unicode, trace):
dummy = (0, 0, 0, 0, 0, 0) dummy = (0, 0, 0, 0, 0, 0, 0)
table = [dummy] table = [dummy]
cache = {0: dummy} cache = {0: dummy}
index = [0] * len(unicode.chars) index = [0] * len(unicode.chars)
@ -320,8 +328,11 @@ def makeunicodedata(unicode, trace):
mirrored = record.bidi_mirrored == "Y" mirrored = record.bidi_mirrored == "Y"
eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width) eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
script = SCRIPT_NAMES.index(record.script or "Unknown") script = SCRIPT_NAMES.index(record.script or "Unknown")
line_break = LINE_BREAKS.index(record.line_break)
item = ( item = (
category, combining, bidirectional, mirrored, eastasianwidth, script, category, combining, bidirectional,
mirrored, eastasianwidth, script,
line_break,
) )
# add entry to index and item tables # add entry to index and item tables
i = cache.get(item) i = cache.get(item)
@ -345,7 +356,7 @@ def makeunicodedata(unicode, trace):
fprint("// List of unique database records") fprint("// List of unique database records")
fprint("const ucd_records: [_]ucd_encodedrec = [") fprint("const ucd_records: [_]ucd_encodedrec = [")
for item in table: for item in table:
fprint(" (%d, %d, %d, %d, %d, %d)," % item) fprint(" (%d, %d, %d, %d, %d, %d, %d)," % item)
fprint("];") fprint("];")
fprint() fprint()
@ -460,9 +471,12 @@ class UcdRecord:
# From Script.txt # From Script.txt
script: str script: str
# From LineBreak.txt
line_break: str
def from_row(row: List[str]) -> UcdRecord: def from_row(row: List[str]) -> UcdRecord:
return UcdRecord(*row, None, set(), 0, "Unknown") return UcdRecord(*row, None, set(), 0, "Unknown", "XX")
# -------------------------------------------------------------------- # --------------------------------------------------------------------
@ -573,10 +587,12 @@ class UnicodeData:
table[char].binary_properties.add(p) table[char].binary_properties.add(p)
for char_range, value in UcdFile(LINE_BREAK, version): for char_range, value in UcdFile(LINE_BREAK, version):
if value not in MANDATORY_LINE_BREAKS:
continue
for char in expand_range(char_range): for char in expand_range(char_range):
table[char].binary_properties.add('Line_Break') if not table[char]:
continue
if value in MANDATORY_LINE_BREAKS:
table[char].binary_properties.add('Line_Break')
table[char].line_break = value
# We only want the quickcheck properties # We only want the quickcheck properties
# Format: NF?_QC; Y(es)/N(o)/M(aybe) # Format: NF?_QC; Y(es)/N(o)/M(aybe)

View file

@ -1,4 +1,4 @@
type ucd_encodedrec = (u8, u8, u8, u8, u8, u16); type ucd_encodedrec = (u8, u8, u8, u8, u8, u16, u8);
type ucd_record = struct { type ucd_record = struct {
category: u8, category: u8,
@ -7,6 +7,7 @@ type ucd_record = struct {
mirrored: u8, mirrored: u8,
east_asian_width: u8, east_asian_width: u8,
script: u16, script: u16,
line_break: u8,
}; };
fn get_ucdrecord(rn: rune) *ucd_record = { fn get_ucdrecord(rn: rune) *ucd_record = {
@ -296,7 +297,7 @@ export type script = enum u16 {
MATH, // Zmth MATH, // Zmth
}; };
// Returns the [[general_category]] corresponding to this rune. // Returns the [[script]] corresponding to this rune.
export fn rune_script(rn: rune) script = { export fn rune_script(rn: rune) script = {
return get_ucdrecord(rn).script: script; return get_ucdrecord(rn).script: script;
}; };
@ -471,3 +472,147 @@ export fn script_code(sc: script) const str = {
case script::NAG_MUNDARI => return "Nagm"; case script::NAG_MUNDARI => return "Nagm";
}; };
}; };
// Line break classification.
//
// Each member is named after the class code it stands for;
// [[line_break_code]] returns that code as a string.
//
// The numeric value of a member is its position in this declaration.
// [[rune_line_break]] casts the u8 stored in the generated UCD record straight
// to this type, and the table generator computes that u8 as an index into its
// LINE_BREAKS list. The two lists must therefore be kept in the same order,
// and a new class has to be added to both.
//
// XX comes first (value 0); it is the generator's default for records that
// LineBreak.txt does not assign a class to.
export type line_break = enum u8 {
XX,
AI,
AL,
B2,
BA,
BB,
BK,
CB,
CJ,
CL,
CM,
CP,
CR,
EB,
EM,
EX,
GL,
H2,
H3,
HL,
HY,
ID,
IN,
IS,
JL,
JT,
JV,
LF,
NL,
NS,
NU,
OP,
PO,
PR,
QU,
RI,
SA,
SG,
SP,
SY,
WJ,
ZW,
ZWJ,
};
// Looks up the [[line_break]] class recorded for this rune in the Unicode
// character database tables.
export fn rune_line_break(rn: rune) line_break = {
	const rec = get_ucdrecord(rn);
	// The record stores the class as a plain u8; reinterpret it as the enum.
	return rec.line_break: line_break;
};
// Returns the short code naming a [[line_break]] value, spelled exactly like
// the enum member (for example "AL" or "ZWJ"). Most codes are two characters
// long; ZWJ is three.
export fn line_break_code(lb: line_break) const str = {
	switch (lb) {
	case line_break::XX => return "XX";
	case line_break::AI => return "AI";
	case line_break::AL => return "AL";
	case line_break::B2 => return "B2";
	case line_break::BA => return "BA";
	case line_break::BB => return "BB";
	case line_break::BK => return "BK";
	case line_break::CB => return "CB";
	case line_break::CJ => return "CJ";
	case line_break::CL => return "CL";
	case line_break::CM => return "CM";
	case line_break::CP => return "CP";
	case line_break::CR => return "CR";
	case line_break::EB => return "EB";
	case line_break::EM => return "EM";
	case line_break::EX => return "EX";
	case line_break::GL => return "GL";
	case line_break::H2 => return "H2";
	case line_break::H3 => return "H3";
	case line_break::HL => return "HL";
	case line_break::HY => return "HY";
	case line_break::ID => return "ID";
	case line_break::IN => return "IN";
	case line_break::IS => return "IS";
	case line_break::JL => return "JL";
	case line_break::JT => return "JT";
	case line_break::JV => return "JV";
	case line_break::LF => return "LF";
	case line_break::NL => return "NL";
	case line_break::NS => return "NS";
	case line_break::NU => return "NU";
	case line_break::OP => return "OP";
	case line_break::PO => return "PO";
	case line_break::PR => return "PR";
	case line_break::QU => return "QU";
	case line_break::RI => return "RI";
	case line_break::SA => return "SA";
	case line_break::SG => return "SG";
	case line_break::SP => return "SP";
	case line_break::SY => return "SY";
	case line_break::WJ => return "WJ";
	case line_break::ZW => return "ZW";
	case line_break::ZWJ => return "ZWJ";
	};
};

File diff suppressed because it is too large Load diff