Implement script detection

This commit is contained in:
Drew DeVault 2024-01-26 11:16:11 +01:00
parent 04940cea04
commit 2097b64da5
4 changed files with 4288 additions and 2716 deletions

View file

@ -13,6 +13,10 @@ export fn main() void = {
case => break; case => break;
}; };
const gc = unicode::rune_gc(rn); const gc = unicode::rune_gc(rn);
fmt::printfln("'{}'/{:x}: {}", rn, rn: u32, unicode::gc_code(gc))!; const sc = unicode::rune_script(rn);
fmt::printfln("'{}'/{:x}: {} : {}",
rn, rn: u32,
unicode::gc_code(gc),
unicode::script_code(sc))!;
}; };
}; };

View file

@ -61,6 +61,7 @@ NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt" NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt" SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt" CASE_FOLDING = "CaseFolding%s.txt"
SCRIPTS = "Scripts%s.txt"
# Private Use Areas -- in planes 1, 15, 16 # Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900) PUA_1 = range(0xE000, 0xF900)
@ -88,6 +89,173 @@ EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ] MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
# Names of the Unicode Script property values, spelled the way they are looked
# up from the parsed Scripts data (see SCRIPT_NAMES.index() in
# makeunicodedata).
#
# The position of a name in this list is the number written into the generated
# record table, and the consumer casts that number straight to its `script`
# enum. Keep the order of this list in sync with that enum; append or reorder
# in both places at once.
#
# "Unknown" doubles as the fallback for records that have no script assigned.
SCRIPT_NAMES = [
"Common",
"Inherited",
"Unknown",
"Adlam",
"Caucasian_Albanian",
"Ahom",
"Arabic",
"Imperial_Aramaic",
"Armenian",
"Avestan",
"Balinese",
"Bamum",
"Bassa_Vah",
"Batak",
"Bengali",
"Bhaiksuki",
"Bopomofo",
"Brahmi",
"Braille",
"Buginese",
"Buhid",
"Chakma",
"Canadian_Aboriginal",
"Carian",
"Cham",
"Cherokee",
"Chorasmian",
"Coptic",
"Cypro_Minoan",
"Cypriot",
"Cyrillic",
"Devanagari",
"Dives_Akuru",
"Dogra",
"Deseret",
"Duployan",
"Egyptian_Hieroglyphs",
"Elbasan",
"Elymaic",
"Ethiopic",
"Georgian",
"Glagolitic",
"Gunjala_Gondi",
"Masaram_Gondi",
"Gothic",
"Grantha",
"Greek",
"Gujarati",
"Gurmukhi",
"Hangul",
"Han",
"Hanunoo",
"Hatran",
"Hebrew",
"Hiragana",
"Anatolian_Hieroglyphs",
"Pahawh_Hmong",
"Nyiakeng_Puachue_Hmong",
"Old_Hungarian",
"Old_Italic",
"Javanese",
"Kayah_Li",
"Katakana",
"Kawi",
"Kharoshthi",
"Khmer",
"Khojki",
"Khitan_Small_Script",
"Kannada",
"Kaithi",
"Tai_Tham",
"Lao",
"Latin",
"Lepcha",
"Limbu",
"Linear_A",
"Linear_B",
"Lisu",
"Lycian",
"Lydian",
"Mahajani",
"Makasar",
"Mandaic",
"Manichaean",
"Marchen",
"Medefaidrin",
"Mende_Kikakui",
"Meroitic_Cursive",
"Meroitic_Hieroglyphs",
"Malayalam",
"Modi",
"Mongolian",
"Mro",
"Meetei_Mayek",
"Multani",
"Myanmar",
"Nag_Mundari",
"Nandinagari",
"Old_North_Arabian",
"Nabataean",
"Newa",
"Nko",
"Nushu",
"Ogham",
"Ol_Chiki",
"Old_Turkic",
"Oriya",
"Osage",
"Osmanya",
"Old_Uyghur",
"Palmyrene",
"Pau_Cin_Hau",
"Old_Permic",
"Phags_Pa",
"Inscriptional_Pahlavi",
"Psalter_Pahlavi",
"Phoenician",
"Miao",
"Inscriptional_Parthian",
"Rejang",
"Hanifi_Rohingya",
"Runic",
"Samaritan",
"Old_South_Arabian",
"Saurashtra",
"SignWriting",
"Shavian",
"Sharada",
"Siddham",
"Khudawadi",
"Sinhala",
"Sogdian",
"Old_Sogdian",
"Sora_Sompeng",
"Soyombo",
"Sundanese",
"Syloti_Nagri",
"Syriac",
"Tagbanwa",
"Takri",
"Tai_Le",
"New_Tai_Lue",
"Tamil",
"Tangut",
"Tai_Viet",
"Telugu",
"Tifinagh",
"Tagalog",
"Thaana",
"Thai",
"Tibetan",
"Tirhuta",
"Tangsa",
"Toto",
"Ugaritic",
"Vai",
"Vithkuqi",
"Warang_Citi",
"Wancho",
"Old_Persian",
"Cuneiform",
"Yezidi",
"Yi",
"Zanabazar_Square",
]
# note: should match definitions in Objects/unicodectype.c # note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01 ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02 DECIMAL_MASK = 0x02
@ -133,7 +301,7 @@ def maketables(trace=0):
def makeunicodedata(unicode, trace): def makeunicodedata(unicode, trace):
dummy = (0, 0, 0, 0, 0) dummy = (0, 0, 0, 0, 0, 0)
table = [dummy] table = [dummy]
cache = {0: dummy} cache = {0: dummy}
index = [0] * len(unicode.chars) index = [0] * len(unicode.chars)
@ -151,8 +319,9 @@ def makeunicodedata(unicode, trace):
bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class) bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
mirrored = record.bidi_mirrored == "Y" mirrored = record.bidi_mirrored == "Y"
eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width) eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
script = SCRIPT_NAMES.index(record.script or "Unknown")
item = ( item = (
category, combining, bidirectional, mirrored, eastasianwidth, category, combining, bidirectional, mirrored, eastasianwidth, script,
) )
# add entry to index and item tables # add entry to index and item tables
i = cache.get(item) i = cache.get(item)
@ -176,7 +345,7 @@ def makeunicodedata(unicode, trace):
fprint("// List of unique database records") fprint("// List of unique database records")
fprint("const ucd_records: [_]ucd_encodedrec = [") fprint("const ucd_records: [_]ucd_encodedrec = [")
for item in table: for item in table:
fprint(" (%d, %d, %d, %d, %d)," % item) fprint(" (%d, %d, %d, %d, %d, %d)," % item)
fprint("];") fprint("];")
fprint() fprint()
@ -288,9 +457,12 @@ class UcdRecord:
# We store them as a bitmask. # We store them as a bitmask.
quick_check: int quick_check: int
# From Scripts.txt
script: str
def from_row(row: List[str]) -> UcdRecord: def from_row(row: List[str]) -> UcdRecord:
return UcdRecord(*row, None, set(), 0) return UcdRecord(*row, None, set(), 0, "Unknown")
# -------------------------------------------------------------------- # --------------------------------------------------------------------
@ -386,6 +558,14 @@ class UnicodeData:
if table[i] is not None: if table[i] is not None:
table[i].east_asian_width = widths[i] table[i].east_asian_width = widths[i]
scripts = [None] * 0x110000
for char, (script,) in UcdFile(SCRIPTS, version).expanded():
scripts[char] = script
for i in range(0, 0x110000):
if table[i] is not None:
table[i].script = scripts[i]
for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded(): for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
if table[char]: if table[char]:
# Some properties (e.g. Default_Ignorable_Code_Point) # Some properties (e.g. Default_Ignorable_Code_Point)

View file

@ -1,4 +1,4 @@
type ucd_encodedrec = (u8, u8, u8, u8, u8); type ucd_encodedrec = (u8, u8, u8, u8, u8, u16);
type ucd_record = struct { type ucd_record = struct {
category: u8, category: u8,
@ -6,11 +6,12 @@ type ucd_record = struct {
bidirectional: u8, bidirectional: u8,
mirrored: u8, mirrored: u8,
east_asian_width: u8, east_asian_width: u8,
script: u16,
}; };
fn get_ucdrecord(rn: rune) *ucd_record = { fn get_ucdrecord(rn: rune) *ucd_record = {
const code = rn: u32; const code = rn: u32;
let index = 0u8; let index = 0u16;
if (code < 0x110000) { if (code < 0x110000) {
index = index1[(code>>UCD_RECORD_SHIFT)]; index = index1[(code>>UCD_RECORD_SHIFT)];
index = index2[(index<<UCD_RECORD_SHIFT)+(code&((1<<UCD_RECORD_SHIFT)-1))]; index = index2[(index<<UCD_RECORD_SHIFT)+(code&((1<<UCD_RECORD_SHIFT)-1))];
@ -92,3 +93,348 @@ export fn gc_code(v: gc) const str = {
case gc::SPACE_SEPARATOR => return "Zs"; case gc::SPACE_SEPARATOR => return "Zs";
}; };
}; };
// Unicode character Script property values. Each member is annotated with the
// four-character code that [[script_code]] returns for it.
//
// The numeric value of each member is the index of the corresponding name in
// the SCRIPT_NAMES list of the table generator script; that index is what gets
// stored in the generated record table and cast back to this type by
// [[rune_script]]. The member order must therefore match SCRIPT_NAMES exactly.
export type script = enum u16 {
COMMON, // Zyyy
INHERITED, // Zinh
UNKNOWN, // Zzzz
ADLAM, // Adlm
CAUCASIAN_ALBANIAN, // Aghb
AHOM, // Ahom
ARABIC, // Arab
IMPERIAL_ARAMAIC, // Armi
ARMENIAN, // Armn
AVESTAN, // Avst
BALINESE, // Bali
BAMUM, // Bamu
BASSA_VAH, // Bass
BATAK, // Batk
BENGALI, // Beng
BHAIKSUKI, // Bhks
BOPOMOFO, // Bopo
BRAHMI, // Brah
BRAILLE, // Brai
BUGINESE, // Bugi
BUHID, // Buhd
CHAKMA, // Cakm
CANADIAN_SYLLABICS, // Cans
CARIAN, // Cari
CHAM, // Cham
CHEROKEE, // Cher
CHORASMIAN, // Chrs
COPTIC, // Copt
CYPRO_MINOAN, // Cpmn
CYPRIOT, // Cprt
CYRILLIC, // Cyrl
DEVANAGARI, // Deva
DIVES_AKURU, // Diak
DOGRA, // Dogr
DESERET, // Dsrt
DUPLOYAN, // Dupl
EGYPTIAN_HIEROGLYPHS, // Egyp
ELBASAN, // Elba
ELYMAIC, // Elym
ETHIOPIC, // Ethi
GEORGIAN, // Geor
GLAGOLITIC, // Glag
GUNJALA_GONDI, // Gong
MASARAM_GONDI, // Gonm
GOTHIC, // Goth
GRANTHA, // Gran
GREEK, // Grek
GUJARATI, // Gujr
GURMUKHI, // Guru
HANGUL, // Hang
HAN, // Hani
HANUNOO, // Hano
HATRAN, // Hatr
HEBREW, // Hebr
HIRAGANA, // Hira
ANATOLIAN_HIEROGLYPHS, // Hluw
PAHAWH_HMONG, // Hmng
NYIAKENG_PUACHUE_HMONG, // Hmnp
OLD_HUNGARIAN, // Hung
OLD_ITALIC, // Ital
JAVANESE, // Java
KAYAH_LI, // Kali
KATAKANA, // Kana
KAWI, // Kawi
KHAROSHTHI, // Khar
KHMER, // Khmr
KHOJKI, // Khoj
KHITAN_SMALL_SCRIPT, // Kits
KANNADA, // Knda
KAITHI, // Kthi
TAI_THAM, // Lana
LAO, // Laoo
LATIN, // Latn
LEPCHA, // Lepc
LIMBU, // Limb
LINEAR_A, // Lina
LINEAR_B, // Linb
LISU, // Lisu
LYCIAN, // Lyci
LYDIAN, // Lydi
MAHAJANI, // Mahj
MAKASAR, // Maka
MANDAIC, // Mand
MANICHAEAN, // Mani
MARCHEN, // Marc
MEDEFAIDRIN, // Medf
MENDE_KIKAKUI, // Mend
MEROITIC_CURSIVE, // Merc
MEROITIC_HIEROGLYPHS, // Mero
MALAYALAM, // Mlym
MODI, // Modi
MONGOLIAN, // Mong
MRO, // Mroo
MEETEI_MAYEK, // Mtei
MULTANI, // Mult
MYANMAR, // Mymr
NAG_MUNDARI, // Nagm
NANDINAGARI, // Nand
OLD_NORTH_ARABIAN, // Narb
NABATAEAN, // Nbat
NEWA, // Newa
NKO, // Nkoo
NUSHU, // Nshu
OGHAM, // Ogam
OL_CHIKI, // Olck
OLD_TURKIC, // Orkh
ORIYA, // Orya
OSAGE, // Osge
OSMANYA, // Osma
OLD_UYGHUR, // Ougr
PALMYRENE, // Palm
PAU_CIN_HAU, // Pauc
OLD_PERMIC, // Perm
PHAGS_PA, // Phag
INSCRIPTIONAL_PAHLAVI, // Phli
PSALTER_PAHLAVI, // Phlp
PHOENICIAN, // Phnx
MIAO, // Plrd
INSCRIPTIONAL_PARTHIAN, // Prti
REJANG, // Rjng
HANIFI_ROHINGYA, // Rohg
RUNIC, // Runr
SAMARITAN, // Samr
OLD_SOUTH_ARABIAN, // Sarb
SAURASHTRA, // Saur
SIGNWRITING, // Sgnw
SHAVIAN, // Shaw
SHARADA, // Shrd
SIDDHAM, // Sidd
KHUDAWADI, // Sind
SINHALA, // Sinh
SOGDIAN, // Sogd
OLD_SOGDIAN, // Sogo
SORA_SOMPENG, // Sora
SOYOMBO, // Soyo
SUNDANESE, // Sund
SYLOTI_NAGRI, // Sylo
SYRIAC, // Syrc
TAGBANWA, // Tagb
TAKRI, // Takr
TAI_LE, // Tale
NEW_TAI_LUE, // Talu
TAMIL, // Taml
TANGUT, // Tang
TAI_VIET, // Tavt
TELUGU, // Telu
TIFINAGH, // Tfng
TAGALOG, // Tglg
THAANA, // Thaa
THAI, // Thai
TIBETAN, // Tibt
TIRHUTA, // Tirh
TANGSA, // Tnsa
TOTO, // Toto
UGARITIC, // Ugar
VAI, // Vaii
VITHKUQI, // Vith
WARANG_CITI, // Wara
WANCHO, // Wcho
OLD_PERSIAN, // Xpeo
CUNEIFORM, // Xsux
YEZIDI, // Yezi
YI, // Yiii
ZANABAZAR_SQUARE, // Zanb
// NOTE(review): MATH has no matching entry in the generator's SCRIPT_NAMES
// list, so presumably no table record ever carries this value -- confirm.
MATH, // Zmth
};
// Returns the [[script]] corresponding to this rune.
export fn rune_script(rn: rune) script = {
return get_ucdrecord(rn).script: script;
};
// Maps a [[script]] value to its four-character code; for example,
// script::LATIN yields "Latn". The cases are listed in the same order as the
// members of [[script]] so the two can be compared side by side.
export fn script_code(sc: script) const str = {
	switch (sc) {
	case script::COMMON => return "Zyyy";
	case script::INHERITED => return "Zinh";
	case script::UNKNOWN => return "Zzzz";
	case script::ADLAM => return "Adlm";
	case script::CAUCASIAN_ALBANIAN => return "Aghb";
	case script::AHOM => return "Ahom";
	case script::ARABIC => return "Arab";
	case script::IMPERIAL_ARAMAIC => return "Armi";
	case script::ARMENIAN => return "Armn";
	case script::AVESTAN => return "Avst";
	case script::BALINESE => return "Bali";
	case script::BAMUM => return "Bamu";
	case script::BASSA_VAH => return "Bass";
	case script::BATAK => return "Batk";
	case script::BENGALI => return "Beng";
	case script::BHAIKSUKI => return "Bhks";
	case script::BOPOMOFO => return "Bopo";
	case script::BRAHMI => return "Brah";
	case script::BRAILLE => return "Brai";
	case script::BUGINESE => return "Bugi";
	case script::BUHID => return "Buhd";
	case script::CHAKMA => return "Cakm";
	case script::CANADIAN_SYLLABICS => return "Cans";
	case script::CARIAN => return "Cari";
	case script::CHAM => return "Cham";
	case script::CHEROKEE => return "Cher";
	case script::CHORASMIAN => return "Chrs";
	case script::COPTIC => return "Copt";
	case script::CYPRO_MINOAN => return "Cpmn";
	case script::CYPRIOT => return "Cprt";
	case script::CYRILLIC => return "Cyrl";
	case script::DEVANAGARI => return "Deva";
	case script::DIVES_AKURU => return "Diak";
	case script::DOGRA => return "Dogr";
	case script::DESERET => return "Dsrt";
	case script::DUPLOYAN => return "Dupl";
	case script::EGYPTIAN_HIEROGLYPHS => return "Egyp";
	case script::ELBASAN => return "Elba";
	case script::ELYMAIC => return "Elym";
	case script::ETHIOPIC => return "Ethi";
	case script::GEORGIAN => return "Geor";
	case script::GLAGOLITIC => return "Glag";
	case script::GUNJALA_GONDI => return "Gong";
	case script::MASARAM_GONDI => return "Gonm";
	case script::GOTHIC => return "Goth";
	case script::GRANTHA => return "Gran";
	case script::GREEK => return "Grek";
	case script::GUJARATI => return "Gujr";
	case script::GURMUKHI => return "Guru";
	case script::HANGUL => return "Hang";
	case script::HAN => return "Hani";
	case script::HANUNOO => return "Hano";
	case script::HATRAN => return "Hatr";
	case script::HEBREW => return "Hebr";
	case script::HIRAGANA => return "Hira";
	case script::ANATOLIAN_HIEROGLYPHS => return "Hluw";
	case script::PAHAWH_HMONG => return "Hmng";
	case script::NYIAKENG_PUACHUE_HMONG => return "Hmnp";
	case script::OLD_HUNGARIAN => return "Hung";
	case script::OLD_ITALIC => return "Ital";
	case script::JAVANESE => return "Java";
	case script::KAYAH_LI => return "Kali";
	case script::KATAKANA => return "Kana";
	case script::KAWI => return "Kawi";
	case script::KHAROSHTHI => return "Khar";
	case script::KHMER => return "Khmr";
	case script::KHOJKI => return "Khoj";
	case script::KHITAN_SMALL_SCRIPT => return "Kits";
	case script::KANNADA => return "Knda";
	case script::KAITHI => return "Kthi";
	case script::TAI_THAM => return "Lana";
	case script::LAO => return "Laoo";
	case script::LATIN => return "Latn";
	case script::LEPCHA => return "Lepc";
	case script::LIMBU => return "Limb";
	case script::LINEAR_A => return "Lina";
	case script::LINEAR_B => return "Linb";
	case script::LISU => return "Lisu";
	case script::LYCIAN => return "Lyci";
	case script::LYDIAN => return "Lydi";
	case script::MAHAJANI => return "Mahj";
	case script::MAKASAR => return "Maka";
	case script::MANDAIC => return "Mand";
	case script::MANICHAEAN => return "Mani";
	case script::MARCHEN => return "Marc";
	case script::MEDEFAIDRIN => return "Medf";
	case script::MENDE_KIKAKUI => return "Mend";
	case script::MEROITIC_CURSIVE => return "Merc";
	case script::MEROITIC_HIEROGLYPHS => return "Mero";
	case script::MALAYALAM => return "Mlym";
	case script::MODI => return "Modi";
	case script::MONGOLIAN => return "Mong";
	case script::MRO => return "Mroo";
	case script::MEETEI_MAYEK => return "Mtei";
	case script::MULTANI => return "Mult";
	case script::MYANMAR => return "Mymr";
	case script::NAG_MUNDARI => return "Nagm";
	case script::NANDINAGARI => return "Nand";
	case script::OLD_NORTH_ARABIAN => return "Narb";
	case script::NABATAEAN => return "Nbat";
	case script::NEWA => return "Newa";
	case script::NKO => return "Nkoo";
	case script::NUSHU => return "Nshu";
	case script::OGHAM => return "Ogam";
	case script::OL_CHIKI => return "Olck";
	case script::OLD_TURKIC => return "Orkh";
	case script::ORIYA => return "Orya";
	case script::OSAGE => return "Osge";
	case script::OSMANYA => return "Osma";
	case script::OLD_UYGHUR => return "Ougr";
	case script::PALMYRENE => return "Palm";
	case script::PAU_CIN_HAU => return "Pauc";
	case script::OLD_PERMIC => return "Perm";
	case script::PHAGS_PA => return "Phag";
	case script::INSCRIPTIONAL_PAHLAVI => return "Phli";
	case script::PSALTER_PAHLAVI => return "Phlp";
	case script::PHOENICIAN => return "Phnx";
	case script::MIAO => return "Plrd";
	case script::INSCRIPTIONAL_PARTHIAN => return "Prti";
	case script::REJANG => return "Rjng";
	case script::HANIFI_ROHINGYA => return "Rohg";
	case script::RUNIC => return "Runr";
	case script::SAMARITAN => return "Samr";
	case script::OLD_SOUTH_ARABIAN => return "Sarb";
	case script::SAURASHTRA => return "Saur";
	case script::SIGNWRITING => return "Sgnw";
	case script::SHAVIAN => return "Shaw";
	case script::SHARADA => return "Shrd";
	case script::SIDDHAM => return "Sidd";
	case script::KHUDAWADI => return "Sind";
	case script::SINHALA => return "Sinh";
	case script::SOGDIAN => return "Sogd";
	case script::OLD_SOGDIAN => return "Sogo";
	case script::SORA_SOMPENG => return "Sora";
	case script::SOYOMBO => return "Soyo";
	case script::SUNDANESE => return "Sund";
	case script::SYLOTI_NAGRI => return "Sylo";
	case script::SYRIAC => return "Syrc";
	case script::TAGBANWA => return "Tagb";
	case script::TAKRI => return "Takr";
	case script::TAI_LE => return "Tale";
	case script::NEW_TAI_LUE => return "Talu";
	case script::TAMIL => return "Taml";
	case script::TANGUT => return "Tang";
	case script::TAI_VIET => return "Tavt";
	case script::TELUGU => return "Telu";
	case script::TIFINAGH => return "Tfng";
	case script::TAGALOG => return "Tglg";
	case script::THAANA => return "Thaa";
	case script::THAI => return "Thai";
	case script::TIBETAN => return "Tibt";
	case script::TIRHUTA => return "Tirh";
	case script::TANGSA => return "Tnsa";
	case script::TOTO => return "Toto";
	case script::UGARITIC => return "Ugar";
	case script::VAI => return "Vaii";
	case script::VITHKUQI => return "Vith";
	case script::WARANG_CITI => return "Wara";
	case script::WANCHO => return "Wcho";
	case script::OLD_PERSIAN => return "Xpeo";
	case script::CUNEIFORM => return "Xsux";
	case script::YEZIDI => return "Yezi";
	case script::YI => return "Yiii";
	case script::ZANABAZAR_SQUARE => return "Zanb";
	case script::MATH => return "Zmth";
	};
};

File diff suppressed because it is too large Load diff