// Record layout for the Unicode character database data used by the lookups in
// this file. ucd_encodedrec is a seven-member tuple whose member types, in
// order, are the same as the field types of ucd_record (category, combining,
// bidirectional, mirrored, east_asian_width, script, line_break).
// NOTE(review): presumably ucd_encodedrec is the element type of the generated
// record table that get_ucdrecord points into; confirm against that table.
type ucd_encodedrec = (u8, u8, u8, u8, u8, u16, u8); type ucd_record = struct { category: u8, combining: u8, bidirectional: u8, mirrored: u8, east_asian_width: u8, script: u16, line_break: u8, }; fn get_ucdrecord(rn: rune) *ucd_record = { const code = rn: u32; let index = 0u16; if (code < 0x110000) { index = index1[(code>>UCD_RECORD_SHIFT)]; index = index2[(index< return "Control"; case gc::Cf => return "Format"; case gc::Cn => return "Unassigned"; case gc::Co => return "Private use"; case gc::Cs => return "Surrogate"; case gc::Ll => return "Lowercase letter"; case gc::Lm => return "Modifier letter"; case gc::Lo => return "Other letter"; case gc::Lt => return "Titlecase letter"; case gc::Lu => return "Uppercase letter"; case gc::Mc => return "Spacing mark"; case gc::Me => return "Enclosing mark"; case gc::Mn => return "Non-spacing mark"; case gc::Nd => return "Decimal number"; case gc::Nl => return "Letter number"; case gc::No => return "Other number"; case gc::Pc => return "Connect punctuation"; case gc::Pd => return "Dash punctuation"; case gc::Pe => return "Close punctuation"; case gc::Pf => return "Final punctuation"; case gc::Pi => return "Initial punctuation"; case gc::Po => return "Other punctuation"; case gc::Ps => return "Open punctuation"; case gc::Sc => return "Currency symbol"; case gc::Sk => return "Modifier symbol"; case gc::Sm => return "Math symbol"; case gc::So => return "Other symbol"; case gc::Zl => return "Line separator"; case gc::Zp => return "Paragraph separator"; case gc::Zs => return "Space separator"; }; }; // Returns the two-character code associated with a [[gc]] value. 
// Every member handled below maps to a code spelled exactly like the member's
// own name.
export fn gc_code(v: gc) const str = {
	return switch (v) {
	case gc::Cc => yield "Cc";
	case gc::Cf => yield "Cf";
	case gc::Cn => yield "Cn";
	case gc::Co => yield "Co";
	case gc::Cs => yield "Cs";
	case gc::Ll => yield "Ll";
	case gc::Lm => yield "Lm";
	case gc::Lo => yield "Lo";
	case gc::Lt => yield "Lt";
	case gc::Lu => yield "Lu";
	case gc::Mc => yield "Mc";
	case gc::Me => yield "Me";
	case gc::Mn => yield "Mn";
	case gc::Nd => yield "Nd";
	case gc::Nl => yield "Nl";
	case gc::No => yield "No";
	case gc::Pc => yield "Pc";
	case gc::Pd => yield "Pd";
	case gc::Pe => yield "Pe";
	case gc::Pf => yield "Pf";
	case gc::Pi => yield "Pi";
	case gc::Po => yield "Po";
	case gc::Ps => yield "Ps";
	case gc::Sc => yield "Sc";
	case gc::Sk => yield "Sk";
	case gc::Sm => yield "Sm";
	case gc::So => yield "So";
	case gc::Zl => yield "Zl";
	case gc::Zp => yield "Zp";
	case gc::Zs => yield "Zs";
	};
};

// Bidirectional class of a rune, as stored in the "bidirectional" field of its
// database record. Apart from UNKNOWN, the members use the Bidi_Class
// abbreviations of the Unicode Bidirectional Algorithm (UAX #9). The implicit
// numbering is what [[rune_bidi]] casts the stored byte to, so members must
// keep their current order.
export type bidi = enum u8 {
	UNKNOWN,
	L,   // Left-to-right
	LRE, // Left-to-right embedding
	LRO, // Left-to-right override
	R,   // Right-to-left
	AL,  // Arabic letter
	RLE, // Right-to-left embedding
	RLO, // Right-to-left override
	PDF, // Pop directional format
	EN,  // European number
	ES,  // European separator
	ET,  // European terminator
	AN,  // Arabic number
	CS,  // Common separator
	NSM, // Non-spacing mark
	BN,  // Boundary neutral
	B,   // Paragraph separator
	S,   // Segment separator
	WS,  // White space
	ON,  // Other neutral
	LRI, // Left-to-right isolate
	RLI, // Right-to-left isolate
	FSI, // First strong isolate
	PDI, // Pop directional isolate
};

// Looks up the database record for a rune and reports its "bidirectional"
// field as a [[bidi]] value.
export fn rune_bidi(rn: rune) bidi = {
	const rec = get_ucdrecord(rn);
	return rec.bidirectional: bidi;
};

// Unicode character Script attribute.
// Each member is annotated with the four-letter code that [[script_code]]
// returns for it. After the three special values (COMMON, INHERITED, UNKNOWN)
// the members are declared in alphabetical order of that code. The implicit
// numbering is what [[rune_script]] casts the stored "script" field to, so
// members must keep their current order.
export type script = enum u16 {
	COMMON, // Zyyy
	INHERITED, // Zinh
	UNKNOWN, // Zzzz
	ADLAM, // Adlm
	CAUCASIAN_ALBANIAN, // Aghb
	AHOM, // Ahom
	ARABIC, // Arab
	IMPERIAL_ARAMAIC, // Armi
	ARMENIAN, // Armn
	AVESTAN, // Avst
	BALINESE, // Bali
	BAMUM, // Bamu
	BASSA_VAH, // Bass
	BATAK, // Batk
	BENGALI, // Beng
	BHAIKSUKI, // Bhks
	BOPOMOFO, // Bopo
	BRAHMI, // Brah
	BRAILLE, // Brai
	BUGINESE, // Bugi
	BUHID, // Buhd
	CHAKMA, // Cakm
	CANADIAN_SYLLABICS, // Cans
	CARIAN, // Cari
	CHAM, // Cham
	CHEROKEE, // Cher
	CHORASMIAN, // Chrs
	COPTIC, // Copt
	CYPRO_MINOAN, // Cpmn
	CYPRIOT, // Cprt
	CYRILLIC, // Cyrl
	DEVANAGARI, // Deva
	DIVES_AKURU, // Diak
	DOGRA, // Dogr
	DESERET, // Dsrt
	DUPLOYAN, // Dupl
	EGYPTIAN_HIEROGLYPHS, // Egyp
	ELBASAN, // Elba
	ELYMAIC, // Elym
	ETHIOPIC, // Ethi
	GEORGIAN, // Geor
	GLAGOLITIC, // Glag
	GUNJALA_GONDI, // Gong
	MASARAM_GONDI, // Gonm
	GOTHIC, // Goth
	GRANTHA, // Gran
	GREEK, // Grek
	GUJARATI, // Gujr
	GURMUKHI, // Guru
	HANGUL, // Hang
	HAN, // Hani
	HANUNOO, // Hano
	HATRAN, // Hatr
	HEBREW, // Hebr
	HIRAGANA, // Hira
	ANATOLIAN_HIEROGLYPHS, // Hluw
	PAHAWH_HMONG, // Hmng
	NYIAKENG_PUACHUE_HMONG, // Hmnp
	OLD_HUNGARIAN, // Hung
	OLD_ITALIC, // Ital
	JAVANESE, // Java
	KAYAH_LI, // Kali
	KATAKANA, // Kana
	KAWI, // Kawi
	KHAROSHTHI, // Khar
	KHMER, // Khmr
	KHOJKI, // Khoj
	KHITAN_SMALL_SCRIPT, // Kits
	KANNADA, // Knda
	KAITHI, // Kthi
	TAI_THAM, // Lana
	LAO, // Laoo
	LATIN, // Latn
	LEPCHA, // Lepc
	LIMBU, // Limb
	LINEAR_A, // Lina
	LINEAR_B, // Linb
	LISU, // Lisu
	LYCIAN, // Lyci
	LYDIAN, // Lydi
	MAHAJANI, // Mahj
	MAKASAR, // Maka
	MANDAIC, // Mand
	MANICHAEAN, // Mani
	MARCHEN, // Marc
	MEDEFAIDRIN, // Medf
	MENDE_KIKAKUI, // Mend
	MEROITIC_CURSIVE, // Merc
	MEROITIC_HIEROGLYPHS, // Mero
	MALAYALAM, // Mlym
	MODI, // Modi
	MONGOLIAN, // Mong
	MRO, // Mroo
	MEETEI_MAYEK, // Mtei
	MULTANI, // Mult
	MYANMAR, // Mymr
	NAG_MUNDARI, // Nagm
	NANDINAGARI, // Nand
	OLD_NORTH_ARABIAN, // Narb
	NABATAEAN, // Nbat
	NEWA, // Newa
	NKO, // Nkoo
	NUSHU, // Nshu
	OGHAM, // Ogam
	OL_CHIKI, // Olck
	OLD_TURKIC, // Orkh
	ORIYA, // Orya
	OSAGE, // Osge
	OSMANYA, // Osma
	OLD_UYGHUR, // Ougr
	PALMYRENE, // Palm
	PAU_CIN_HAU, // Pauc
	OLD_PERMIC, // Perm
	PHAGS_PA, // Phag
	INSCRIPTIONAL_PAHLAVI, // Phli
	PSALTER_PAHLAVI, // Phlp
	PHOENICIAN, // Phnx
	MIAO, // Plrd
	INSCRIPTIONAL_PARTHIAN, // Prti
	REJANG, // Rjng
	HANIFI_ROHINGYA, // Rohg
	RUNIC, // Runr
	SAMARITAN, // Samr
	OLD_SOUTH_ARABIAN, // Sarb
	SAURASHTRA, // Saur
	SIGNWRITING, // Sgnw
	SHAVIAN, // Shaw
	SHARADA, // Shrd
	SIDDHAM, // Sidd
	KHUDAWADI, // Sind
	SINHALA, // Sinh
	SOGDIAN, // Sogd
	OLD_SOGDIAN, // Sogo
	SORA_SOMPENG, // Sora
	SOYOMBO, // Soyo
	SUNDANESE, // Sund
	SYLOTI_NAGRI, // Sylo
	SYRIAC, // Syrc
	TAGBANWA, // Tagb
	TAKRI, // Takr
	TAI_LE, // Tale
	NEW_TAI_LUE, // Talu
	TAMIL, // Taml
	TANGUT, // Tang
	TAI_VIET, // Tavt
	TELUGU, // Telu
	TIFINAGH, // Tfng
	TAGALOG, // Tglg
	THAANA, // Thaa
	THAI, // Thai
	TIBETAN, // Tibt
	TIRHUTA, // Tirh
	TANGSA, // Tnsa
	TOTO, // Toto
	UGARITIC, // Ugar
	VAI, // Vaii
	VITHKUQI, // Vith
	WARANG_CITI, // Wara
	WANCHO, // Wcho
	OLD_PERSIAN, // Xpeo
	CUNEIFORM, // Xsux
	YEZIDI, // Yezi
	YI, // Yiii
	ZANABAZAR_SQUARE, // Zanb
	MATH, // Zmth
};

// Looks up the database record for a rune and reports its "script" field as a
// [[script]] value.
export fn rune_script(rn: rune) script = {
	const rec = get_ucdrecord(rn);
	return rec.script: script;
};

// Returns the four-character code associated with a [[script]] value.
// The code for each member is the four-letter tag noted beside it in the
// [[script]] declaration; the cases below are listed in declaration order so
// the two lists can be compared side by side.
export fn script_code(sc: script) const str = {
	return switch (sc) {
	case script::COMMON => yield "Zyyy";
	case script::INHERITED => yield "Zinh";
	case script::UNKNOWN => yield "Zzzz";
	case script::ADLAM => yield "Adlm";
	case script::CAUCASIAN_ALBANIAN => yield "Aghb";
	case script::AHOM => yield "Ahom";
	case script::ARABIC => yield "Arab";
	case script::IMPERIAL_ARAMAIC => yield "Armi";
	case script::ARMENIAN => yield "Armn";
	case script::AVESTAN => yield "Avst";
	case script::BALINESE => yield "Bali";
	case script::BAMUM => yield "Bamu";
	case script::BASSA_VAH => yield "Bass";
	case script::BATAK => yield "Batk";
	case script::BENGALI => yield "Beng";
	case script::BHAIKSUKI => yield "Bhks";
	case script::BOPOMOFO => yield "Bopo";
	case script::BRAHMI => yield "Brah";
	case script::BRAILLE => yield "Brai";
	case script::BUGINESE => yield "Bugi";
	case script::BUHID => yield "Buhd";
	case script::CHAKMA => yield "Cakm";
	case script::CANADIAN_SYLLABICS => yield "Cans";
	case script::CARIAN => yield "Cari";
	case script::CHAM => yield "Cham";
	case script::CHEROKEE => yield "Cher";
	case script::CHORASMIAN => yield "Chrs";
	case script::COPTIC => yield "Copt";
	case script::CYPRO_MINOAN => yield "Cpmn";
	case script::CYPRIOT => yield "Cprt";
	case script::CYRILLIC => yield "Cyrl";
	case script::DEVANAGARI => yield "Deva";
	case script::DIVES_AKURU => yield "Diak";
	case script::DOGRA => yield "Dogr";
	case script::DESERET => yield "Dsrt";
	case script::DUPLOYAN => yield "Dupl";
	case script::EGYPTIAN_HIEROGLYPHS => yield "Egyp";
	case script::ELBASAN => yield "Elba";
	case script::ELYMAIC => yield "Elym";
	case script::ETHIOPIC => yield "Ethi";
	case script::GEORGIAN => yield "Geor";
	case script::GLAGOLITIC => yield "Glag";
	case script::GUNJALA_GONDI => yield "Gong";
	case script::MASARAM_GONDI => yield "Gonm";
	case script::GOTHIC => yield "Goth";
	case script::GRANTHA => yield "Gran";
	case script::GREEK => yield "Grek";
	case script::GUJARATI => yield "Gujr";
	case script::GURMUKHI => yield "Guru";
	case script::HANGUL => yield "Hang";
	case script::HAN => yield "Hani";
	case script::HANUNOO => yield "Hano";
	case script::HATRAN => yield "Hatr";
	case script::HEBREW => yield "Hebr";
	case script::HIRAGANA => yield "Hira";
	case script::ANATOLIAN_HIEROGLYPHS => yield "Hluw";
	case script::PAHAWH_HMONG => yield "Hmng";
	case script::NYIAKENG_PUACHUE_HMONG => yield "Hmnp";
	case script::OLD_HUNGARIAN => yield "Hung";
	case script::OLD_ITALIC => yield "Ital";
	case script::JAVANESE => yield "Java";
	case script::KAYAH_LI => yield "Kali";
	case script::KATAKANA => yield "Kana";
	case script::KAWI => yield "Kawi";
	case script::KHAROSHTHI => yield "Khar";
	case script::KHMER => yield "Khmr";
	case script::KHOJKI => yield "Khoj";
	case script::KHITAN_SMALL_SCRIPT => yield "Kits";
	case script::KANNADA => yield "Knda";
	case script::KAITHI => yield "Kthi";
	case script::TAI_THAM => yield "Lana";
	case script::LAO => yield "Laoo";
	case script::LATIN => yield "Latn";
	case script::LEPCHA => yield "Lepc";
	case script::LIMBU => yield "Limb";
	case script::LINEAR_A => yield "Lina";
	case script::LINEAR_B => yield "Linb";
	case script::LISU => yield "Lisu";
	case script::LYCIAN => yield "Lyci";
	case script::LYDIAN => yield "Lydi";
	case script::MAHAJANI => yield "Mahj";
	case script::MAKASAR => yield "Maka";
	case script::MANDAIC => yield "Mand";
	case script::MANICHAEAN => yield "Mani";
	case script::MARCHEN => yield "Marc";
	case script::MEDEFAIDRIN => yield "Medf";
	case script::MENDE_KIKAKUI => yield "Mend";
	case script::MEROITIC_CURSIVE => yield "Merc";
	case script::MEROITIC_HIEROGLYPHS => yield "Mero";
	case script::MALAYALAM => yield "Mlym";
	case script::MODI => yield "Modi";
	case script::MONGOLIAN => yield "Mong";
	case script::MRO => yield "Mroo";
	case script::MEETEI_MAYEK => yield "Mtei";
	case script::MULTANI => yield "Mult";
	case script::MYANMAR => yield "Mymr";
	case script::NAG_MUNDARI => yield "Nagm";
	case script::NANDINAGARI => yield "Nand";
	case script::OLD_NORTH_ARABIAN => yield "Narb";
	case script::NABATAEAN => yield "Nbat";
	case script::NEWA => yield "Newa";
	case script::NKO => yield "Nkoo";
	case script::NUSHU => yield "Nshu";
	case script::OGHAM => yield "Ogam";
	case script::OL_CHIKI => yield "Olck";
	case script::OLD_TURKIC => yield "Orkh";
	case script::ORIYA => yield "Orya";
	case script::OSAGE => yield "Osge";
	case script::OSMANYA => yield "Osma";
	case script::OLD_UYGHUR => yield "Ougr";
	case script::PALMYRENE => yield "Palm";
	case script::PAU_CIN_HAU => yield "Pauc";
	case script::OLD_PERMIC => yield "Perm";
	case script::PHAGS_PA => yield "Phag";
	case script::INSCRIPTIONAL_PAHLAVI => yield "Phli";
	case script::PSALTER_PAHLAVI => yield "Phlp";
	case script::PHOENICIAN => yield "Phnx";
	case script::MIAO => yield "Plrd";
	case script::INSCRIPTIONAL_PARTHIAN => yield "Prti";
	case script::REJANG => yield "Rjng";
	case script::HANIFI_ROHINGYA => yield "Rohg";
	case script::RUNIC => yield "Runr";
	case script::SAMARITAN => yield "Samr";
	case script::OLD_SOUTH_ARABIAN => yield "Sarb";
	case script::SAURASHTRA => yield "Saur";
	case script::SIGNWRITING => yield "Sgnw";
	case script::SHAVIAN => yield "Shaw";
	case script::SHARADA => yield "Shrd";
	case script::SIDDHAM => yield "Sidd";
	case script::KHUDAWADI => yield "Sind";
	case script::SINHALA => yield "Sinh";
	case script::SOGDIAN => yield "Sogd";
	case script::OLD_SOGDIAN => yield "Sogo";
	case script::SORA_SOMPENG => yield "Sora";
	case script::SOYOMBO => yield "Soyo";
	case script::SUNDANESE => yield "Sund";
	case script::SYLOTI_NAGRI => yield "Sylo";
	case script::SYRIAC => yield "Syrc";
	case script::TAGBANWA => yield "Tagb";
	case script::TAKRI => yield "Takr";
	case script::TAI_LE => yield "Tale";
	case script::NEW_TAI_LUE => yield "Talu";
	case script::TAMIL => yield "Taml";
	case script::TANGUT => yield "Tang";
	case script::TAI_VIET => yield "Tavt";
	case script::TELUGU => yield "Telu";
	case script::TIFINAGH => yield "Tfng";
	case script::TAGALOG => yield "Tglg";
	case script::THAANA => yield "Thaa";
	case script::THAI => yield "Thai";
	case script::TIBETAN => yield "Tibt";
	case script::TIRHUTA => yield "Tirh";
	case script::TANGSA => yield "Tnsa";
	case script::TOTO => yield "Toto";
	case script::UGARITIC => yield "Ugar";
	case script::VAI => yield "Vaii";
	case script::VITHKUQI => yield "Vith";
	case script::WARANG_CITI => yield "Wara";
	case script::WANCHO => yield "Wcho";
	case script::OLD_PERSIAN => yield "Xpeo";
	case script::CUNEIFORM => yield "Xsux";
	case script::YEZIDI => yield "Yezi";
	case script::YI => yield "Yiii";
	case script::ZANABAZAR_SQUARE => yield "Zanb";
	case script::MATH => yield "Zmth";
	};
};

// Line break class of a rune, as stored in the "line_break" field of its
// database record. Members use the Line_Break property abbreviations of the
// Unicode Line Breaking Algorithm (UAX #14). The implicit numbering is what
// [[rune_line_break]] casts the stored byte to, so members must keep their
// current order.
export type line_break = enum u8 {
	XX,  // Unknown
	AI,  // Ambiguous (alphabetic or ideographic)
	BK,  // Mandatory break
	CJ,  // Conditional Japanese starter
	CR,  // Carriage return
	LF,  // Line feed
	NL,  // Next line
	SA,  // Complex-context dependent (South East Asian)
	SG,  // Surrogate
	SP,  // Space
	OP,  // Open punctuation
	CL,  // Close punctuation
	CP,  // Close parenthesis
	QU,  // Quotation
	GL,  // Non-breaking ("glue")
	NS,  // Nonstarter
	EX,  // Exclamation/interrogation
	SY,  // Symbols allowing break after
	IS,  // Infix numeric separator
	PR,  // Prefix numeric
	PO,  // Postfix numeric
	NU,  // Numeric
	AL,  // Alphabetic
	HL,  // Hebrew letter
	ID,  // Ideographic
	IN,  // Inseparable
	HY,  // Hyphen
	BA,  // Break after
	BB,  // Break before
	B2,  // Break opportunity before and after
	ZW,  // Zero width space
	CM,  // Combining mark
	WJ,  // Word joiner
	H2,  // Hangul LV syllable
	H3,  // Hangul LVT syllable
	JL,  // Hangul L jamo
	JV,  // Hangul V jamo
	JT,  // Hangul T jamo
	RI,  // Regional indicator
	EB,  // Emoji base
	EM,  // Emoji modifier
	ZWJ, // Zero width joiner
	CB,  // Contingent break opportunity
};

// Looks up the database record for a rune and reports its "line_break" field
// as a [[line_break]] value.
export fn rune_line_break(rn: rune) line_break = {
	const rec = get_ucdrecord(rn);
	return rec.line_break: line_break;
};

// Returns the two-character code associated with a [[line_break]] value.
// The code is spelled exactly like the member's own name, so every member
// yields two characters except ZWJ, which yields the three-character "ZWJ".
// Cases are listed in the order the members are declared in [[line_break]].
export fn line_break_code(lb: line_break) const str = {
	return switch (lb) {
	case line_break::XX => yield "XX";
	case line_break::AI => yield "AI";
	case line_break::BK => yield "BK";
	case line_break::CJ => yield "CJ";
	case line_break::CR => yield "CR";
	case line_break::LF => yield "LF";
	case line_break::NL => yield "NL";
	case line_break::SA => yield "SA";
	case line_break::SG => yield "SG";
	case line_break::SP => yield "SP";
	case line_break::OP => yield "OP";
	case line_break::CL => yield "CL";
	case line_break::CP => yield "CP";
	case line_break::QU => yield "QU";
	case line_break::GL => yield "GL";
	case line_break::NS => yield "NS";
	case line_break::EX => yield "EX";
	case line_break::SY => yield "SY";
	case line_break::IS => yield "IS";
	case line_break::PR => yield "PR";
	case line_break::PO => yield "PO";
	case line_break::NU => yield "NU";
	case line_break::AL => yield "AL";
	case line_break::HL => yield "HL";
	case line_break::ID => yield "ID";
	case line_break::IN => yield "IN";
	case line_break::HY => yield "HY";
	case line_break::BA => yield "BA";
	case line_break::BB => yield "BB";
	case line_break::B2 => yield "B2";
	case line_break::ZW => yield "ZW";
	case line_break::CM => yield "CM";
	case line_break::WJ => yield "WJ";
	case line_break::H2 => yield "H2";
	case line_break::H3 => yield "H3";
	case line_break::JL => yield "JL";
	case line_break::JV => yield "JV";
	case line_break::JT => yield "JT";
	case line_break::RI => yield "RI";
	case line_break::EB => yield "EB";
	case line_break::EM => yield "EM";
	case line_break::ZWJ => yield "ZWJ";
	case line_break::CB => yield "CB";
	};
};