kojote/vendor/hare-unicode/unicode/ucd.ha
Lobo Torres c70ec9f648 Add 'vendor/hare-unicode/' from commit '1488c26f46f7f8568235eaee6224983ac46e78ff'
git-subtree-dir: vendor/hare-unicode
git-subtree-mainline: 57979aa6fc
git-subtree-split: 1488c26f46
2024-12-04 13:29:21 -03:00

654 lines
17 KiB
Hare

// Tuple with the same member types, in the same order, as the fields of
// ucd_record.
// NOTE(review): presumably the element type of the ucd_records table whose
// entries get_ucdrecord reinterprets as ucd_record; that table is defined
// outside this view, so confirm there.
type ucd_encodedrec = (u8, u8, u8, u8, u8, u16, u8);
// Per-code-point property record handed out by get_ucdrecord. Every field is
// a raw integer; the rune_* accessors in this file cast the ones they need to
// the matching enum type.
type ucd_record = struct {
// Cast to gc by rune_gc.
category: u8,
// Not read by any accessor visible in this part of the file.
combining: u8,
// Cast to bidi by rune_bidi.
bidirectional: u8,
// Not read by any accessor visible in this part of the file.
mirrored: u8,
// Not read by any accessor visible in this part of the file.
east_asian_width: u8,
// Cast to script by rune_script; the only 16-bit field.
script: u16,
// Cast to line_break by rune_line_break.
line_break: u8,
};
// Looks up the property record for rn using two index tables.
//
// The code point is split at UCD_RECORD_SHIFT: the high bits select an entry
// in index1, and that entry combined with the low bits selects an entry in
// index2, which is the position of the record in ucd_records. Values of
// 0x110000 and above skip the lookup and get record 0.
//
// NOTE(review): index1, index2, ucd_records and UCD_RECORD_SHIFT are defined
// outside this view; they are presumably generated tables - confirm there.
fn get_ucdrecord(rn: rune) *ucd_record = {
const code = rn: u32;
// Stays 0 when code is out of range, so record 0 acts as the fallback.
let index = 0u16;
if (code < 0x110000) {
// First stage: keyed by the high bits of the code point.
index = index1[(code>>UCD_RECORD_SHIFT)];
// Second stage: first-stage result scaled by the shift, plus the low bits.
// NOTE(review): index is u16 here; confirm the shifted value cannot exceed
// the width the shift is evaluated in for the current table sizes.
index = index2[(index<<UCD_RECORD_SHIFT)+(code&((1<<UCD_RECORD_SHIFT)-1))];
};
// The table element is reinterpreted as a ucd_record through a pointer cast.
return &ucd_records[index]: *ucd_record;
};
// Unicode character General_Category attribute.
//
// Members carry no explicit values, so they are numbered in declaration order.
// rune_gc casts the raw u8 stored in a record straight to this type, so the
// order here has to stay in step with whatever produced those records.
// The trailing comment on each member is the description gc_name returns for
// it; the member name itself is the code gc_code returns.
export type gc = enum u8 {
Cc, // Control
Cf, // Format
Cn, // Unassigned
Co, // Private use
Cs, // Surrogate
Ll, // Lowercase letter
Lm, // Modifier letter
Lo, // Other letter
Lt, // Titlecase letter
Lu, // Uppercase letter
Mc, // Spacing mark
Me, // Enclosing mark
Mn, // Non-spacing mark
Nd, // Decimal number
Nl, // Letter number
No, // Other number
Pc, // Connect punctuation
Pd, // Dash punctuation
Pe, // Close punctuation
Pf, // Final punctuation
Pi, // Initial punctuation
Po, // Other punctuation
Ps, // Open punctuation
Sc, // Currency symbol
Sk, // Modifier symbol
Sm, // Math symbol
So, // Other symbol
Zl, // Line separator
Zp, // Paragraph separator
Zs, // Space separator
};
// Returns the [[gc]] (General_Category) value corresponding to this rune.
//
// The value is the category field of the rune's record, cast to [[gc]].
export fn rune_gc(rn: rune) gc = {
	const rec = get_ucdrecord(rn);
	return rec.category: gc;
};
// Maps a [[gc]] value to its human-readable description, for example gc::Lu
// gives "Uppercase letter". Every result is a string literal.
export fn gc_name(v: gc) const str = {
	const label: str = switch (v) {
	// C* categories
	case gc::Cc => yield "Control";
	case gc::Cf => yield "Format";
	case gc::Cn => yield "Unassigned";
	case gc::Co => yield "Private use";
	case gc::Cs => yield "Surrogate";
	// L* categories: letters
	case gc::Ll => yield "Lowercase letter";
	case gc::Lm => yield "Modifier letter";
	case gc::Lo => yield "Other letter";
	case gc::Lt => yield "Titlecase letter";
	case gc::Lu => yield "Uppercase letter";
	// M* categories: marks
	case gc::Mc => yield "Spacing mark";
	case gc::Me => yield "Enclosing mark";
	case gc::Mn => yield "Non-spacing mark";
	// N* categories: numbers
	case gc::Nd => yield "Decimal number";
	case gc::Nl => yield "Letter number";
	case gc::No => yield "Other number";
	// P* categories: punctuation
	case gc::Pc => yield "Connect punctuation";
	case gc::Pd => yield "Dash punctuation";
	case gc::Pe => yield "Close punctuation";
	case gc::Pf => yield "Final punctuation";
	case gc::Pi => yield "Initial punctuation";
	case gc::Po => yield "Other punctuation";
	case gc::Ps => yield "Open punctuation";
	// S* categories: symbols
	case gc::Sc => yield "Currency symbol";
	case gc::Sk => yield "Modifier symbol";
	case gc::Sm => yield "Math symbol";
	case gc::So => yield "Other symbol";
	// Z* categories: separators
	case gc::Zl => yield "Line separator";
	case gc::Zp => yield "Paragraph separator";
	case gc::Zs => yield "Space separator";
	};
	return label;
};
// Maps a [[gc]] value to its two-character code; the code is spelled exactly
// like the enum member, so gc::Lu gives "Lu".
export fn gc_code(v: gc) const str = {
	const code: str = switch (v) {
	case gc::Cc => yield "Cc";
	case gc::Cf => yield "Cf";
	case gc::Cn => yield "Cn";
	case gc::Co => yield "Co";
	case gc::Cs => yield "Cs";
	case gc::Ll => yield "Ll";
	case gc::Lm => yield "Lm";
	case gc::Lo => yield "Lo";
	case gc::Lt => yield "Lt";
	case gc::Lu => yield "Lu";
	case gc::Mc => yield "Mc";
	case gc::Me => yield "Me";
	case gc::Mn => yield "Mn";
	case gc::Nd => yield "Nd";
	case gc::Nl => yield "Nl";
	case gc::No => yield "No";
	case gc::Pc => yield "Pc";
	case gc::Pd => yield "Pd";
	case gc::Pe => yield "Pe";
	case gc::Pf => yield "Pf";
	case gc::Pi => yield "Pi";
	case gc::Po => yield "Po";
	case gc::Ps => yield "Ps";
	case gc::Sc => yield "Sc";
	case gc::Sk => yield "Sk";
	case gc::Sm => yield "Sm";
	case gc::So => yield "So";
	case gc::Zl => yield "Zl";
	case gc::Zp => yield "Zp";
	case gc::Zs => yield "Zs";
	};
	return code;
};
// Bidirectional classification.
//
// Members are numbered implicitly in declaration order; rune_bidi casts the
// raw u8 from a record straight to this type, so the order must not change
// independently of the data.
// NOTE(review): apart from UNKNOWN, the names look like the Unicode Bidi_Class
// short aliases - confirm against the generator before relying on that.
export type bidi = enum u8 {
UNKNOWN,
L,
LRE,
LRO,
R,
AL,
RLE,
RLO,
PDF,
EN,
ES,
ET,
AN,
CS,
NSM,
BN,
B,
S,
WS,
ON,
LRI,
RLI,
FSI,
PDI,
};
// Looks up the record for this rune and returns its bidirectional field as a
// [[bidi]] value.
export fn rune_bidi(rn: rune) bidi = {
	const rec = get_ucdrecord(rn);
	return rec.bidirectional: bidi;
};
// Unicode character Script attribute.
//
// The trailing comment on each member is the four-character code that
// script_code returns for it; members are listed in order of that code after
// the first three entries.
// Members are numbered implicitly in declaration order, and rune_script casts
// the raw u16 from a record straight to this type, so the order must not
// change independently of the data.
export type script = enum u16 {
COMMON, // Zyyy
INHERITED, // Zinh
UNKNOWN, // Zzzz
ADLAM, // Adlm
CAUCASIAN_ALBANIAN, // Aghb
AHOM, // Ahom
ARABIC, // Arab
IMPERIAL_ARAMAIC, // Armi
ARMENIAN, // Armn
AVESTAN, // Avst
BALINESE, // Bali
BAMUM, // Bamu
BASSA_VAH, // Bass
BATAK, // Batk
BENGALI, // Beng
BHAIKSUKI, // Bhks
BOPOMOFO, // Bopo
BRAHMI, // Brah
BRAILLE, // Brai
BUGINESE, // Bugi
BUHID, // Buhd
CHAKMA, // Cakm
CANADIAN_SYLLABICS, // Cans
CARIAN, // Cari
CHAM, // Cham
CHEROKEE, // Cher
CHORASMIAN, // Chrs
COPTIC, // Copt
CYPRO_MINOAN, // Cpmn
CYPRIOT, // Cprt
CYRILLIC, // Cyrl
DEVANAGARI, // Deva
DIVES_AKURU, // Diak
DOGRA, // Dogr
DESERET, // Dsrt
DUPLOYAN, // Dupl
EGYPTIAN_HIEROGLYPHS, // Egyp
ELBASAN, // Elba
ELYMAIC, // Elym
ETHIOPIC, // Ethi
GEORGIAN, // Geor
GLAGOLITIC, // Glag
GUNJALA_GONDI, // Gong
MASARAM_GONDI, // Gonm
GOTHIC, // Goth
GRANTHA, // Gran
GREEK, // Grek
GUJARATI, // Gujr
GURMUKHI, // Guru
HANGUL, // Hang
HAN, // Hani
HANUNOO, // Hano
HATRAN, // Hatr
HEBREW, // Hebr
HIRAGANA, // Hira
ANATOLIAN_HIEROGLYPHS, // Hluw
PAHAWH_HMONG, // Hmng
NYIAKENG_PUACHUE_HMONG, // Hmnp
OLD_HUNGARIAN, // Hung
OLD_ITALIC, // Ital
JAVANESE, // Java
KAYAH_LI, // Kali
KATAKANA, // Kana
KAWI, // Kawi
KHAROSHTHI, // Khar
KHMER, // Khmr
KHOJKI, // Khoj
KHITAN_SMALL_SCRIPT, // Kits
KANNADA, // Knda
KAITHI, // Kthi
TAI_THAM, // Lana
LAO, // Laoo
LATIN, // Latn
LEPCHA, // Lepc
LIMBU, // Limb
LINEAR_A, // Lina
LINEAR_B, // Linb
LISU, // Lisu
LYCIAN, // Lyci
LYDIAN, // Lydi
MAHAJANI, // Mahj
MAKASAR, // Maka
MANDAIC, // Mand
MANICHAEAN, // Mani
MARCHEN, // Marc
MEDEFAIDRIN, // Medf
MENDE_KIKAKUI, // Mend
MEROITIC_CURSIVE, // Merc
MEROITIC_HIEROGLYPHS, // Mero
MALAYALAM, // Mlym
MODI, // Modi
MONGOLIAN, // Mong
MRO, // Mroo
MEETEI_MAYEK, // Mtei
MULTANI, // Mult
MYANMAR, // Mymr
NAG_MUNDARI, // Nagm
NANDINAGARI, // Nand
OLD_NORTH_ARABIAN, // Narb
NABATAEAN, // Nbat
NEWA, // Newa
NKO, // Nkoo
NUSHU, // Nshu
OGHAM, // Ogam
OL_CHIKI, // Olck
OLD_TURKIC, // Orkh
ORIYA, // Orya
OSAGE, // Osge
OSMANYA, // Osma
OLD_UYGHUR, // Ougr
PALMYRENE, // Palm
PAU_CIN_HAU, // Pauc
OLD_PERMIC, // Perm
PHAGS_PA, // Phag
INSCRIPTIONAL_PAHLAVI, // Phli
PSALTER_PAHLAVI, // Phlp
PHOENICIAN, // Phnx
MIAO, // Plrd
INSCRIPTIONAL_PARTHIAN, // Prti
REJANG, // Rjng
HANIFI_ROHINGYA, // Rohg
RUNIC, // Runr
SAMARITAN, // Samr
OLD_SOUTH_ARABIAN, // Sarb
SAURASHTRA, // Saur
SIGNWRITING, // Sgnw
SHAVIAN, // Shaw
SHARADA, // Shrd
SIDDHAM, // Sidd
KHUDAWADI, // Sind
SINHALA, // Sinh
SOGDIAN, // Sogd
OLD_SOGDIAN, // Sogo
SORA_SOMPENG, // Sora
SOYOMBO, // Soyo
SUNDANESE, // Sund
SYLOTI_NAGRI, // Sylo
SYRIAC, // Syrc
TAGBANWA, // Tagb
TAKRI, // Takr
TAI_LE, // Tale
NEW_TAI_LUE, // Talu
TAMIL, // Taml
TANGUT, // Tang
TAI_VIET, // Tavt
TELUGU, // Telu
TIFINAGH, // Tfng
TAGALOG, // Tglg
THAANA, // Thaa
THAI, // Thai
TIBETAN, // Tibt
TIRHUTA, // Tirh
TANGSA, // Tnsa
TOTO, // Toto
UGARITIC, // Ugar
VAI, // Vaii
VITHKUQI, // Vith
WARANG_CITI, // Wara
WANCHO, // Wcho
OLD_PERSIAN, // Xpeo
CUNEIFORM, // Xsux
YEZIDI, // Yezi
YI, // Yiii
ZANABAZAR_SQUARE, // Zanb
MATH, // Zmth
};
// Looks up the record for this rune and returns its script field as a
// [[script]] value.
export fn rune_script(rn: rune) script = {
	const rec = get_ucdrecord(rn);
	return rec.script: script;
};
// Maps a [[script]] value to its four-character code, for example
// script::LATIN gives "Latn". Cases are listed in the same order as the
// members of [[script]] so the two lists can be compared side by side.
export fn script_code(sc: script) const str = {
	const code: str = switch (sc) {
	case script::COMMON => yield "Zyyy";
	case script::INHERITED => yield "Zinh";
	case script::UNKNOWN => yield "Zzzz";
	case script::ADLAM => yield "Adlm";
	case script::CAUCASIAN_ALBANIAN => yield "Aghb";
	case script::AHOM => yield "Ahom";
	case script::ARABIC => yield "Arab";
	case script::IMPERIAL_ARAMAIC => yield "Armi";
	case script::ARMENIAN => yield "Armn";
	case script::AVESTAN => yield "Avst";
	case script::BALINESE => yield "Bali";
	case script::BAMUM => yield "Bamu";
	case script::BASSA_VAH => yield "Bass";
	case script::BATAK => yield "Batk";
	case script::BENGALI => yield "Beng";
	case script::BHAIKSUKI => yield "Bhks";
	case script::BOPOMOFO => yield "Bopo";
	case script::BRAHMI => yield "Brah";
	case script::BRAILLE => yield "Brai";
	case script::BUGINESE => yield "Bugi";
	case script::BUHID => yield "Buhd";
	case script::CHAKMA => yield "Cakm";
	case script::CANADIAN_SYLLABICS => yield "Cans";
	case script::CARIAN => yield "Cari";
	case script::CHAM => yield "Cham";
	case script::CHEROKEE => yield "Cher";
	case script::CHORASMIAN => yield "Chrs";
	case script::COPTIC => yield "Copt";
	case script::CYPRO_MINOAN => yield "Cpmn";
	case script::CYPRIOT => yield "Cprt";
	case script::CYRILLIC => yield "Cyrl";
	case script::DEVANAGARI => yield "Deva";
	case script::DIVES_AKURU => yield "Diak";
	case script::DOGRA => yield "Dogr";
	case script::DESERET => yield "Dsrt";
	case script::DUPLOYAN => yield "Dupl";
	case script::EGYPTIAN_HIEROGLYPHS => yield "Egyp";
	case script::ELBASAN => yield "Elba";
	case script::ELYMAIC => yield "Elym";
	case script::ETHIOPIC => yield "Ethi";
	case script::GEORGIAN => yield "Geor";
	case script::GLAGOLITIC => yield "Glag";
	case script::GUNJALA_GONDI => yield "Gong";
	case script::MASARAM_GONDI => yield "Gonm";
	case script::GOTHIC => yield "Goth";
	case script::GRANTHA => yield "Gran";
	case script::GREEK => yield "Grek";
	case script::GUJARATI => yield "Gujr";
	case script::GURMUKHI => yield "Guru";
	case script::HANGUL => yield "Hang";
	case script::HAN => yield "Hani";
	case script::HANUNOO => yield "Hano";
	case script::HATRAN => yield "Hatr";
	case script::HEBREW => yield "Hebr";
	case script::HIRAGANA => yield "Hira";
	case script::ANATOLIAN_HIEROGLYPHS => yield "Hluw";
	case script::PAHAWH_HMONG => yield "Hmng";
	case script::NYIAKENG_PUACHUE_HMONG => yield "Hmnp";
	case script::OLD_HUNGARIAN => yield "Hung";
	case script::OLD_ITALIC => yield "Ital";
	case script::JAVANESE => yield "Java";
	case script::KAYAH_LI => yield "Kali";
	case script::KATAKANA => yield "Kana";
	case script::KAWI => yield "Kawi";
	case script::KHAROSHTHI => yield "Khar";
	case script::KHMER => yield "Khmr";
	case script::KHOJKI => yield "Khoj";
	case script::KHITAN_SMALL_SCRIPT => yield "Kits";
	case script::KANNADA => yield "Knda";
	case script::KAITHI => yield "Kthi";
	case script::TAI_THAM => yield "Lana";
	case script::LAO => yield "Laoo";
	case script::LATIN => yield "Latn";
	case script::LEPCHA => yield "Lepc";
	case script::LIMBU => yield "Limb";
	case script::LINEAR_A => yield "Lina";
	case script::LINEAR_B => yield "Linb";
	case script::LISU => yield "Lisu";
	case script::LYCIAN => yield "Lyci";
	case script::LYDIAN => yield "Lydi";
	case script::MAHAJANI => yield "Mahj";
	case script::MAKASAR => yield "Maka";
	case script::MANDAIC => yield "Mand";
	case script::MANICHAEAN => yield "Mani";
	case script::MARCHEN => yield "Marc";
	case script::MEDEFAIDRIN => yield "Medf";
	case script::MENDE_KIKAKUI => yield "Mend";
	case script::MEROITIC_CURSIVE => yield "Merc";
	case script::MEROITIC_HIEROGLYPHS => yield "Mero";
	case script::MALAYALAM => yield "Mlym";
	case script::MODI => yield "Modi";
	case script::MONGOLIAN => yield "Mong";
	case script::MRO => yield "Mroo";
	case script::MEETEI_MAYEK => yield "Mtei";
	case script::MULTANI => yield "Mult";
	case script::MYANMAR => yield "Mymr";
	case script::NAG_MUNDARI => yield "Nagm";
	case script::NANDINAGARI => yield "Nand";
	case script::OLD_NORTH_ARABIAN => yield "Narb";
	case script::NABATAEAN => yield "Nbat";
	case script::NEWA => yield "Newa";
	case script::NKO => yield "Nkoo";
	case script::NUSHU => yield "Nshu";
	case script::OGHAM => yield "Ogam";
	case script::OL_CHIKI => yield "Olck";
	case script::OLD_TURKIC => yield "Orkh";
	case script::ORIYA => yield "Orya";
	case script::OSAGE => yield "Osge";
	case script::OSMANYA => yield "Osma";
	case script::OLD_UYGHUR => yield "Ougr";
	case script::PALMYRENE => yield "Palm";
	case script::PAU_CIN_HAU => yield "Pauc";
	case script::OLD_PERMIC => yield "Perm";
	case script::PHAGS_PA => yield "Phag";
	case script::INSCRIPTIONAL_PAHLAVI => yield "Phli";
	case script::PSALTER_PAHLAVI => yield "Phlp";
	case script::PHOENICIAN => yield "Phnx";
	case script::MIAO => yield "Plrd";
	case script::INSCRIPTIONAL_PARTHIAN => yield "Prti";
	case script::REJANG => yield "Rjng";
	case script::HANIFI_ROHINGYA => yield "Rohg";
	case script::RUNIC => yield "Runr";
	case script::SAMARITAN => yield "Samr";
	case script::OLD_SOUTH_ARABIAN => yield "Sarb";
	case script::SAURASHTRA => yield "Saur";
	case script::SIGNWRITING => yield "Sgnw";
	case script::SHAVIAN => yield "Shaw";
	case script::SHARADA => yield "Shrd";
	case script::SIDDHAM => yield "Sidd";
	case script::KHUDAWADI => yield "Sind";
	case script::SINHALA => yield "Sinh";
	case script::SOGDIAN => yield "Sogd";
	case script::OLD_SOGDIAN => yield "Sogo";
	case script::SORA_SOMPENG => yield "Sora";
	case script::SOYOMBO => yield "Soyo";
	case script::SUNDANESE => yield "Sund";
	case script::SYLOTI_NAGRI => yield "Sylo";
	case script::SYRIAC => yield "Syrc";
	case script::TAGBANWA => yield "Tagb";
	case script::TAKRI => yield "Takr";
	case script::TAI_LE => yield "Tale";
	case script::NEW_TAI_LUE => yield "Talu";
	case script::TAMIL => yield "Taml";
	case script::TANGUT => yield "Tang";
	case script::TAI_VIET => yield "Tavt";
	case script::TELUGU => yield "Telu";
	case script::TIFINAGH => yield "Tfng";
	case script::TAGALOG => yield "Tglg";
	case script::THAANA => yield "Thaa";
	case script::THAI => yield "Thai";
	case script::TIBETAN => yield "Tibt";
	case script::TIRHUTA => yield "Tirh";
	case script::TANGSA => yield "Tnsa";
	case script::TOTO => yield "Toto";
	case script::UGARITIC => yield "Ugar";
	case script::VAI => yield "Vaii";
	case script::VITHKUQI => yield "Vith";
	case script::WARANG_CITI => yield "Wara";
	case script::WANCHO => yield "Wcho";
	case script::OLD_PERSIAN => yield "Xpeo";
	case script::CUNEIFORM => yield "Xsux";
	case script::YEZIDI => yield "Yezi";
	case script::YI => yield "Yiii";
	case script::ZANABAZAR_SQUARE => yield "Zanb";
	case script::MATH => yield "Zmth";
	};
	return code;
};
// Line break classification.
//
// Members are numbered implicitly in declaration order; rune_line_break casts
// the raw u8 from a record straight to this type, so the order must not change
// independently of the data. line_break_code returns each member's name as a
// string.
// NOTE(review): the names look like the line breaking class abbreviations of
// Unicode Standard Annex #14 - confirm against the generator.
export type line_break = enum u8 {
XX,
AI,
BK,
CJ,
CR,
LF,
NL,
SA,
SG,
SP,
OP,
CL,
CP,
QU,
GL,
NS,
EX,
SY,
IS,
PR,
PO,
NU,
AL,
HL,
ID,
IN,
HY,
BA,
BB,
B2,
ZW,
CM,
WJ,
H2,
H3,
JL,
JV,
JT,
RI,
EB,
EM,
ZWJ,
CB,
};
// Looks up the record for this rune and returns its line_break field as a
// [[line_break]] value.
export fn rune_line_break(rn: rune) line_break = {
	const rec = get_ucdrecord(rn);
	return rec.line_break: line_break;
};
// Returns the short code associated with a [[line_break]] value. The code is
// spelled exactly like the enum member: two characters for every class except
// line_break::ZWJ, which yields the three-character "ZWJ".
export fn line_break_code(lb: line_break) const str = {
	// One case per line, matching the layout of gc_code and script_code.
	switch (lb) {
	case line_break::XX => return "XX";
	case line_break::AI => return "AI";
	case line_break::AL => return "AL";
	case line_break::B2 => return "B2";
	case line_break::BA => return "BA";
	case line_break::BB => return "BB";
	case line_break::BK => return "BK";
	case line_break::CB => return "CB";
	case line_break::CJ => return "CJ";
	case line_break::CL => return "CL";
	case line_break::CM => return "CM";
	case line_break::CP => return "CP";
	case line_break::CR => return "CR";
	case line_break::EB => return "EB";
	case line_break::EM => return "EM";
	case line_break::EX => return "EX";
	case line_break::GL => return "GL";
	case line_break::H2 => return "H2";
	case line_break::H3 => return "H3";
	case line_break::HL => return "HL";
	case line_break::HY => return "HY";
	case line_break::ID => return "ID";
	case line_break::IN => return "IN";
	case line_break::IS => return "IS";
	case line_break::JL => return "JL";
	case line_break::JT => return "JT";
	case line_break::JV => return "JV";
	case line_break::LF => return "LF";
	case line_break::NL => return "NL";
	case line_break::NS => return "NS";
	case line_break::NU => return "NU";
	case line_break::OP => return "OP";
	case line_break::PO => return "PO";
	case line_break::PR => return "PR";
	case line_break::QU => return "QU";
	case line_break::RI => return "RI";
	case line_break::SA => return "SA";
	case line_break::SG => return "SG";
	case line_break::SP => return "SP";
	case line_break::SY => return "SY";
	case line_break::WJ => return "WJ";
	case line_break::ZW => return "ZW";
	case line_break::ZWJ => return "ZWJ";
	};
};