kojote/unicode/ucd.ha
2024-01-26 18:30:39 +01:00

473 lines
13 KiB
Hare

// Tuple form of one character database record: five u8 values followed by one
// u16, in the same order and with the same widths as the fields of
// [[ucd_record]].
// NOTE(review): presumably the element type of the ucd_records table whose
// entries get_ucdrecord reinterprets as [[ucd_record]] -- confirm against the
// table definition.
type ucd_encodedrec = (u8, u8, u8, u8, u8, u16);
// Per-rune properties as returned by get_ucdrecord. get_ucdrecord obtains a
// pointer to this struct by casting a pointer to a ucd_records element, so the
// field order and widths here must not be changed independently of that table.
type ucd_record = struct {
// Cast to [[gc]] by [[rune_gc]].
category: u8,
// NOTE(review): presumably the canonical combining class -- not read anywhere
// in the visible code; confirm.
combining: u8,
// Cast to [[bidi]] by [[rune_bidi]].
bidirectional: u8,
// NOTE(review): presumably a 0/1 flag for the Bidi_Mirrored property -- not
// read anywhere in the visible code; confirm.
mirrored: u8,
// NOTE(review): presumably an East_Asian_Width class index -- not read
// anywhere in the visible code; confirm.
east_asian_width: u8,
// Cast to [[script]] by [[rune_script]].
script: u16,
};
// Looks up the [[ucd_record]] for a rune through the two-stage index1/index2
// tables: index1 is indexed by the high bits of the code point (everything
// above UCD_RECORD_SHIFT) and selects a block, and index2 is indexed by that
// block number combined with the low UCD_RECORD_SHIFT bits of the code point.
//
// Code points at or above 0x110000 skip the table walk and resolve to record
// 0 of ucd_records.
fn get_ucdrecord(rn: rune) *ucd_record = {
	const code = rn: u32;
	// Kept as u32 rather than u16: the block number is shifted left by
	// UCD_RECORD_SHIFT below, and doing that shift in 16 bits would silently
	// drop the high bits once the block number reaches
	// 2^(16 - UCD_RECORD_SHIFT).
	let index = 0u32;
	if (code < 0x110000) {
		// Selects the offset of the code point within its block.
		const mask = (1u32 << UCD_RECORD_SHIFT) - 1;
		const block = index1[code >> UCD_RECORD_SHIFT]: u32;
		index = index2[(block << UCD_RECORD_SHIFT) + (code & mask)]: u32;
	};
	return &ucd_records[index]: *ucd_record;
};
// Unicode character General_Category attribute.
//
// Each member is annotated with its two-letter category code, which is also
// the string returned for it by [[gc_code]]. The enumerator values are
// implicit (declaration order starting at zero), and [[rune_gc]] produces a
// value of this type by casting the category byte of a database record, so the
// declaration order must stay in step with the values stored in that table.
export type gc = enum u8 {
CONTROL, // Cc
FORMAT, // Cf
UNASSIGNED, // Cn
PRIVATE_USE, // Co
SURROGATE, // Cs
LOWERCASE_LETTER, // Ll
MODIFIER_LETTER, // Lm
OTHER_LETTER, // Lo
TITLECASE_LETTER, // Lt
UPPERCASE_LETTER, // Lu
SPACING_MARK, // Mc
ENCLOSING_MARK, // Me
NON_SPACING_MARK, // Mn
DECIMAL_NUMBER, // Nd
LETTER_NUMBER, // Nl
OTHER_NUMBER, // No
CONNECT_PUNCTUATION, // Pc
DASH_PUNCTUATION, // Pd
CLOSE_PUNCTUATION, // Pe
FINAL_PUNCTUATION, // Pf
INITIAL_PUNCTUATION, // Pi
OTHER_PUNCTUATION, // Po
OPEN_PUNCTUATION, // Ps
CURRENCY_SYMBOL, // Sc
MODIFIER_SYMBOL, // Sk
MATH_SYMBOL, // Sm
OTHER_SYMBOL, // So
LINE_SEPARATOR, // Zl
PARAGRAPH_SEPARATOR, // Zp
SPACE_SEPARATOR, // Zs
};
// Returns the General_Category ([[gc]]) of a rune, read from the category
// field of the rune's database record.
export fn rune_gc(rn: rune) gc = {
	const rec = get_ucdrecord(rn);
	return rec.category: gc;
};
// Maps a [[gc]] value to its two-character General_Category code, for example
// "Lu" for [[gc::UPPERCASE_LETTER]]. The returned string is a static literal.
export fn gc_code(v: gc) const str = {
	return switch (v) {
	case gc::CONTROL => yield "Cc";
	case gc::FORMAT => yield "Cf";
	case gc::UNASSIGNED => yield "Cn";
	case gc::PRIVATE_USE => yield "Co";
	case gc::SURROGATE => yield "Cs";
	case gc::LOWERCASE_LETTER => yield "Ll";
	case gc::MODIFIER_LETTER => yield "Lm";
	case gc::OTHER_LETTER => yield "Lo";
	case gc::TITLECASE_LETTER => yield "Lt";
	case gc::UPPERCASE_LETTER => yield "Lu";
	case gc::SPACING_MARK => yield "Mc";
	case gc::ENCLOSING_MARK => yield "Me";
	case gc::NON_SPACING_MARK => yield "Mn";
	case gc::DECIMAL_NUMBER => yield "Nd";
	case gc::LETTER_NUMBER => yield "Nl";
	case gc::OTHER_NUMBER => yield "No";
	case gc::CONNECT_PUNCTUATION => yield "Pc";
	case gc::DASH_PUNCTUATION => yield "Pd";
	case gc::CLOSE_PUNCTUATION => yield "Pe";
	case gc::FINAL_PUNCTUATION => yield "Pf";
	case gc::INITIAL_PUNCTUATION => yield "Pi";
	case gc::OTHER_PUNCTUATION => yield "Po";
	case gc::OPEN_PUNCTUATION => yield "Ps";
	case gc::CURRENCY_SYMBOL => yield "Sc";
	case gc::MODIFIER_SYMBOL => yield "Sk";
	case gc::MATH_SYMBOL => yield "Sm";
	case gc::OTHER_SYMBOL => yield "So";
	case gc::LINE_SEPARATOR => yield "Zl";
	case gc::PARAGRAPH_SEPARATOR => yield "Zp";
	case gc::SPACE_SEPARATOR => yield "Zs";
	};
};
// Bidirectional classification.
//
// Apart from UNKNOWN, the member names are the short aliases of the Unicode
// Bidi_Class property values; the corresponding long names are noted next to
// each member. The enumerator values are implicit (declaration order starting
// at zero), and [[rune_bidi]] produces a value of this type by casting the
// bidirectional byte of a database record, so the declaration order must stay
// in step with the values stored in that table.
export type bidi = enum u8 {
UNKNOWN,
L, // Left_To_Right
LRE, // Left_To_Right_Embedding
LRO, // Left_To_Right_Override
R, // Right_To_Left
AL, // Arabic_Letter
RLE, // Right_To_Left_Embedding
RLO, // Right_To_Left_Override
PDF, // Pop_Directional_Format
EN, // European_Number
ES, // European_Separator
ET, // European_Terminator
AN, // Arabic_Number
CS, // Common_Separator
NSM, // Nonspacing_Mark
BN, // Boundary_Neutral
B, // Paragraph_Separator
S, // Segment_Separator
WS, // White_Space
ON, // Other_Neutral
LRI, // Left_To_Right_Isolate
RLI, // Right_To_Left_Isolate
FSI, // First_Strong_Isolate
PDI, // Pop_Directional_Isolate
};
// Returns the bidirectional class ([[bidi]]) of a rune, read from the
// bidirectional field of the rune's database record.
export fn rune_bidi(rn: rune) bidi = {
	const rec = get_ucdrecord(rn);
	return rec.bidirectional: bidi;
};
// Unicode character Script attribute.
//
// Each member is annotated with its four-letter script code, which is also the
// string returned for it by [[script_code]]. After the three leading special
// values (COMMON, INHERITED, UNKNOWN) the members are sorted by that code.
// The enumerator values are implicit (declaration order starting at zero), and
// [[rune_script]] produces a value of this type by casting the script field of
// a database record, so the declaration order must stay in step with the
// values stored in that table.
export type script = enum u16 {
COMMON, // Zyyy
INHERITED, // Zinh
UNKNOWN, // Zzzz
ADLAM, // Adlm
CAUCASIAN_ALBANIAN, // Aghb
AHOM, // Ahom
ARABIC, // Arab
IMPERIAL_ARAMAIC, // Armi
ARMENIAN, // Armn
AVESTAN, // Avst
BALINESE, // Bali
BAMUM, // Bamu
BASSA_VAH, // Bass
BATAK, // Batk
BENGALI, // Beng
BHAIKSUKI, // Bhks
BOPOMOFO, // Bopo
BRAHMI, // Brah
BRAILLE, // Brai
BUGINESE, // Bugi
BUHID, // Buhd
CHAKMA, // Cakm
CANADIAN_SYLLABICS, // Cans
CARIAN, // Cari
CHAM, // Cham
CHEROKEE, // Cher
CHORASMIAN, // Chrs
COPTIC, // Copt
CYPRO_MINOAN, // Cpmn
CYPRIOT, // Cprt
CYRILLIC, // Cyrl
DEVANAGARI, // Deva
DIVES_AKURU, // Diak
DOGRA, // Dogr
DESERET, // Dsrt
DUPLOYAN, // Dupl
EGYPTIAN_HIEROGLYPHS, // Egyp
ELBASAN, // Elba
ELYMAIC, // Elym
ETHIOPIC, // Ethi
GEORGIAN, // Geor
GLAGOLITIC, // Glag
GUNJALA_GONDI, // Gong
MASARAM_GONDI, // Gonm
GOTHIC, // Goth
GRANTHA, // Gran
GREEK, // Grek
GUJARATI, // Gujr
GURMUKHI, // Guru
HANGUL, // Hang
HAN, // Hani
HANUNOO, // Hano
HATRAN, // Hatr
HEBREW, // Hebr
HIRAGANA, // Hira
ANATOLIAN_HIEROGLYPHS, // Hluw
PAHAWH_HMONG, // Hmng
NYIAKENG_PUACHUE_HMONG, // Hmnp
OLD_HUNGARIAN, // Hung
OLD_ITALIC, // Ital
JAVANESE, // Java
KAYAH_LI, // Kali
KATAKANA, // Kana
KAWI, // Kawi
KHAROSHTHI, // Khar
KHMER, // Khmr
KHOJKI, // Khoj
KHITAN_SMALL_SCRIPT, // Kits
KANNADA, // Knda
KAITHI, // Kthi
TAI_THAM, // Lana
LAO, // Laoo
LATIN, // Latn
LEPCHA, // Lepc
LIMBU, // Limb
LINEAR_A, // Lina
LINEAR_B, // Linb
LISU, // Lisu
LYCIAN, // Lyci
LYDIAN, // Lydi
MAHAJANI, // Mahj
MAKASAR, // Maka
MANDAIC, // Mand
MANICHAEAN, // Mani
MARCHEN, // Marc
MEDEFAIDRIN, // Medf
MENDE_KIKAKUI, // Mend
MEROITIC_CURSIVE, // Merc
MEROITIC_HIEROGLYPHS, // Mero
MALAYALAM, // Mlym
MODI, // Modi
MONGOLIAN, // Mong
MRO, // Mroo
MEETEI_MAYEK, // Mtei
MULTANI, // Mult
MYANMAR, // Mymr
NAG_MUNDARI, // Nagm
NANDINAGARI, // Nand
OLD_NORTH_ARABIAN, // Narb
NABATAEAN, // Nbat
NEWA, // Newa
NKO, // Nkoo
NUSHU, // Nshu
OGHAM, // Ogam
OL_CHIKI, // Olck
OLD_TURKIC, // Orkh
ORIYA, // Orya
OSAGE, // Osge
OSMANYA, // Osma
OLD_UYGHUR, // Ougr
PALMYRENE, // Palm
PAU_CIN_HAU, // Pauc
OLD_PERMIC, // Perm
PHAGS_PA, // Phag
INSCRIPTIONAL_PAHLAVI, // Phli
PSALTER_PAHLAVI, // Phlp
PHOENICIAN, // Phnx
MIAO, // Plrd
INSCRIPTIONAL_PARTHIAN, // Prti
REJANG, // Rjng
HANIFI_ROHINGYA, // Rohg
RUNIC, // Runr
SAMARITAN, // Samr
OLD_SOUTH_ARABIAN, // Sarb
SAURASHTRA, // Saur
SIGNWRITING, // Sgnw
SHAVIAN, // Shaw
SHARADA, // Shrd
SIDDHAM, // Sidd
KHUDAWADI, // Sind
SINHALA, // Sinh
SOGDIAN, // Sogd
OLD_SOGDIAN, // Sogo
SORA_SOMPENG, // Sora
SOYOMBO, // Soyo
SUNDANESE, // Sund
SYLOTI_NAGRI, // Sylo
SYRIAC, // Syrc
TAGBANWA, // Tagb
TAKRI, // Takr
TAI_LE, // Tale
NEW_TAI_LUE, // Talu
TAMIL, // Taml
TANGUT, // Tang
TAI_VIET, // Tavt
TELUGU, // Telu
TIFINAGH, // Tfng
TAGALOG, // Tglg
THAANA, // Thaa
THAI, // Thai
TIBETAN, // Tibt
TIRHUTA, // Tirh
TANGSA, // Tnsa
TOTO, // Toto
UGARITIC, // Ugar
VAI, // Vaii
VITHKUQI, // Vith
WARANG_CITI, // Wara
WANCHO, // Wcho
OLD_PERSIAN, // Xpeo
CUNEIFORM, // Xsux
YEZIDI, // Yezi
YI, // Yiii
ZANABAZAR_SQUARE, // Zanb
MATH, // Zmth
};
// Returns the [[script]] of a rune, read from the script field of the rune's
// database record.
export fn rune_script(rn: rune) script = {
	const rec = get_ucdrecord(rn);
	return rec.script: script;
};
// Maps a [[script]] value to its four-character script code, for example
// "Latn" for [[script::LATIN]]. The returned string is a static literal.
//
// Cases are listed in the declaration order of [[script]].
export fn script_code(sc: script) const str = {
	return switch (sc) {
	case script::COMMON => yield "Zyyy";
	case script::INHERITED => yield "Zinh";
	case script::UNKNOWN => yield "Zzzz";
	case script::ADLAM => yield "Adlm";
	case script::CAUCASIAN_ALBANIAN => yield "Aghb";
	case script::AHOM => yield "Ahom";
	case script::ARABIC => yield "Arab";
	case script::IMPERIAL_ARAMAIC => yield "Armi";
	case script::ARMENIAN => yield "Armn";
	case script::AVESTAN => yield "Avst";
	case script::BALINESE => yield "Bali";
	case script::BAMUM => yield "Bamu";
	case script::BASSA_VAH => yield "Bass";
	case script::BATAK => yield "Batk";
	case script::BENGALI => yield "Beng";
	case script::BHAIKSUKI => yield "Bhks";
	case script::BOPOMOFO => yield "Bopo";
	case script::BRAHMI => yield "Brah";
	case script::BRAILLE => yield "Brai";
	case script::BUGINESE => yield "Bugi";
	case script::BUHID => yield "Buhd";
	case script::CHAKMA => yield "Cakm";
	case script::CANADIAN_SYLLABICS => yield "Cans";
	case script::CARIAN => yield "Cari";
	case script::CHAM => yield "Cham";
	case script::CHEROKEE => yield "Cher";
	case script::CHORASMIAN => yield "Chrs";
	case script::COPTIC => yield "Copt";
	case script::CYPRO_MINOAN => yield "Cpmn";
	case script::CYPRIOT => yield "Cprt";
	case script::CYRILLIC => yield "Cyrl";
	case script::DEVANAGARI => yield "Deva";
	case script::DIVES_AKURU => yield "Diak";
	case script::DOGRA => yield "Dogr";
	case script::DESERET => yield "Dsrt";
	case script::DUPLOYAN => yield "Dupl";
	case script::EGYPTIAN_HIEROGLYPHS => yield "Egyp";
	case script::ELBASAN => yield "Elba";
	case script::ELYMAIC => yield "Elym";
	case script::ETHIOPIC => yield "Ethi";
	case script::GEORGIAN => yield "Geor";
	case script::GLAGOLITIC => yield "Glag";
	case script::GUNJALA_GONDI => yield "Gong";
	case script::MASARAM_GONDI => yield "Gonm";
	case script::GOTHIC => yield "Goth";
	case script::GRANTHA => yield "Gran";
	case script::GREEK => yield "Grek";
	case script::GUJARATI => yield "Gujr";
	case script::GURMUKHI => yield "Guru";
	case script::HANGUL => yield "Hang";
	case script::HAN => yield "Hani";
	case script::HANUNOO => yield "Hano";
	case script::HATRAN => yield "Hatr";
	case script::HEBREW => yield "Hebr";
	case script::HIRAGANA => yield "Hira";
	case script::ANATOLIAN_HIEROGLYPHS => yield "Hluw";
	case script::PAHAWH_HMONG => yield "Hmng";
	case script::NYIAKENG_PUACHUE_HMONG => yield "Hmnp";
	case script::OLD_HUNGARIAN => yield "Hung";
	case script::OLD_ITALIC => yield "Ital";
	case script::JAVANESE => yield "Java";
	case script::KAYAH_LI => yield "Kali";
	case script::KATAKANA => yield "Kana";
	case script::KAWI => yield "Kawi";
	case script::KHAROSHTHI => yield "Khar";
	case script::KHMER => yield "Khmr";
	case script::KHOJKI => yield "Khoj";
	case script::KHITAN_SMALL_SCRIPT => yield "Kits";
	case script::KANNADA => yield "Knda";
	case script::KAITHI => yield "Kthi";
	case script::TAI_THAM => yield "Lana";
	case script::LAO => yield "Laoo";
	case script::LATIN => yield "Latn";
	case script::LEPCHA => yield "Lepc";
	case script::LIMBU => yield "Limb";
	case script::LINEAR_A => yield "Lina";
	case script::LINEAR_B => yield "Linb";
	case script::LISU => yield "Lisu";
	case script::LYCIAN => yield "Lyci";
	case script::LYDIAN => yield "Lydi";
	case script::MAHAJANI => yield "Mahj";
	case script::MAKASAR => yield "Maka";
	case script::MANDAIC => yield "Mand";
	case script::MANICHAEAN => yield "Mani";
	case script::MARCHEN => yield "Marc";
	case script::MEDEFAIDRIN => yield "Medf";
	case script::MENDE_KIKAKUI => yield "Mend";
	case script::MEROITIC_CURSIVE => yield "Merc";
	case script::MEROITIC_HIEROGLYPHS => yield "Mero";
	case script::MALAYALAM => yield "Mlym";
	case script::MODI => yield "Modi";
	case script::MONGOLIAN => yield "Mong";
	case script::MRO => yield "Mroo";
	case script::MEETEI_MAYEK => yield "Mtei";
	case script::MULTANI => yield "Mult";
	case script::MYANMAR => yield "Mymr";
	case script::NAG_MUNDARI => yield "Nagm";
	case script::NANDINAGARI => yield "Nand";
	case script::OLD_NORTH_ARABIAN => yield "Narb";
	case script::NABATAEAN => yield "Nbat";
	case script::NEWA => yield "Newa";
	case script::NKO => yield "Nkoo";
	case script::NUSHU => yield "Nshu";
	case script::OGHAM => yield "Ogam";
	case script::OL_CHIKI => yield "Olck";
	case script::OLD_TURKIC => yield "Orkh";
	case script::ORIYA => yield "Orya";
	case script::OSAGE => yield "Osge";
	case script::OSMANYA => yield "Osma";
	case script::OLD_UYGHUR => yield "Ougr";
	case script::PALMYRENE => yield "Palm";
	case script::PAU_CIN_HAU => yield "Pauc";
	case script::OLD_PERMIC => yield "Perm";
	case script::PHAGS_PA => yield "Phag";
	case script::INSCRIPTIONAL_PAHLAVI => yield "Phli";
	case script::PSALTER_PAHLAVI => yield "Phlp";
	case script::PHOENICIAN => yield "Phnx";
	case script::MIAO => yield "Plrd";
	case script::INSCRIPTIONAL_PARTHIAN => yield "Prti";
	case script::REJANG => yield "Rjng";
	case script::HANIFI_ROHINGYA => yield "Rohg";
	case script::RUNIC => yield "Runr";
	case script::SAMARITAN => yield "Samr";
	case script::OLD_SOUTH_ARABIAN => yield "Sarb";
	case script::SAURASHTRA => yield "Saur";
	case script::SIGNWRITING => yield "Sgnw";
	case script::SHAVIAN => yield "Shaw";
	case script::SHARADA => yield "Shrd";
	case script::SIDDHAM => yield "Sidd";
	case script::KHUDAWADI => yield "Sind";
	case script::SINHALA => yield "Sinh";
	case script::SOGDIAN => yield "Sogd";
	case script::OLD_SOGDIAN => yield "Sogo";
	case script::SORA_SOMPENG => yield "Sora";
	case script::SOYOMBO => yield "Soyo";
	case script::SUNDANESE => yield "Sund";
	case script::SYLOTI_NAGRI => yield "Sylo";
	case script::SYRIAC => yield "Syrc";
	case script::TAGBANWA => yield "Tagb";
	case script::TAKRI => yield "Takr";
	case script::TAI_LE => yield "Tale";
	case script::NEW_TAI_LUE => yield "Talu";
	case script::TAMIL => yield "Taml";
	case script::TANGUT => yield "Tang";
	case script::TAI_VIET => yield "Tavt";
	case script::TELUGU => yield "Telu";
	case script::TIFINAGH => yield "Tfng";
	case script::TAGALOG => yield "Tglg";
	case script::THAANA => yield "Thaa";
	case script::THAI => yield "Thai";
	case script::TIBETAN => yield "Tibt";
	case script::TIRHUTA => yield "Tirh";
	case script::TANGSA => yield "Tnsa";
	case script::TOTO => yield "Toto";
	case script::UGARITIC => yield "Ugar";
	case script::VAI => yield "Vaii";
	case script::VITHKUQI => yield "Vith";
	case script::WARANG_CITI => yield "Wara";
	case script::WANCHO => yield "Wcho";
	case script::OLD_PERSIAN => yield "Xpeo";
	case script::CUNEIFORM => yield "Xsux";
	case script::YEZIDI => yield "Yezi";
	case script::YI => yield "Yiii";
	case script::ZANABAZAR_SQUARE => yield "Zanb";
	case script::MATH => yield "Zmth";
	};
};