kojote/vendor/hare-unicode/unicode/ucd.ha

// Tuple form of one character property record: five u8 values, one u16 and a
// final u8, in the same order and with the same types as the fields of
// ucd_record.
// NOTE(review): presumably the element type of the ucd_records table that
// get_ucdrecord indexes; that table is not declared in this part of the file,
// so confirm.
type ucd_encodedrec = (u8, u8, u8, u8, u8, u16, u8);

// Per-character property record, as returned by get_ucdrecord. The fields hold
// raw integers; the rune_* accessors cast them to the matching enum types.
// get_ucdrecord produces this type through a pointer cast, so the field order
// and types must not be changed independently of the table data.
type ucd_record = struct {
	// General category; rune_gc casts this to gc.
	category: u8,
	// Not read in this part of the file.
	// NOTE(review): presumably the canonical combining class; confirm.
	combining: u8,
	// Bidirectional class; rune_bidi casts this to bidi.
	bidirectional: u8,
	// Not read in this part of the file.
	// NOTE(review): presumably a mirrored flag; confirm.
	mirrored: u8,
	// Not read in this part of the file.
	// NOTE(review): presumably an East Asian Width class; confirm.
	east_asian_width: u8,
	// Script; rune_script casts this to script.
	script: u16,
	// Line break class; rune_line_break casts this to line_break.
	line_break: u8,
};

// Finds the property record for a rune through a two-level index. index1 is
// indexed by the code point shifted right by UCD_RECORD_SHIFT; that result,
// shifted left by UCD_RECORD_SHIFT and added to the low UCD_RECORD_SHIFT bits
// of the code point, indexes index2, whose value is the position of the record
// in ucd_records. Values of 0x110000 and above bypass both index tables and
// use record 0.
fn get_ucdrecord(rn: rune) *ucd_record = {
	const cp = rn: u32;
	if (cp >= 0x110000) {
		// Beyond the last code point: use the first record.
		return &ucd_records[0u16]: *ucd_record;
	};

	// Explicit u16: the element type of index1 is not visible here, and
	// the width of this value affects the shift below.
	const block: u16 = index1[cp >> UCD_RECORD_SHIFT];
	const low = cp & ((1 << UCD_RECORD_SHIFT) - 1);
	const slot: u16 = index2[(block << UCD_RECORD_SHIFT) + low];
	return &ucd_records[slot]: *ucd_record;
};

// Unicode character General_Category attribute. Members have no explicit
// values; [[rune_gc]] obtains them by casting the u8 category field of a
// rune's record, so member order is significant.
// NOTE(review): the order presumably has to match the generated record data;
// confirm before reordering or inserting members.
export type gc = enum u8 {
	Cc,	// Control
	Cf,	// Format
	Cn,	// Unassigned
	Co,	// Private use
	Cs,	// Surrogate
	Ll,	// Lowercase letter
	Lm,	// Modifier letter
	Lo,	// Other letter
	Lt,	// Titlecase letter
	Lu,	// Uppercase letter
	Mc,	// Spacing mark
	Me,	// Enclosing mark
	Mn,	// Non-spacing mark
	Nd,	// Decimal number
	Nl,	// Letter number
	No,	// Other number
	Pc,	// Connect punctuation
	Pd,	// Dash punctuation
	Pe,	// Close punctuation
	Pf,	// Final punctuation
	Pi,	// Initial punctuation
	Po,	// Other punctuation
	Ps,	// Open punctuation
	Sc,	// Currency symbol
	Sk,	// Modifier symbol
	Sm,	// Math symbol
	So,	// Other symbol
	Zl,	// Line separator
	Zp,	// Paragraph separator
	Zs,	// Space separator
};

// Returns the [[gc]] (General_Category) corresponding to this rune. The value
// is the u8 category field of the rune's record, cast directly to [[gc]].
export fn rune_gc(rn: rune) gc = {
	const rec = get_ucdrecord(rn);
	return rec.category: gc;
};

// Returns the descriptive English name of a [[gc]] value, for example
// "Lowercase letter" for gc::Ll. The returned strings are static.
export fn gc_name(v: gc) const str = {
	return switch (v) {
	case gc::Cc => yield "Control";
	case gc::Cf => yield "Format";
	case gc::Cn => yield "Unassigned";
	case gc::Co => yield "Private use";
	case gc::Cs => yield "Surrogate";
	case gc::Ll => yield "Lowercase letter";
	case gc::Lm => yield "Modifier letter";
	case gc::Lo => yield "Other letter";
	case gc::Lt => yield "Titlecase letter";
	case gc::Lu => yield "Uppercase letter";
	case gc::Mc => yield "Spacing mark";
	case gc::Me => yield "Enclosing mark";
	case gc::Mn => yield "Non-spacing mark";
	case gc::Nd => yield "Decimal number";
	case gc::Nl => yield "Letter number";
	case gc::No => yield "Other number";
	case gc::Pc => yield "Connect punctuation";
	case gc::Pd => yield "Dash punctuation";
	case gc::Pe => yield "Close punctuation";
	case gc::Pf => yield "Final punctuation";
	case gc::Pi => yield "Initial punctuation";
	case gc::Po => yield "Other punctuation";
	case gc::Ps => yield "Open punctuation";
	case gc::Sc => yield "Currency symbol";
	case gc::Sk => yield "Modifier symbol";
	case gc::Sm => yield "Math symbol";
	case gc::So => yield "Other symbol";
	case gc::Zl => yield "Line separator";
	case gc::Zp => yield "Paragraph separator";
	case gc::Zs => yield "Space separator";
	};
};

// Returns the two-character abbreviation of a [[gc]] value. The string is
// spelled exactly like the enum member, so gc::Lu gives "Lu".
export fn gc_code(v: gc) const str = {
	return switch (v) {
	case gc::Cc => yield "Cc";
	case gc::Cf => yield "Cf";
	case gc::Cn => yield "Cn";
	case gc::Co => yield "Co";
	case gc::Cs => yield "Cs";
	case gc::Ll => yield "Ll";
	case gc::Lm => yield "Lm";
	case gc::Lo => yield "Lo";
	case gc::Lt => yield "Lt";
	case gc::Lu => yield "Lu";
	case gc::Mc => yield "Mc";
	case gc::Me => yield "Me";
	case gc::Mn => yield "Mn";
	case gc::Nd => yield "Nd";
	case gc::Nl => yield "Nl";
	case gc::No => yield "No";
	case gc::Pc => yield "Pc";
	case gc::Pd => yield "Pd";
	case gc::Pe => yield "Pe";
	case gc::Pf => yield "Pf";
	case gc::Pi => yield "Pi";
	case gc::Po => yield "Po";
	case gc::Ps => yield "Ps";
	case gc::Sc => yield "Sc";
	case gc::Sk => yield "Sk";
	case gc::Sm => yield "Sm";
	case gc::So => yield "So";
	case gc::Zl => yield "Zl";
	case gc::Zp => yield "Zp";
	case gc::Zs => yield "Zs";
	};
};

// Bidirectional classification. Members have no explicit values; [[rune_bidi]]
// obtains them by casting the u8 bidirectional field of a rune's record, so
// member order is significant.
// NOTE(review): the members other than UNKNOWN look like Unicode Bidi_Class
// short aliases, and the order presumably has to match the generated record
// data; confirm both before editing.
export type bidi = enum u8 {
	UNKNOWN,
	L,
	LRE,
	LRO,
	R,
	AL,
	RLE,
	RLO,
	PDF,
	EN,
	ES,
	ET,
	AN,
	CS,
	NSM,
	BN,
	B,
	S,
	WS,
	ON,
	LRI,
	RLI,
	FSI,
	PDI,
};

// Looks up the record for a rune and returns its bidirectional field as a
// [[bidi]] value.
export fn rune_bidi(rn: rune) bidi = {
	const rec = get_ucdrecord(rn);
	return rec.bidirectional: bidi;
};

// Unicode character Script attribute. The trailing comment on each member is
// the four-character code that [[script_code]] returns for it. Members have no
// explicit values; [[rune_script]] obtains them by casting the u16 script field
// of a rune's record, so member order is significant.
// NOTE(review): the order presumably has to match the generated record data;
// confirm before reordering or inserting members.
export type script = enum u16 {
	COMMON,			// Zyyy
	INHERITED,		// Zinh
	UNKNOWN,		// Zzzz
	ADLAM,			// Adlm
	CAUCASIAN_ALBANIAN,	// Aghb
	AHOM,			// Ahom
	ARABIC,			// Arab
	IMPERIAL_ARAMAIC,	// Armi
	ARMENIAN,		// Armn
	AVESTAN,		// Avst
	BALINESE,		// Bali
	BAMUM,			// Bamu
	BASSA_VAH,		// Bass
	BATAK,			// Batk
	BENGALI,		// Beng
	BHAIKSUKI,		// Bhks
	BOPOMOFO,		// Bopo
	BRAHMI,			// Brah
	BRAILLE,		// Brai
	BUGINESE,		// Bugi
	BUHID,			// Buhd
	CHAKMA,			// Cakm
	CANADIAN_SYLLABICS,	// Cans
	CARIAN,			// Cari
	CHAM,			// Cham
	CHEROKEE,		// Cher
	CHORASMIAN,		// Chrs
	COPTIC,			// Copt
	CYPRO_MINOAN,		// Cpmn
	CYPRIOT,		// Cprt
	CYRILLIC,		// Cyrl
	DEVANAGARI,		// Deva
	DIVES_AKURU,		// Diak
	DOGRA,			// Dogr
	DESERET,		// Dsrt
	DUPLOYAN,		// Dupl
	EGYPTIAN_HIEROGLYPHS,	// Egyp
	ELBASAN,		// Elba
	ELYMAIC,		// Elym
	ETHIOPIC,		// Ethi
	GEORGIAN,		// Geor
	GLAGOLITIC,		// Glag
	GUNJALA_GONDI,		// Gong
	MASARAM_GONDI,		// Gonm
	GOTHIC,			// Goth
	GRANTHA,		// Gran
	GREEK,			// Grek
	GUJARATI,		// Gujr
	GURMUKHI,		// Guru
	HANGUL,			// Hang
	HAN,			// Hani
	HANUNOO,		// Hano
	HATRAN,			// Hatr
	HEBREW,			// Hebr
	HIRAGANA,		// Hira
	ANATOLIAN_HIEROGLYPHS,	// Hluw
	PAHAWH_HMONG,		// Hmng
	NYIAKENG_PUACHUE_HMONG,	// Hmnp
	OLD_HUNGARIAN,		// Hung
	OLD_ITALIC,		// Ital
	JAVANESE,		// Java
	KAYAH_LI,		// Kali
	KATAKANA,		// Kana
	KAWI,			// Kawi
	KHAROSHTHI,		// Khar
	KHMER,			// Khmr
	KHOJKI,			// Khoj
	KHITAN_SMALL_SCRIPT,	// Kits
	KANNADA,		// Knda
	KAITHI,			// Kthi
	TAI_THAM,		// Lana
	LAO,			// Laoo
	LATIN,			// Latn
	LEPCHA,			// Lepc
	LIMBU,			// Limb
	LINEAR_A,		// Lina
	LINEAR_B,		// Linb
	LISU,			// Lisu
	LYCIAN,			// Lyci
	LYDIAN,			// Lydi
	MAHAJANI,		// Mahj
	MAKASAR,		// Maka
	MANDAIC,		// Mand
	MANICHAEAN,		// Mani
	MARCHEN,		// Marc
	MEDEFAIDRIN,		// Medf
	MENDE_KIKAKUI,		// Mend
	MEROITIC_CURSIVE,	// Merc
	MEROITIC_HIEROGLYPHS,	// Mero
	MALAYALAM,		// Mlym
	MODI,			// Modi
	MONGOLIAN,		// Mong
	MRO,			// Mroo
	MEETEI_MAYEK,		// Mtei
	MULTANI,		// Mult
	MYANMAR,		// Mymr
	NAG_MUNDARI,		// Nagm
	NANDINAGARI,		// Nand
	OLD_NORTH_ARABIAN,	// Narb
	NABATAEAN,		// Nbat
	NEWA,			// Newa
	NKO,			// Nkoo
	NUSHU,			// Nshu
	OGHAM,			// Ogam
	OL_CHIKI,		// Olck
	OLD_TURKIC,		// Orkh
	ORIYA,			// Orya
	OSAGE,			// Osge
	OSMANYA,		// Osma
	OLD_UYGHUR,		// Ougr
	PALMYRENE,		// Palm
	PAU_CIN_HAU,		// Pauc
	OLD_PERMIC,		// Perm
	PHAGS_PA,		// Phag
	INSCRIPTIONAL_PAHLAVI,	// Phli
	PSALTER_PAHLAVI,	// Phlp
	PHOENICIAN,		// Phnx
	MIAO,			// Plrd
	INSCRIPTIONAL_PARTHIAN,	// Prti
	REJANG,			// Rjng
	HANIFI_ROHINGYA,	// Rohg
	RUNIC,			// Runr
	SAMARITAN,		// Samr
	OLD_SOUTH_ARABIAN,	// Sarb
	SAURASHTRA,		// Saur
	SIGNWRITING,		// Sgnw
	SHAVIAN,		// Shaw
	SHARADA,		// Shrd
	SIDDHAM,		// Sidd
	KHUDAWADI,		// Sind
	SINHALA,		// Sinh
	SOGDIAN,		// Sogd
	OLD_SOGDIAN,		// Sogo
	SORA_SOMPENG,		// Sora
	SOYOMBO,		// Soyo
	SUNDANESE,		// Sund
	SYLOTI_NAGRI,		// Sylo
	SYRIAC,			// Syrc
	TAGBANWA,		// Tagb
	TAKRI,			// Takr
	TAI_LE,			// Tale
	NEW_TAI_LUE,		// Talu
	TAMIL,			// Taml
	TANGUT,			// Tang
	TAI_VIET,		// Tavt
	TELUGU,			// Telu
	TIFINAGH,		// Tfng
	TAGALOG,		// Tglg
	THAANA,			// Thaa
	THAI,			// Thai
	TIBETAN,		// Tibt
	TIRHUTA,		// Tirh
	TANGSA,			// Tnsa
	TOTO,			// Toto
	UGARITIC,		// Ugar
	VAI,			// Vaii
	VITHKUQI,		// Vith
	WARANG_CITI,		// Wara
	WANCHO,			// Wcho
	OLD_PERSIAN,		// Xpeo
	CUNEIFORM,		// Xsux
	YEZIDI,			// Yezi
	YI,			// Yiii
	ZANABAZAR_SQUARE,	// Zanb
	MATH,			// Zmth
};

// Looks up the record for a rune and returns its script field as a [[script]]
// value.
export fn rune_script(rn: rune) script = {
	const rec = get_ucdrecord(rn);
	return rec.script: script;
};

// Returns the four-character code of a [[script]] value, for example "Latn"
// for script::LATIN. The returned strings are static.
export fn script_code(sc: script) const str = {
	return switch (sc) {
	case script::COMMON => yield "Zyyy";
	case script::INHERITED => yield "Zinh";
	case script::UNKNOWN => yield "Zzzz";
	case script::ARABIC => yield "Arab";
	case script::ARMENIAN => yield "Armn";
	case script::BENGALI => yield "Beng";
	case script::CYRILLIC => yield "Cyrl";
	case script::DEVANAGARI => yield "Deva";
	case script::GEORGIAN => yield "Geor";
	case script::GREEK => yield "Grek";
	case script::GUJARATI => yield "Gujr";
	case script::GURMUKHI => yield "Guru";
	case script::HANGUL => yield "Hang";
	case script::HAN => yield "Hani";
	case script::HEBREW => yield "Hebr";
	case script::HIRAGANA => yield "Hira";
	case script::KANNADA => yield "Knda";
	case script::KATAKANA => yield "Kana";
	case script::LAO => yield "Laoo";
	case script::LATIN => yield "Latn";
	case script::MALAYALAM => yield "Mlym";
	case script::ORIYA => yield "Orya";
	case script::TAMIL => yield "Taml";
	case script::TELUGU => yield "Telu";
	case script::THAI => yield "Thai";
	case script::TIBETAN => yield "Tibt";
	case script::BOPOMOFO => yield "Bopo";
	case script::BRAILLE => yield "Brai";
	case script::CANADIAN_SYLLABICS => yield "Cans";
	case script::CHEROKEE => yield "Cher";
	case script::ETHIOPIC => yield "Ethi";
	case script::KHMER => yield "Khmr";
	case script::MONGOLIAN => yield "Mong";
	case script::MYANMAR => yield "Mymr";
	case script::OGHAM => yield "Ogam";
	case script::RUNIC => yield "Runr";
	case script::SINHALA => yield "Sinh";
	case script::SYRIAC => yield "Syrc";
	case script::THAANA => yield "Thaa";
	case script::YI => yield "Yiii";
	case script::DESERET => yield "Dsrt";
	case script::GOTHIC => yield "Goth";
	case script::OLD_ITALIC => yield "Ital";
	case script::BUHID => yield "Buhd";
	case script::HANUNOO => yield "Hano";
	case script::TAGALOG => yield "Tglg";
	case script::TAGBANWA => yield "Tagb";
	case script::CYPRIOT => yield "Cprt";
	case script::LIMBU => yield "Limb";
	case script::LINEAR_B => yield "Linb";
	case script::OSMANYA => yield "Osma";
	case script::SHAVIAN => yield "Shaw";
	case script::TAI_LE => yield "Tale";
	case script::UGARITIC => yield "Ugar";
	case script::BUGINESE => yield "Bugi";
	case script::COPTIC => yield "Copt";
	case script::GLAGOLITIC => yield "Glag";
	case script::KHAROSHTHI => yield "Khar";
	case script::NEW_TAI_LUE => yield "Talu";
	case script::OLD_PERSIAN => yield "Xpeo";
	case script::SYLOTI_NAGRI => yield "Sylo";
	case script::TIFINAGH => yield "Tfng";
	case script::BALINESE => yield "Bali";
	case script::CUNEIFORM => yield "Xsux";
	case script::NKO => yield "Nkoo";
	case script::PHAGS_PA => yield "Phag";
	case script::PHOENICIAN => yield "Phnx";
	case script::CARIAN => yield "Cari";
	case script::CHAM => yield "Cham";
	case script::KAYAH_LI => yield "Kali";
	case script::LEPCHA => yield "Lepc";
	case script::LYCIAN => yield "Lyci";
	case script::LYDIAN => yield "Lydi";
	case script::OL_CHIKI => yield "Olck";
	case script::REJANG => yield "Rjng";
	case script::SAURASHTRA => yield "Saur";
	case script::SUNDANESE => yield "Sund";
	case script::VAI => yield "Vaii";
	case script::AVESTAN => yield "Avst";
	case script::BAMUM => yield "Bamu";
	case script::EGYPTIAN_HIEROGLYPHS => yield "Egyp";
	case script::IMPERIAL_ARAMAIC => yield "Armi";
	case script::INSCRIPTIONAL_PAHLAVI => yield "Phli";
	case script::INSCRIPTIONAL_PARTHIAN => yield "Prti";
	case script::JAVANESE => yield "Java";
	case script::KAITHI => yield "Kthi";
	case script::LISU => yield "Lisu";
	case script::MEETEI_MAYEK => yield "Mtei";
	case script::OLD_SOUTH_ARABIAN => yield "Sarb";
	case script::OLD_TURKIC => yield "Orkh";
	case script::SAMARITAN => yield "Samr";
	case script::TAI_THAM => yield "Lana";
	case script::TAI_VIET => yield "Tavt";
	case script::BATAK => yield "Batk";
	case script::BRAHMI => yield "Brah";
	case script::MANDAIC => yield "Mand";
	case script::CHAKMA => yield "Cakm";
	case script::MEROITIC_CURSIVE => yield "Merc";
	case script::MEROITIC_HIEROGLYPHS => yield "Mero";
	case script::MIAO => yield "Plrd";
	case script::SHARADA => yield "Shrd";
	case script::SORA_SOMPENG => yield "Sora";
	case script::TAKRI => yield "Takr";
	case script::BASSA_VAH => yield "Bass";
	case script::CAUCASIAN_ALBANIAN => yield "Aghb";
	case script::DUPLOYAN => yield "Dupl";
	case script::ELBASAN => yield "Elba";
	case script::GRANTHA => yield "Gran";
	case script::KHOJKI => yield "Khoj";
	case script::KHUDAWADI => yield "Sind";
	case script::LINEAR_A => yield "Lina";
	case script::MAHAJANI => yield "Mahj";
	case script::MANICHAEAN => yield "Mani";
	case script::MENDE_KIKAKUI => yield "Mend";
	case script::MODI => yield "Modi";
	case script::MRO => yield "Mroo";
	case script::NABATAEAN => yield "Nbat";
	case script::OLD_NORTH_ARABIAN => yield "Narb";
	case script::OLD_PERMIC => yield "Perm";
	case script::PAHAWH_HMONG => yield "Hmng";
	case script::PALMYRENE => yield "Palm";
	case script::PAU_CIN_HAU => yield "Pauc";
	case script::PSALTER_PAHLAVI => yield "Phlp";
	case script::SIDDHAM => yield "Sidd";
	case script::TIRHUTA => yield "Tirh";
	case script::WARANG_CITI => yield "Wara";
	case script::AHOM => yield "Ahom";
	case script::ANATOLIAN_HIEROGLYPHS => yield "Hluw";
	case script::HATRAN => yield "Hatr";
	case script::MULTANI => yield "Mult";
	case script::OLD_HUNGARIAN => yield "Hung";
	case script::SIGNWRITING => yield "Sgnw";
	case script::ADLAM => yield "Adlm";
	case script::BHAIKSUKI => yield "Bhks";
	case script::MARCHEN => yield "Marc";
	case script::OSAGE => yield "Osge";
	case script::TANGUT => yield "Tang";
	case script::NEWA => yield "Newa";
	case script::MASARAM_GONDI => yield "Gonm";
	case script::NUSHU => yield "Nshu";
	case script::SOYOMBO => yield "Soyo";
	case script::ZANABAZAR_SQUARE => yield "Zanb";
	case script::DOGRA => yield "Dogr";
	case script::GUNJALA_GONDI => yield "Gong";
	case script::HANIFI_ROHINGYA => yield "Rohg";
	case script::MAKASAR => yield "Maka";
	case script::MEDEFAIDRIN => yield "Medf";
	case script::OLD_SOGDIAN => yield "Sogo";
	case script::SOGDIAN => yield "Sogd";
	case script::ELYMAIC => yield "Elym";
	case script::NANDINAGARI => yield "Nand";
	case script::NYIAKENG_PUACHUE_HMONG => yield "Hmnp";
	case script::WANCHO => yield "Wcho";
	case script::CHORASMIAN => yield "Chrs";
	case script::DIVES_AKURU => yield "Diak";
	case script::KHITAN_SMALL_SCRIPT => yield "Kits";
	case script::YEZIDI => yield "Yezi";
	case script::CYPRO_MINOAN => yield "Cpmn";
	case script::OLD_UYGHUR => yield "Ougr";
	case script::TANGSA => yield "Tnsa";
	case script::TOTO => yield "Toto";
	case script::VITHKUQI => yield "Vith";
	case script::MATH => yield "Zmth";
	case script::KAWI => yield "Kawi";
	case script::NAG_MUNDARI => yield "Nagm";
	};
};

// Line break classification. Members have no explicit values;
// [[rune_line_break]] obtains them by casting the u8 line_break field of a
// rune's record, so member order is significant. [[line_break_code]] returns
// each member's name as a string.
// NOTE(review): the members look like Unicode Line_Break property short
// aliases, and the order presumably has to match the generated record data;
// confirm both before editing.
export type line_break = enum u8 {
	XX,
	AI,
	BK,
	CJ,
	CR,
	LF,
	NL,
	SA,
	SG,
	SP,
	OP,
	CL,
	CP,
	QU,
	GL,
	NS,
	EX,
	SY,
	IS,
	PR,
	PO,
	NU,
	AL,
	HL,
	ID,
	IN,
	HY,
	BA,
	BB,
	B2,
	ZW,
	CM,
	WJ,
	H2,
	H3,
	JL,
	JV,
	JT,
	RI,
	EB,
	EM,
	ZWJ,
	CB,
};

// Looks up the record for a rune and returns its line_break field as a
// [[line_break]] value.
export fn rune_line_break(rn: rune) line_break = {
	const rec = get_ucdrecord(rn);
	return rec.line_break: line_break;
};

// Returns the short code associated with a [[line_break]] value. The string is
// spelled exactly like the enum member, so every code is two characters long
// except that of line_break::ZWJ, which is the three-character "ZWJ". The
// returned strings are static.
export fn line_break_code(lb: line_break) const str = {
	return switch (lb) {
	case line_break::XX => yield "XX";
	case line_break::AI => yield "AI";
	case line_break::AL => yield "AL";
	case line_break::B2 => yield "B2";
	case line_break::BA => yield "BA";
	case line_break::BB => yield "BB";
	case line_break::BK => yield "BK";
	case line_break::CB => yield "CB";
	case line_break::CJ => yield "CJ";
	case line_break::CL => yield "CL";
	case line_break::CM => yield "CM";
	case line_break::CP => yield "CP";
	case line_break::CR => yield "CR";
	case line_break::EB => yield "EB";
	case line_break::EM => yield "EM";
	case line_break::EX => yield "EX";
	case line_break::GL => yield "GL";
	case line_break::H2 => yield "H2";
	case line_break::H3 => yield "H3";
	case line_break::HL => yield "HL";
	case line_break::HY => yield "HY";
	case line_break::ID => yield "ID";
	case line_break::IN => yield "IN";
	case line_break::IS => yield "IS";
	case line_break::JL => yield "JL";
	case line_break::JT => yield "JT";
	case line_break::JV => yield "JV";
	case line_break::LF => yield "LF";
	case line_break::NL => yield "NL";
	case line_break::NS => yield "NS";
	case line_break::NU => yield "NU";
	case line_break::OP => yield "OP";
	case line_break::PO => yield "PO";
	case line_break::PR => yield "PR";
	case line_break::QU => yield "QU";
	case line_break::RI => yield "RI";
	case line_break::SA => yield "SA";
	case line_break::SG => yield "SG";
	case line_break::SP => yield "SP";
	case line_break::SY => yield "SY";
	case line_break::WJ => yield "WJ";
	case line_break::ZW => yield "ZW";
	case line_break::ZWJ => yield "ZWJ";
	};
};