95 lines
2.7 KiB
Hare
95 lines
2.7 KiB
Hare
|
// A tuple of five u8 values: the same number and width of members as the
// fields of [[ucd_record]].
// NOTE(review): presumably the element type of the ucd_records table that
// get_ucdrecord reinterprets as *ucd_record; that table is defined outside
// this view, so confirm.
type ucd_encodedrec = (u8, u8, u8, u8, u8);
|
||
|
|
||
|
// Per-rune property record returned by get_ucdrecord. Every field is a single
// byte. The field order is significant: get_ucdrecord obtains this struct by
// casting a pointer into the ucd_records table, so the layout must not change.
type ucd_record = struct {
	// General category; rune_gc casts this byte to [[gc]].
	category: u8,
	// NOTE(review): presumably the canonical combining class; confirm.
	combining: u8,
	// NOTE(review): presumably an index identifying the bidi class; confirm.
	bidirectional: u8,
	// NOTE(review): presumably a 0/1 flag for the mirrored property; confirm.
	mirrored: u8,
	// NOTE(review): presumably an index identifying the East Asian width;
	// confirm.
	east_asian_width: u8,
};
|
||
|
|
||
|
// Looks up the [[ucd_record]] for a rune using a two-stage table. index1 is
// indexed by the code point shifted right by UCD_RECORD_SHIFT and yields a
// block number; index2 is indexed by that block number shifted left by
// UCD_RECORD_SHIFT plus the low UCD_RECORD_SHIFT bits of the code point, and
// yields the position of the record in ucd_records.
//
// Code points at or above 0x110000 skip the tables and resolve to record 0.
// The returned pointer refers to an entry of ucd_records and is not owned by
// the caller.
fn get_ucdrecord(rn: rune) *ucd_record = {
	const code = rn: u32;
	if (code >= 0x110000) {
		return &ucd_records[0]: *ucd_record;
	};

	// Low bits selecting the entry inside a block.
	const mask = (1u32 << UCD_RECORD_SHIFT) - 1;
	const block = index1[code >> UCD_RECORD_SHIFT];

	// Widen before shifting: the stage-one value is a narrow integer, and
	// shifting it in its own type could discard the high bits of the block
	// offset before the low bits of the code point are added.
	const slot = index2[((block: u32) << UCD_RECORD_SHIFT) + (code & mask)];
	return &ucd_records[slot]: *ucd_record;
};
|
||
|
|
||
|
// Unicode character General_Category attribute.
//
// Variants are ordered alphabetically by their two-letter code (shown in the
// trailing comments). The numeric values are spelled out because rune_gc
// produces a [[gc]] by casting a raw category byte, so they must not be
// renumbered or reordered.
export type gc = enum u8 {
	CONTROL = 0, // Cc
	FORMAT = 1, // Cf
	UNASSIGNED = 2, // Cn
	PRIVATE_USE = 3, // Co
	SURROGATE = 4, // Cs
	LOWERCASE_LETTER = 5, // Ll
	MODIFIER_LETTER = 6, // Lm
	OTHER_LETTER = 7, // Lo
	TITLECASE_LETTER = 8, // Lt
	UPPERCASE_LETTER = 9, // Lu
	SPACING_MARK = 10, // Mc
	ENCLOSING_MARK = 11, // Me
	NON_SPACING_MARK = 12, // Mn
	DECIMAL_NUMBER = 13, // Nd
	LETTER_NUMBER = 14, // Nl
	OTHER_NUMBER = 15, // No
	CONNECT_PUNCTUATION = 16, // Pc
	DASH_PUNCTUATION = 17, // Pd
	CLOSE_PUNCTUATION = 18, // Pe
	FINAL_PUNCTUATION = 19, // Pf
	INITIAL_PUNCTUATION = 20, // Pi
	OTHER_PUNCTUATION = 21, // Po
	OPEN_PUNCTUATION = 22, // Ps
	CURRENCY_SYMBOL = 23, // Sc
	MODIFIER_SYMBOL = 24, // Sk
	MATH_SYMBOL = 25, // Sm
	OTHER_SYMBOL = 26, // So
	LINE_SEPARATOR = 27, // Zl
	PARAGRAPH_SEPARATOR = 28, // Zp
	SPACE_SEPARATOR = 29, // Zs
};
|
||
|
|
||
|
// Returns the [[gc]] (General_Category) value recorded for this rune.
export fn rune_gc(rn: rune) gc = {
	const record = get_ucdrecord(rn);
	return record.category: gc;
};
|
||
|
|
||
|
// Returns the two-character General_Category abbreviation (for example "Lu"
// for [[gc::UPPERCASE_LETTER]]) for a [[gc]] value. The returned string is
// a constant and must not be freed.
export fn gc_code(v: gc) const str = {
	return switch (v) {
	case gc::CONTROL =>
		yield "Cc";
	case gc::FORMAT =>
		yield "Cf";
	case gc::UNASSIGNED =>
		yield "Cn";
	case gc::PRIVATE_USE =>
		yield "Co";
	case gc::SURROGATE =>
		yield "Cs";
	case gc::LOWERCASE_LETTER =>
		yield "Ll";
	case gc::MODIFIER_LETTER =>
		yield "Lm";
	case gc::OTHER_LETTER =>
		yield "Lo";
	case gc::TITLECASE_LETTER =>
		yield "Lt";
	case gc::UPPERCASE_LETTER =>
		yield "Lu";
	case gc::SPACING_MARK =>
		yield "Mc";
	case gc::ENCLOSING_MARK =>
		yield "Me";
	case gc::NON_SPACING_MARK =>
		yield "Mn";
	case gc::DECIMAL_NUMBER =>
		yield "Nd";
	case gc::LETTER_NUMBER =>
		yield "Nl";
	case gc::OTHER_NUMBER =>
		yield "No";
	case gc::CONNECT_PUNCTUATION =>
		yield "Pc";
	case gc::DASH_PUNCTUATION =>
		yield "Pd";
	case gc::CLOSE_PUNCTUATION =>
		yield "Pe";
	case gc::FINAL_PUNCTUATION =>
		yield "Pf";
	case gc::INITIAL_PUNCTUATION =>
		yield "Pi";
	case gc::OTHER_PUNCTUATION =>
		yield "Po";
	case gc::OPEN_PUNCTUATION =>
		yield "Ps";
	case gc::CURRENCY_SYMBOL =>
		yield "Sc";
	case gc::MODIFIER_SYMBOL =>
		yield "Sk";
	case gc::MATH_SYMBOL =>
		yield "Sm";
	case gc::OTHER_SYMBOL =>
		yield "So";
	case gc::LINE_SEPARATOR =>
		yield "Zl";
	case gc::PARAGRAPH_SEPARATOR =>
		yield "Zp";
	case gc::SPACE_SEPARATOR =>
		yield "Zs";
	};
};
|