From c56f5d40afd5a5303707febe1cfd346d89dae7ae Mon Sep 17 00:00:00 2001
From: Drew DeVault
Date: Tue, 16 Apr 2024 18:32:11 +0200
Subject: [PATCH] unicode::gc: use two-character identifiers

Signed-off-by: Drew DeVault
---
 unicode/ucd.ha | 156 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 96 insertions(+), 60 deletions(-)

diff --git a/unicode/ucd.ha b/unicode/ucd.ha
index 0f05fc5..a20743b 100644
--- a/unicode/ucd.ha
+++ b/unicode/ucd.ha
@@ -22,36 +22,36 @@ fn get_ucdrecord(rn: rune) *ucd_record = {
 
 // Unicode character General_Category attribute
 export type gc = enum u8 {
-	CONTROL, // Cc
-	FORMAT, // Cf
-	UNASSIGNED, // Cn
-	PRIVATE_USE, // Co
-	SURROGATE, // Cs
-	LOWERCASE_LETTER, // Ll
-	MODIFIER_LETTER, // Lm
-	OTHER_LETTER, // Lo
-	TITLECASE_LETTER, // Lt
-	UPPERCASE_LETTER, // Lu
-	SPACING_MARK, // Mc
-	ENCLOSING_MARK, // Me
-	NON_SPACING_MARK, // Mn
-	DECIMAL_NUMBER, // Nd
-	LETTER_NUMBER, // Nl
-	OTHER_NUMBER, // No
-	CONNECT_PUNCTUATION, // Pc
-	DASH_PUNCTUATION, // Pd
-	CLOSE_PUNCTUATION, // Pe
-	FINAL_PUNCTUATION, // Pf
-	INITIAL_PUNCTUATION, // Pi
-	OTHER_PUNCTUATION, // Po
-	OPEN_PUNCTUATION, // Ps
-	CURRENCY_SYMBOL, // Sc
-	MODIFIER_SYMBOL, // Sk
-	MATH_SYMBOL, // Sm
-	OTHER_SYMBOL, // So
-	LINE_SEPARATOR, // Zl
-	PARAGRAPH_SEPARATOR, // Zp
-	SPACE_SEPARATOR, // Zs
+	Cc, // Control
+	Cf, // Format
+	Cn, // Unassigned
+	Co, // Private use
+	Cs, // Surrogate
+	Ll, // Lowercase letter
+	Lm, // Modifier letter
+	Lo, // Other letter
+	Lt, // Titlecase letter
+	Lu, // Uppercase letter
+	Mc, // Spacing mark
+	Me, // Enclosing mark
+	Mn, // Non-spacing mark
+	Nd, // Decimal number
+	Nl, // Letter number
+	No, // Other number
+	Pc, // Connect punctuation
+	Pd, // Dash punctuation
+	Pe, // Close punctuation
+	Pf, // Final punctuation
+	Pi, // Initial punctuation
+	Po, // Other punctuation
+	Ps, // Open punctuation
+	Sc, // Currency symbol
+	Sk, // Modifier symbol
+	Sm, // Math symbol
+	So, // Other symbol
+	Zl, // Line separator
+	Zp, // Paragraph separator
+	Zs, // Space separator
 };
 
 // Returns the [[general_category]] corresponding to this rune.
@@ -59,39 +59,75 @@ export fn rune_gc(rn: rune) gc = {
 	return get_ucdrecord(rn).category: gc;
 };
 
+// Returns the name associated with a [[gc]] value.
+export fn gc_name(v: gc) const str = {
+	switch (v) {
+	case gc::Cc => return "Control";
+	case gc::Cf => return "Format";
+	case gc::Cn => return "Unassigned";
+	case gc::Co => return "Private use";
+	case gc::Cs => return "Surrogate";
+	case gc::Ll => return "Lowercase letter";
+	case gc::Lm => return "Modifier letter";
+	case gc::Lo => return "Other letter";
+	case gc::Lt => return "Titlecase letter";
+	case gc::Lu => return "Uppercase letter";
+	case gc::Mc => return "Spacing mark";
+	case gc::Me => return "Enclosing mark";
+	case gc::Mn => return "Non-spacing mark";
+	case gc::Nd => return "Decimal number";
+	case gc::Nl => return "Letter number";
+	case gc::No => return "Other number";
+	case gc::Pc => return "Connect punctuation";
+	case gc::Pd => return "Dash punctuation";
+	case gc::Pe => return "Close punctuation";
+	case gc::Pf => return "Final punctuation";
+	case gc::Pi => return "Initial punctuation";
+	case gc::Po => return "Other punctuation";
+	case gc::Ps => return "Open punctuation";
+	case gc::Sc => return "Currency symbol";
+	case gc::Sk => return "Modifier symbol";
+	case gc::Sm => return "Math symbol";
+	case gc::So => return "Other symbol";
+	case gc::Zl => return "Line separator";
+	case gc::Zp => return "Paragraph separator";
+	case gc::Zs => return "Space separator";
+	};
+};
+
 // Returns the two-character code associated with a [[gc]] value.
 export fn gc_code(v: gc) const str = {
 	switch (v) {
-	case gc::CONTROL => return "Cc";
-	case gc::FORMAT => return "Cf";
-	case gc::UNASSIGNED => return "Cn";
-	case gc::PRIVATE_USE => return "Co";
-	case gc::SURROGATE => return "Cs";
-	case gc::LOWERCASE_LETTER => return "Ll";
-	case gc::MODIFIER_LETTER => return "Lm";
-	case gc::OTHER_LETTER => return "Lo";
-	case gc::TITLECASE_LETTER => return "Lt";
-	case gc::UPPERCASE_LETTER => return "Lu";
-	case gc::SPACING_MARK => return "Mc";
-	case gc::ENCLOSING_MARK => return "Me";
-	case gc::NON_SPACING_MARK => return "Mn";
-	case gc::DECIMAL_NUMBER => return "Nd";
-	case gc::LETTER_NUMBER => return "Nl";
-	case gc::OTHER_NUMBER => return "No";
-	case gc::CONNECT_PUNCTUATION => return "Pc";
-	case gc::DASH_PUNCTUATION => return "Pd";
-	case gc::CLOSE_PUNCTUATION => return "Pe";
-	case gc::FINAL_PUNCTUATION => return "Pf";
-	case gc::INITIAL_PUNCTUATION => return "Pi";
-	case gc::OTHER_PUNCTUATION => return "Po";
-	case gc::OPEN_PUNCTUATION => return "Ps";
-	case gc::CURRENCY_SYMBOL => return "Sc";
-	case gc::MODIFIER_SYMBOL => return "Sk";
-	case gc::MATH_SYMBOL => return "Sm";
-	case gc::OTHER_SYMBOL => return "So";
-	case gc::LINE_SEPARATOR => return "Zl";
-	case gc::PARAGRAPH_SEPARATOR => return "Zp";
-	case gc::SPACE_SEPARATOR => return "Zs";
+	case gc::Cc => return "Cc";
+	case gc::Cf => return "Cf";
+	case gc::Cn => return "Cn";
+	case gc::Co => return "Co";
+	case gc::Cs => return "Cs";
+	case gc::Ll => return "Ll";
+	case gc::Lm => return "Lm";
+	case gc::Lo => return "Lo";
+	case gc::Lt => return "Lt";
+	case gc::Lu => return "Lu";
+	case gc::Mc => return "Mc";
+	case gc::Me => return "Me";
+	case gc::Mn => return "Mn";
+	case gc::Nd => return "Nd";
+	case gc::Nl => return "Nl";
+	case gc::No => return "No";
+	case gc::Pc => return "Pc";
+	case gc::Pd => return "Pd";
+	case gc::Pe => return "Pe";
+	case gc::Pf => return "Pf";
+	case gc::Pi => return "Pi";
+	case gc::Po => return "Po";
+	case gc::Ps => return "Ps";
+	case gc::Sc => return "Sc";
+	case gc::Sk => return "Sk";
+	case gc::Sm => return "Sm";
+	case gc::So => return "So";
+	case gc::Zl => return "Zl";
+	case gc::Zp => return "Zp";
+	case gc::Zs => return "Zs";
 	};
 };
 