Add 'vendor/hare-unicode/' from commit '1488c26f46f7f8568235eaee6224983ac46e78ff'

git-subtree-dir: vendor/hare-unicode
git-subtree-mainline: 57979aa6fc
git-subtree-split: 1488c26f46
This commit is contained in:
Lobo Torres 2024-12-04 13:29:21 -03:00
commit c70ec9f648
9 changed files with 6329 additions and 0 deletions

2
vendor/hare-unicode/.gitignore vendored Normal file
View file

@@ -0,0 +1,2 @@
*.zip
/.data

367
vendor/hare-unicode/COPYING vendored Normal file
View file

@@ -0,0 +1,367 @@
Mozilla Public License Version 2.0
==================================
1. Definitions
--------------
1.1. "Contributor"
means each individual or legal entity that creates, contributes to
the creation of, or owns Covered Software.
1.2. "Contributor Version"
means the combination of the Contributions of others (if any) used
by a Contributor and that particular Contributor's Contribution.
1.3. "Contribution"
means Covered Software of a particular Contributor.
1.4. "Covered Software"
means Source Code Form to which the initial Contributor has attached
the notice in Exhibit A, the Executable Form of such Source Code
Form, and Modifications of such Source Code Form, in each case
including portions thereof.
1.5. "Incompatible With Secondary Licenses"
means
(a) that the initial Contributor has attached the notice described
in Exhibit B to the Covered Software; or
(b) that the Covered Software was made available under the terms of
version 1.1 or earlier of the License, but not also under the
terms of a Secondary License.
1.6. "Executable Form"
means any form of the work other than Source Code Form.
1.7. "Larger Work"
means a work that combines Covered Software with other material, in
a separate file or files, that is not Covered Software.
1.8. "License"
means this document.
1.9. "Licensable"
means having the right to grant, to the maximum extent possible,
whether at the time of the initial grant or subsequently, any and
all of the rights conveyed by this License.
1.10. "Modifications"
means any of the following:
(a) any file in Source Code Form that results from an addition to,
deletion from, or modification of the contents of Covered
Software; or
(b) any new file in Source Code Form that contains any Covered
Software.
1.11. "Patent Claims" of a Contributor
means any patent claim(s), including without limitation, method,
process, and apparatus claims, in any patent Licensable by such
Contributor that would be infringed, but for the grant of the
License, by the making, using, selling, offering for sale, having
made, import, or transfer of either its Contributions or its
Contributor Version.
1.12. "Secondary License"
means either the GNU General Public License, Version 2.0, the GNU
Lesser General Public License, Version 2.1, the GNU Affero General
Public License, Version 3.0, or any later versions of those
licenses.
1.13. "Source Code Form"
means the form of the work preferred for making modifications.
1.14. "You" (or "Your")
means an individual or a legal entity exercising rights under this
License. For legal entities, "You" includes any entity that
controls, is controlled by, or is under common control with You. For
purposes of this definition, "control" means (a) the power, direct
or indirect, to cause the direction or management of such entity,
whether by contract or otherwise, or (b) ownership of more than
fifty percent (50%) of the outstanding shares or beneficial
ownership of such entity.
2. License Grants and Conditions
--------------------------------
2.1. Grants
Each Contributor hereby grants You a world-wide, royalty-free,
non-exclusive license:
(a) under intellectual property rights (other than patent or trademark)
Licensable by such Contributor to use, reproduce, make available,
modify, display, perform, distribute, and otherwise exploit its
Contributions, either on an unmodified basis, with Modifications, or
as part of a Larger Work; and
(b) under Patent Claims of such Contributor to make, use, sell, offer
for sale, have made, import, and otherwise transfer either its
Contributions or its Contributor Version.
2.2. Effective Date
The licenses granted in Section 2.1 with respect to any Contribution
become effective for each Contribution on the date the Contributor first
distributes such Contribution.
2.3. Limitations on Grant Scope
The licenses granted in this Section 2 are the only rights granted under
this License. No additional rights or licenses will be implied from the
distribution or licensing of Covered Software under this License.
Notwithstanding Section 2.1(b) above, no patent license is granted by a
Contributor:
(a) for any code that a Contributor has removed from Covered Software;
or
(b) for infringements caused by: (i) Your and any other third party's
modifications of Covered Software, or (ii) the combination of its
Contributions with other software (except as part of its Contributor
Version); or
(c) under Patent Claims infringed by Covered Software in the absence of
its Contributions.
This License does not grant any rights in the trademarks, service marks,
or logos of any Contributor (except as may be necessary to comply with
the notice requirements in Section 3.4).
2.4. Subsequent Licenses
No Contributor makes additional grants as a result of Your choice to
distribute the Covered Software under a subsequent version of this
License (see Section 10.2) or under the terms of a Secondary License (if
permitted under the terms of Section 3.3).
2.5. Representation
Each Contributor represents that the Contributor believes its
Contributions are its original creation(s) or it has sufficient rights
to grant the rights to its Contributions conveyed by this License.
2.6. Fair Use
This License is not intended to limit any rights You have under
applicable copyright doctrines of fair use, fair dealing, or other
equivalents.
2.7. Conditions
Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
in Section 2.1.
3. Responsibilities
-------------------
3.1. Distribution of Source Form
All distribution of Covered Software in Source Code Form, including any
Modifications that You create or to which You contribute, must be under
the terms of this License. You must inform recipients that the Source
Code Form of the Covered Software is governed by the terms of this
License, and how they can obtain a copy of this License. You may not
attempt to alter or restrict the recipients' rights in the Source Code
Form.
3.2. Distribution of Executable Form
If You distribute Covered Software in Executable Form then:
(a) such Covered Software must also be made available in Source Code
Form, as described in Section 3.1, and You must inform recipients of
the Executable Form how they can obtain a copy of such Source Code
Form by reasonable means in a timely manner, at a charge no more
than the cost of distribution to the recipient; and
(b) You may distribute such Executable Form under the terms of this
License, or sublicense it under different terms, provided that the
license for the Executable Form does not attempt to limit or alter
the recipients' rights in the Source Code Form under this License.
3.3. Distribution of a Larger Work
You may create and distribute a Larger Work under terms of Your choice,
provided that You also comply with the requirements of this License for
the Covered Software. If the Larger Work is a combination of Covered
Software with a work governed by one or more Secondary Licenses, and the
Covered Software is not Incompatible With Secondary Licenses, this
License permits You to additionally distribute such Covered Software
under the terms of such Secondary License(s), so that the recipient of
the Larger Work may, at their option, further distribute the Covered
Software under the terms of either this License or such Secondary
License(s).
3.4. Notices
You may not remove or alter the substance of any license notices
(including copyright notices, patent notices, disclaimers of warranty,
or limitations of liability) contained within the Source Code Form of
the Covered Software, except that You may alter any license notices to
the extent required to remedy known factual inaccuracies.
3.5. Application of Additional Terms
You may choose to offer, and to charge a fee for, warranty, support,
indemnity or liability obligations to one or more recipients of Covered
Software. However, You may do so only on Your own behalf, and not on
behalf of any Contributor. You must make it absolutely clear that any
such warranty, support, indemnity, or liability obligation is offered by
You alone, and You hereby agree to indemnify every Contributor for any
liability incurred by such Contributor as a result of warranty, support,
indemnity or liability terms You offer. You may include additional
disclaimers of warranty and limitations of liability specific to any
jurisdiction.
4. Inability to Comply Due to Statute or Regulation
---------------------------------------------------
If it is impossible for You to comply with any of the terms of this
License with respect to some or all of the Covered Software due to
statute, judicial order, or regulation then You must: (a) comply with
the terms of this License to the maximum extent possible; and (b)
describe the limitations and the code they affect. Such description must
be placed in a text file included with all distributions of the Covered
Software under this License. Except to the extent prohibited by statute
or regulation, such description must be sufficiently detailed for a
recipient of ordinary skill to be able to understand it.
5. Termination
--------------
5.1. The rights granted under this License will terminate automatically
if You fail to comply with any of its terms. However, if You become
compliant, then the rights granted under this License from a particular
Contributor are reinstated (a) provisionally, unless and until such
Contributor explicitly and finally terminates Your grants, and (b) on an
ongoing basis, if such Contributor fails to notify You of the
non-compliance by some reasonable means prior to 60 days after You have
come back into compliance. Moreover, Your grants from a particular
Contributor are reinstated on an ongoing basis if such Contributor
notifies You of the non-compliance by some reasonable means, this is the
first time You have received notice of non-compliance with this License
from such Contributor, and You become compliant prior to 30 days after
Your receipt of the notice.
5.2. If You initiate litigation against any entity by asserting a patent
infringement claim (excluding declaratory judgment actions,
counter-claims, and cross-claims) alleging that a Contributor Version
directly or indirectly infringes any patent, then the rights granted to
You by any and all Contributors for the Covered Software under Section
2.1 of this License shall terminate.
5.3. In the event of termination under Sections 5.1 or 5.2 above, all
end user license agreements (excluding distributors and resellers) which
have been validly granted by You or Your distributors under this License
prior to termination shall survive termination.
************************************************************************
* *
* 6. Disclaimer of Warranty *
* ------------------------- *
* *
* Covered Software is provided under this License on an "as is" *
* basis, without warranty of any kind, either expressed, implied, or *
* statutory, including, without limitation, warranties that the *
* Covered Software is free of defects, merchantable, fit for a *
* particular purpose or non-infringing. The entire risk as to the *
* quality and performance of the Covered Software is with You. *
* Should any Covered Software prove defective in any respect, You *
* (not any Contributor) assume the cost of any necessary servicing, *
* repair, or correction. This disclaimer of warranty constitutes an *
* essential part of this License. No use of any Covered Software is *
* authorized under this License except under this disclaimer. *
* *
************************************************************************
************************************************************************
* *
* 7. Limitation of Liability *
* -------------------------- *
* *
* Under no circumstances and under no legal theory, whether tort *
* (including negligence), contract, or otherwise, shall any *
* Contributor, or anyone who distributes Covered Software as *
* permitted above, be liable to You for any direct, indirect, *
* special, incidental, or consequential damages of any character *
* including, without limitation, damages for lost profits, loss of *
* goodwill, work stoppage, computer failure or malfunction, or any *
* and all other commercial damages or losses, even if such party *
* shall have been informed of the possibility of such damages. This *
* limitation of liability shall not apply to liability for death or *
* personal injury resulting from such party's negligence to the *
* extent applicable law prohibits such limitation. Some *
* jurisdictions do not allow the exclusion or limitation of *
* incidental or consequential damages, so this exclusion and *
* limitation may not apply to You. *
* *
************************************************************************
8. Litigation
-------------
Any litigation relating to this License may be brought only in the
courts of a jurisdiction where the defendant maintains its principal
place of business and such litigation shall be governed by laws of that
jurisdiction, without reference to its conflict-of-law provisions.
Nothing in this Section shall prevent a party's ability to bring
cross-claims or counter-claims.
9. Miscellaneous
----------------
This License represents the complete agreement concerning the subject
matter hereof. If any provision of this License is held to be
unenforceable, such provision shall be reformed only to the extent
necessary to make it enforceable. Any law or regulation which provides
that the language of a contract shall be construed against the drafter
shall not be used to construe this License against a Contributor.
10. Versions of the License
---------------------------
10.1. New Versions
Mozilla Foundation is the license steward. Except as provided in Section
10.3, no one other than the license steward has the right to modify or
publish new versions of this License. Each version will be given a
distinguishing version number.
10.2. Effect of New Versions
You may distribute the Covered Software under the terms of the version
of the License under which You originally received the Covered Software,
or under the terms of any subsequent version published by the license
steward.
10.3. Modified Versions
If you create software not governed by this License, and you want to
create a new license for such software, you may create and use a
modified version of this License if you rename the license and remove
any references to the name of the license steward (except to note that
such modified license differs from this License).
10.4. Distributing Source Code Form that is Incompatible With Secondary
Licenses
If You choose to distribute Source Code Form that is Incompatible With
Secondary Licenses under the terms of this version of the License, the
notice described in Exhibit B of this License must be attached.
Exhibit A - Source Code Form License Notice
-------------------------------------------
This Source Code Form is subject to the terms of the Mozilla Public
License, v. 2.0. If a copy of the MPL was not distributed with this
file, You can obtain one at http://mozilla.org/MPL/2.0/.
If it is not possible or desirable to put the notice in a particular
file, then You may include the notice in a location (such as a LICENSE
file in a relevant directory) where a recipient would be likely to look
for such a notice.
You may add additional accurate notices of copyright ownership.

View file

@@ -0,0 +1,40 @@
use encoding::hex;
use fmt;
use os;
use strings;
use unicode;
// Demo: visualize the line-break opportunities in the string given as
// the first command-line argument. Prints a hex dump of the UTF-8
// bytes, the string itself, a marker line ('^' = optional break,
// '|' = mandatory break), and finally a listing of every break.
export fn main() void = {
	const input = os::args[1];
	const data = strings::toutf8(input);
	hex::dump(os::stdout, data)!;
	fmt::println(input)!;
	// First pass: pad with spaces up to each break position and draw a
	// marker there. NOTE(review): 'pos' appears to be an index into the
	// string while 'bpos' is the byte offset — confirm against
	// unicode::next_line_break's documented return values.
	let ix = 0u;
	const lb = unicode::new_line_breaker(input);
	for (const (pos, _, mand) => unicode::next_line_break(&lb)) {
		for (ix < pos; ix += 1) {
			fmt::print(' ')!;
		};
		ix += 1;
		if (mand) {
			// Mandatory break: terminate the marker line as well.
			fmt::println('|')!;
		} else {
			fmt::print('^')!;
		};
	};
	fmt::println()!;
	fmt::println()!;
	fmt::println("Line break opportunities:")!;
	// Second pass over a fresh line breaker (rebinding 'lb' shadows the
	// exhausted one): print each break's position, byte position, and
	// the character the break occurs before.
	const lb = unicode::new_line_breaker(input);
	for (const (pos, bpos, mand) => unicode::next_line_break(&lb)) {
		fmt::printfln("- {}:{} {} (before '{}'/0x{:x})", pos, bpos,
			if (mand) "(mandatory)" else "",
			strings::sub(input, pos, pos+1),
			data[bpos])!;
	};
};

25
vendor/hare-unicode/cmd/ucdtest/main.ha vendored Normal file
View file

@@ -0,0 +1,25 @@
use fmt;
use os;
use strings;
use unicode;
// Diagnostic tool: for each rune of the first command-line argument,
// print its code point plus the Unicode General_Category, Script, and
// Line_Break property codes looked up from the bundled UCD tables.
export fn main() void = {
	const in = os::args[1];
	const iter = strings::iter(in);
	for (true) {
		// Advance the rune iterator; stop at end of string.
		const rn = match (strings::next(&iter)) {
		case let rn: rune =>
			yield rn;
		case => break;
		};
		const gc = unicode::rune_gc(rn);
		const sc = unicode::rune_script(rn);
		const lb = unicode::rune_line_break(rn);
		// e.g. "'A'/41: Lu : Latin : AL" — exact code spellings depend
		// on the unicode module's *_code helpers.
		fmt::printfln("'{}'/{:x}: {} : {} : {}",
			rn, rn: u32,
			unicode::gc_code(gc),
			unicode::script_code(sc),
			unicode::line_break_code(lb))!;
	};
};

860
vendor/hare-unicode/scripts/gen-ucd.py vendored Executable file
View file

@@ -0,0 +1,860 @@
#!/usr/bin/python3
# Based on CPython's unicodedata generation script,
# Tools/unicode/makeunicodedata.py, forked and adapted for Hare
#
# PSF License
#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl expand first/last ranges
# 2001-01-19 fl added character name tables (2.1)
# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd use string methods
# 2002-10-18 mvl update to Unicode 3.2
# 2002-10-22 mvl generate NFC tables
# 2002-11-24 mvl expand all ranges, sort names version-independently
# 2002-11-25 mvl add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01 benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
import dataclasses
import os
import sys
import zipfile
from functools import partial
from textwrap import dedent
from typing import Iterator, List, Optional, Set, Tuple
SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "13.0.0"

# Filename templates for the UCD source files; the "%s" slot receives
# "-<version>" so a specific release can be fetched and cached locally.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"
SCRIPTS = "Scripts%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# No older UCD versions are processed by this fork of the generator.
old_versions = []

# Order must match ucd.ha
# The integer stored in each generated record is the list index, so
# reordering any of these lists breaks the Hare-side decoders.
CATEGORY_NAMES = [
    "Cc", "Cf", "Cn", "Co", "Cs", "Ll", "Lm", "Lo", "Lt", "Lu", "Mc", "Me",
    "Mn", "Nd", "Nl", "No", "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps", "Sc",
    "Sk", "Sm", "So", "Zl", "Zp", "Zs",
]
# Bidi_Class values; index 0 is a placeholder for "no value".
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]
# East_Asian_Width values.
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
# Line_Break classes that force a break (used to set the Line_Break
# binary property when parsing LineBreak.txt).
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
# All Line_Break classes; "XX" (unknown) first so it is the default.
LINE_BREAKS = [
    "XX", "AI", "BK", "CJ", "CR", "LF", "NL", "SA", "SG", "SP", "OP", "CL",
    "CP", "QU", "GL", "NS", "EX", "SY", "IS", "PR", "PO", "NU", "AL", "HL",
    "ID", "IN", "HY", "BA", "BB", "B2", "ZW", "CM", "WJ", "H2", "H3", "JL",
    "JV", "JT", "RI", "EB", "EM", "ZWJ", "CB",
]
# Script property values. Order must match ucd.ha: records store the
# index into this list (via SCRIPT_NAMES.index), so entries may only be
# appended, never reordered. "Unknown" is used for unassigned scripts.
SCRIPT_NAMES = [
    "Common",
    "Inherited",
    "Unknown",
    "Adlam",
    "Caucasian_Albanian",
    "Ahom",
    "Arabic",
    "Imperial_Aramaic",
    "Armenian",
    "Avestan",
    "Balinese",
    "Bamum",
    "Bassa_Vah",
    "Batak",
    "Bengali",
    "Bhaiksuki",
    "Bopomofo",
    "Brahmi",
    "Braille",
    "Buginese",
    "Buhid",
    "Chakma",
    "Canadian_Aboriginal",
    "Carian",
    "Cham",
    "Cherokee",
    "Chorasmian",
    "Coptic",
    "Cypro_Minoan",
    "Cypriot",
    "Cyrillic",
    "Devanagari",
    "Dives_Akuru",
    "Dogra",
    "Deseret",
    "Duployan",
    "Egyptian_Hieroglyphs",
    "Elbasan",
    "Elymaic",
    "Ethiopic",
    "Georgian",
    "Glagolitic",
    "Gunjala_Gondi",
    "Masaram_Gondi",
    "Gothic",
    "Grantha",
    "Greek",
    "Gujarati",
    "Gurmukhi",
    "Hangul",
    "Han",
    "Hanunoo",
    "Hatran",
    "Hebrew",
    "Hiragana",
    "Anatolian_Hieroglyphs",
    "Pahawh_Hmong",
    "Nyiakeng_Puachue_Hmong",
    "Old_Hungarian",
    "Old_Italic",
    "Javanese",
    "Kayah_Li",
    "Katakana",
    "Kawi",
    "Kharoshthi",
    "Khmer",
    "Khojki",
    "Khitan_Small_Script",
    "Kannada",
    "Kaithi",
    "Tai_Tham",
    "Lao",
    "Latin",
    "Lepcha",
    "Limbu",
    "Linear_A",
    "Linear_B",
    "Lisu",
    "Lycian",
    "Lydian",
    "Mahajani",
    "Makasar",
    "Mandaic",
    "Manichaean",
    "Marchen",
    "Medefaidrin",
    "Mende_Kikakui",
    "Meroitic_Cursive",
    "Meroitic_Hieroglyphs",
    "Malayalam",
    "Modi",
    "Mongolian",
    "Mro",
    "Meetei_Mayek",
    "Multani",
    "Myanmar",
    "Nag_Mundari",
    "Nandinagari",
    "Old_North_Arabian",
    "Nabataean",
    "Newa",
    "Nko",
    "Nushu",
    "Ogham",
    "Ol_Chiki",
    "Old_Turkic",
    "Oriya",
    "Osage",
    "Osmanya",
    "Old_Uyghur",
    "Palmyrene",
    "Pau_Cin_Hau",
    "Old_Permic",
    "Phags_Pa",
    "Inscriptional_Pahlavi",
    "Psalter_Pahlavi",
    "Phoenician",
    "Miao",
    "Inscriptional_Parthian",
    "Rejang",
    "Hanifi_Rohingya",
    "Runic",
    "Samaritan",
    "Old_South_Arabian",
    "Saurashtra",
    "SignWriting",
    "Shavian",
    "Sharada",
    "Siddham",
    "Khudawadi",
    "Sinhala",
    "Sogdian",
    "Old_Sogdian",
    "Sora_Sompeng",
    "Soyombo",
    "Sundanese",
    "Syloti_Nagri",
    "Syriac",
    "Tagbanwa",
    "Takri",
    "Tai_Le",
    "New_Tai_Lue",
    "Tamil",
    "Tangut",
    "Tai_Viet",
    "Telugu",
    "Tifinagh",
    "Tagalog",
    "Thaana",
    "Thai",
    "Tibetan",
    "Tirhuta",
    "Tangsa",
    "Toto",
    "Ugaritic",
    "Vai",
    "Vithkuqi",
    "Warang_Citi",
    "Wancho",
    "Old_Persian",
    "Cuneiform",
    "Yezidi",
    "Yi",
    "Zanabazar_Square",
]
# note: should match definitions in Objects/unicodectype.c
# NOTE(review): these masks are inherited from CPython's generator; the
# visible portion of this script does not reference them — confirm
# whether the Hare port still needs them before removing.
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
# CJK Unified Ideograph first..last code points (hex strings) that
# UnicodeData.txt expresses as <...First>/<...Last> range markers;
# UnicodeData.__init__ verifies the file agrees with this list.
cjk_ranges = [
    ('3400', '4DBF'),
    ('4E00', '9FFC'),
    ('20000', '2A6DD'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
    ('30000', '3134A'),
]
def maketables(trace=0):
    """Driver: parse the UCD for UNIDATA_VERSION and emit the Hare tables.

    `trace` is forwarded to the table-splitting machinery for debug output.
    """
    print("--- Reading", UNICODE_DATA % "", "...")
    ucd = UnicodeData(UNIDATA_VERSION)
    assigned = [record for record in ucd.table if record]
    print(len(assigned), "characters")
    makeunicodedata(ucd, trace)
# --------------------------------------------------------------------
# unicode character properties
def makeunicodedata(unicode, trace):
    # Generate unicode/ucd_gen.ha: a deduplicated array of per-codepoint
    # property records plus a two-level index mapping each codepoint to
    # its record (compressed by splitbins, defined elsewhere in this file).
    #
    # Record 0 is an all-zero dummy shared by every unassigned codepoint.
    dummy = (0, 0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    FILE = "unicode/ucd_gen.ha"
    print("--- Preparing", FILE, "...")
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            # Each property is encoded as its index into the matching
            # module-level name list; order there must match ucd.ha.
            category = CATEGORY_NAMES.index(record.general_category)
            combining = int(record.canonical_combining_class)
            bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
            mirrored = record.bidi_mirrored == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
            script = SCRIPT_NAMES.index(record.script or "Unknown")
            line_break = LINE_BREAKS.index(record.line_break)
            item = (
                category, combining, bidirectional,
                mirrored, eastasianwidth, script,
                line_break,
            )
            # add entry to index and item tables
            # (codepoints with identical property tuples share one record)
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i
    print(len(table), "unique properties")
    print("--- Writing", FILE, "...")
    with open(FILE, "w") as fp:
        # fprint writes one line of Hare source to the output file.
        fprint = partial(print, file=fp)
        fprint("// Generated by scripts/gen-ucd.py")
        fprint()
        fprint('// Unicode database version supported by this module')
        fprint('export def UNIDATA_VERSION: str = "%s";' % UNIDATA_VERSION)
        fprint('')
        fprint("// List of unique database records")
        fprint("const ucd_records: [_]ucd_encodedrec = [")
        for item in table:
            fprint(" (%d, %d, %d, %d, %d, %d, %d)," % item)
        fprint("];")
        fprint()
        # split record index table
        # index1/index2 form the two-level lookup; shift is the block
        # size exponent used by the Hare-side decoder (UCD_RECORD_SHIFT).
        index1, index2, shift = splitbins(index, trace)
        fprint("// index tables for the database records")
        fprint("def UCD_RECORD_SHIFT: size = %d;" % shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)
DATA_DIR = os.path.join('.data')

def open_data(template, version):
    """Open a local copy of a UCD data file, downloading it on first use.

    The file is cached under DATA_DIR. ``.txt`` files are returned as
    UTF-8 text handles; anything else (the Unihan zip) as binary.
    """
    suffix = '-' + version
    local = os.path.join(DATA_DIR, template % (suffix,))
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # 3.2.0 lives under an irregular path on unicode.org
            url = ('https://www.unicode.org/Public/3.2-Update/' + template) % (suffix,)
        else:
            url = ('https://www.unicode.org/Public/%s/ucd/' + template) % (version, '')
        os.makedirs(DATA_DIR, exist_ok=True)
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    # Unihan.zip
    return open(local, 'rb')
def expand_range(char_range: str) -> Iterator[int]:
    """Yield every code point denoted by a UAX #44 range expression.

    Accepts either a single hex code point ("0041") or an inclusive
    range ("0041..005A"), as described in
    https://www.unicode.org/reports/tr44/#Code_Point_Ranges
    """
    if '..' in char_range:
        lo_text, hi_text = char_range.split('..')
        lo, hi = int(lo_text, 16), int(hi_text, 16)
    else:
        lo = hi = int(char_range, 16)
    yield from range(lo, hi + 1)
class UcdFile:
    """A file in the standard semicolon-separated format of the UCD.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions
    (The Unihan data files use their own separate format and are not
    handled by this class.)
    """

    def __init__(self, template: str, version: str) -> None:
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        # Strip '#' comments and blank lines, then split on ';'.
        with open_data(self.template, self.version) as file:
            for raw in file:
                payload = raw.split('#', 1)[0].strip()
                if payload:
                    yield [field.strip() for field in payload.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        return self.records()

    def expanded(self) -> Iterator[Tuple[int, List[str]]]:
        # Like records(), but with the leading code-point range of each
        # record expanded to one (codepoint, rest-of-fields) pair each.
        for record in self.records():
            head, tail = record[0], record[1:]
            for char in expand_range(head):
                yield char, tail
@dataclasses.dataclass
class UcdRecord:
    # One parsed codepoint's worth of UCD data, merged from several files.
    # Field order matters: from_row() fills the first 15 positionally.

    # 15 fields from UnicodeData.txt . See:
    # https://www.unicode.org/reports/tr44/#UnicodeData.txt
    codepoint: str
    name: str
    general_category: str
    canonical_combining_class: str
    bidi_class: str
    decomposition_type: str
    decomposition_mapping: str
    numeric_type: str
    numeric_value: str
    bidi_mirrored: str
    unicode_1_name: str  # obsolete
    iso_comment: str  # obsolete
    simple_uppercase_mapping: str
    simple_lowercase_mapping: str
    simple_titlecase_mapping: str

    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
    east_asian_width: Optional[str]

    # Binary properties, as a set of those that are true.
    # Taken from multiple files:
    # https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
    # https://www.unicode.org/reports/tr44/#LineBreak.txt
    binary_properties: Set[str]

    # The Quick_Check properties related to normalization:
    # https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
    # We store them as a bitmask.
    quick_check: int

    # From Script.txt
    script: str

    # From LineBreak.txt
    line_break: str
def from_row(row: List[str]) -> UcdRecord:
    # Build a UcdRecord from the 15 UnicodeData.txt columns, filling the
    # supplementary fields with their "not yet parsed" defaults
    # (no width, no binary properties, quickcheck 0, Unknown script,
    # XX line-break class).
    defaults = (None, set(), 0, "Unknown", "XX")
    return UcdRecord(*row, *defaults)
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB
# load a unicode-data file from disk
class UnicodeData:
# table: List[Optional[UcdRecord]] # index is codepoint; None means unassigned
def __init__(self, version, cjk_check=True):
self.changed = []
table = [None] * 0x110000
for s in UcdFile(UNICODE_DATA, version):
char = int(s[0], 16)
table[char] = from_row(s)
cjk_ranges_found = []
# expand first-last ranges
field = None
for i in range(0, 0x110000):
# The file UnicodeData.txt has its own distinct way of
# expressing ranges. See:
# https://www.unicode.org/reports/tr44/#Code_Point_Ranges
s = table[i]
if s:
if s.name[-6:] == "First>":
s.name = ""
field = dataclasses.astuple(s)[:15]
elif s.name[-5:] == "Last>":
if s.name.startswith("<CJK Ideograph"):
cjk_ranges_found.append((field[0],
s.codepoint))
s.name = ""
field = None
elif field:
table[i] = from_row(('%X' % i,) + field[1:])
if cjk_check and cjk_ranges != cjk_ranges_found:
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
# public attributes
self.filename = UNICODE_DATA % ''
self.table = table
self.chars = list(range(0x110000)) # unicode 3.2
# check for name aliases and named sequences, see #12753
# aliases and named sequences are not in 3.2.0
if version != '3.2.0':
self.aliases = []
# store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
# in order to take advantage of the compression and lookup
# algorithms used for the other characters
pua_index = NAME_ALIASES_START
for char, name, abbrev in UcdFile(NAME_ALIASES, version):
char = int(char, 16)
self.aliases.append((name, char))
# also store the name in the PUA 1
self.table[pua_index].name = name
pua_index += 1
assert pua_index - NAME_ALIASES_START == len(self.aliases)
self.named_sequences = []
# store named sequences in the PUA 1, in range U+F0100..,
# in order to take advantage of the compression and lookup
# algorithms used for the other characters.
assert pua_index < NAMED_SEQUENCES_START
pua_index = NAMED_SEQUENCES_START
for name, chars in UcdFile(NAMED_SEQUENCES, version):
chars = tuple(int(char, 16) for char in chars.split())
# check that the structure defined in makeunicodename is OK
assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
"the NamedSequence struct and in unicodedata_lookup")
self.named_sequences.append((name, chars))
# also store these in the PUA 1
self.table[pua_index].name = name
pua_index += 1
assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
self.exclusions = {}
for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
char = int(char, 16)
self.exclusions[char] = 1
widths = [None] * 0x110000
for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
widths[char] = width
for i in range(0, 0x110000):
if table[i] is not None:
table[i].east_asian_width = widths[i]
scripts = [None] * 0x110000
for char, (script,) in UcdFile(SCRIPTS, version).expanded():
scripts[char] = script
for i in range(0, 0x110000):
if table[i] is not None:
table[i].script = scripts[i]
for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
if table[char]:
# Some properties (e.g. Default_Ignorable_Code_Point)
# apply to unassigned code points; ignore them
table[char].binary_properties.add(p)
for char_range, value in UcdFile(LINE_BREAK, version):
for char in expand_range(char_range):
if not table[char]:
continue
if value in MANDATORY_LINE_BREAKS:
table[char].binary_properties.add('Line_Break')
table[char].line_break = value
# We only want the quickcheck properties
# Format: NF?_QC; Y(es)/N(o)/M(aybe)
# Yes is the default, hence only N and M occur
# In 3.2.0, the format was different (NF?_NO)
# The parsing will incorrectly determine these as
# "yes", however, unicodedata.c will not perform quickchecks
# for older versions, and no delta records will be created.
quickchecks = [0] * 0x110000
qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
if len(s) < 2 or s[1] not in qc_order:
continue
quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
quickcheck_shift = qc_order.index(s[1])*2
quickcheck <<= quickcheck_shift
for char in expand_range(s[0]):
assert not (quickchecks[char]>>quickcheck_shift)&3
quickchecks[char] |= quickcheck
for i in range(0, 0x110000):
if table[i] is not None:
table[i].quick_check = quickchecks[i]
with open_data(UNIHAN, version) as file:
zip = zipfile.ZipFile(file)
if version == '3.2.0':
data = zip.open('Unihan-3.2.0.txt').read()
else:
data = zip.open('Unihan_NumericValues.txt').read()
for line in data.decode("utf-8").splitlines():
if not line.startswith('U+'):
continue
code, tag, value = line.split(None, 3)[:3]
if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
'kOtherNumeric'):
continue
value = value.strip().replace(',', '')
i = int(code[2:], 16)
# Patch the numeric field
if table[i] is not None:
table[i].numeric_value = value
sc = self.special_casing = {}
for data in UcdFile(SPECIAL_CASING, version):
if data[4]:
# We ignore all conditionals (since they depend on
# languages) except for one, which is hardcoded. See
# handle_capital_sigma in unicodeobject.c.
continue
c = int(data[0], 16)
lower = [int(char, 16) for char in data[1].split()]
title = [int(char, 16) for char in data[2].split()]
upper = [int(char, 16) for char in data[3].split()]
sc[c] = (lower, title, upper)
cf = self.case_folding = {}
if version != '3.2.0':
for data in UcdFile(CASE_FOLDING, version):
if data[1] in "CF":
c = int(data[0], 16)
cf[c] = [int(char, 16) for char in data[2].split()]
def uselatin1(self):
    # Restrict the working character range to ISO Latin-1 (U+0000..U+00FF)
    # so that only the first 256 code points are emitted.
    self.chars = [cp for cp in range(0x100)]
# hash table tools
# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.
def myhash(s, magic):
    # Case-insensitive string hash used to build the static lookup tables.
    # Each character multiplies the accumulator by `magic`; any bits that
    # spill above bit 23 are folded back in so the result stays in 24 bits.
    h = 0
    for ch in s.upper():
        h = h * magic + ord(ch)
        overflow = h & 0xff000000
        if overflow:
            h = (h ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return h
# Candidate hash-table configurations as (size, poly) pairs: `size` is a
# power-of-two slot count and `poly` selects a polynomial used by Hash to
# cycle the collision-probe increment over the whole table.
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
class Hash:
    # Builds a static open-addressing hash table from (key, value) pairs,
    # recording the parameters (magic, size, poly) a runtime probe needs to
    # reproduce the same lookups.

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure
        # determine table size: smallest configured power of two that still
        # leaves at least one free slot (size > len(data))
        for size, poly in SIZES:
            if size > len(data):
                # the emitted poly constant is offset by the table size
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")
        print(size, "slots in hash table")
        # None marks an empty slot during construction
        table = [None] * size
        mask = size-1
        n = 0
        hash = myhash
        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # collision: derive a secondary probe increment from the hash;
            # shifting and XOR-ing with `poly` keeps the probe sequence
            # cycling over the whole table
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly
        print(n, "collisions")
        self.collisions = n
        # empty slots are emitted as 0
        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0
        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write the table plus its probe parameters to file
        # NOTE(review): these are C-style #define lines while Array.dump
        # emits Hare syntax — confirm this path is unused in the Hare port.
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
# stuff to deal with arrays of unsigned integers
class Array:
    # A named array of unsigned integers that can be written out as a Hare
    # constant declaration.

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # Emit the array as a Hare `const` declaration, picking the
        # narrowest unsigned element type that holds every value.
        size = getsize(self.data)
        if trace:
            print(self.name + ":", size * len(self.data), "bytes", file=sys.stderr)
        file.write("const " + self.name + ": [_]")
        element_type = {1: "u8", 2: "u16"}.get(size, "u32")
        file.write(element_type)
        file.write(" = [\n")
        if self.data:
            # Accumulate "item, " pieces and wrap lines at 78 columns.
            line = "    "
            for item in self.data:
                piece = str(item) + ", "
                if len(line) + len(piece) > 78:
                    file.write(line.rstrip() + "\n")
                    line = "    " + piece
                else:
                    line = line + piece
            if line.strip():
                file.write(line.rstrip() + "\n")
        file.write("];\n\n")
def getsize(data):
    # Return the smallest unsigned integer width in bytes (1, 2, or 4)
    # that can represent every value in `data`.
    largest = max(data)
    if largest < 0x100:
        return 1
    if largest < 0x10000:
        return 2
    return 4
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), "bytes",
              file=sys.stderr)
    n = len(t)-1  # last valid index
    maxshift = 0  # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)  # so slices can be dict keys
    # Try every bin size from 1 to 2**maxshift and keep the cheapest split.
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}  # maps a bin (tuple slice) to its offset in t2
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                # first occurrence of this bin: append it to t2
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        # strict < keeps the smallest shift on ties
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)  # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
if __name__ == "__main__":
    # Regenerate the Unicode tables; the argument is presumably the trace
    # level (progress output to stderr) — confirm against maketables above.
    maketables(1)

196
vendor/hare-unicode/unicode/linebreak.ha vendored Normal file
View file

@ -0,0 +1,196 @@
use encoding::utf8;
use strings;
// State machine for enumerating line break opportunities in a string.
// Create with [[new_line_breaker]] and drive with [[next_line_break]].
export type line_breaker = struct {
	// The input string being segmented
	input: str,
	// Rune iterator over input
	iter: strings::iterator,
	// Current position, in runes
	pos: size,
	// Current position, bytes
	bpos: size,
	// Current line break class
	cur: line_break,
	// Next line break class
	next: line_break,
	// Rule LB8a state: previous rune was a zero-width joiner
	lb8a: bool,
	// Rule LB21a state: class before a HY/BA was HL
	lb21a: bool,
	// Rule LB30a state: run length of regional indicators
	lb30a: uint,
};
// Creates a new line breaking algorithm state machine. See [[next_line_break]]
// to enumerate the line break opportunities in the input string.
export fn new_line_breaker(input: str) line_breaker = {
	// All other fields start at their zero values; [[next_line_break]]
	// initializes the iterator and classes on its first call.
	const lb = line_breaker {
		input = input,
		...
	};
	return lb;
};
// Returns the next line break opportunity as a tuple of the rune-wise index,
// byte-wise index, and a boolean indicating whether or not the break is
// mandatory at this location. The line break opportunity directly precedes the
// index returned from this function.
//
//	Hello world!
//	      ^ Line break opportunity at index 6
export fn next_line_break(lb: *line_breaker) ((size, size, bool) | done) = {
	if (lb.pos == 0) {
		if (len(lb.input) == 0) {
			return done; // special case
		};
		// First call: prime the state machine with the resolved
		// class of the first rune (rules LB1/LB2). The original code
		// declared a `const` binding here and then reassigned it,
		// which does not compile; resolve through tuple access
		// instead.
		lb.iter = strings::iter(lb.input);
		const first = next_lb1_class(lb) as (line_break, rune);
		const class = resolve_lb2_class(first.0);
		lb.cur = class;
		lb.next = class;
		lb.lb8a = class == line_break::ZWJ;
	};
	for (const (next, rn) => next_lb1_class(lb)) {
		const prev = lb.next;
		lb.next = next;
		const rnsz = utf8::runesz(rn);
		// Advance the rune/byte cursors however this iteration exits.
		defer {
			lb.pos += 1;
			lb.bpos += rnsz;
		};
		// Rules LB4/LB5: always break after BK, and after CR except
		// within a CR LF pair.
		const mandatory = lb.cur == line_break::BK
			|| (lb.cur == line_break::CR
				&& lb.next != line_break::LF);
		if (mandatory) {
			lb.cur = resolve_lb2_class(next);
			return (lb.pos + 1, lb.bpos + rnsz, true);
		};
		lb.lb8a = next == line_break::ZWJ;
		// Try the simple rules first; fall back to the pair table.
		let can_break = lb_simple_case(lb);
		match (can_break) {
		case bool => void;
		case void =>
			can_break = lb_complex_case(lb, prev);
		};
		assert(can_break is bool);
		const can_break = can_break as bool;
		if (can_break) {
			return (lb.pos + 1, lb.bpos + rnsz, false);
		};
	};
	return done;
};
// Applies LB1 suggested rules for resolving context-dependent classes.
fn next_lb1_class(lb: *line_breaker) ((line_break, rune) | done) = {
	const rn = match (strings::next(&lb.iter)) {
	case let r: rune =>
		yield r;
	case done =>
		return done;
	};
	let class = rune_line_break(rn);
	switch (class) {
	case line_break::AI, line_break::SG, line_break::XX =>
		// Ambiguous, surrogate, and unknown runes resolve to AL.
		class = line_break::AL;
	case line_break::SA =>
		// Complex-context (South East Asian) runes resolve by
		// general category.
		switch (rune_gc(rn)) {
		case gc::Mn, gc::Mc =>
			class = line_break::CM;
		case =>
			class = line_break::AL;
		};
	case line_break::CJ =>
		class = line_break::NS;
	case => void;
	};
	return (class, rn);
};
// Applies LB2 suggested rules for resolving the start-of-text line-break class.
fn resolve_lb2_class(lb: line_break) line_break = {
	// Hard break classes collapse to BK; a space maps to WJ.
	if (lb == line_break::LF || lb == line_break::NL) {
		return line_break::BK;
	};
	if (lb == line_break::SP) {
		return line_break::WJ;
	};
	return lb;
};
// If this is a simple case, return whether or not this is a break opportunity
// as a boolean. Returns void for special cases.
fn lb_simple_case(lb: *line_breaker) (bool | void) = {
	const class = lb.next;
	// Never break before a space or before a hard break; the hard break
	// classes also update lb.cur so that the mandatory-break check in
	// [[next_line_break]] fires on the following rune.
	if (class == line_break::SP) {
		return false;
	};
	if (class == line_break::BK || class == line_break::LF
			|| class == line_break::NL) {
		lb.cur = line_break::BK;
		return false;
	};
	if (class == line_break::CR) {
		lb.cur = line_break::CR;
		return false;
	};
	return;
};
// Handles more complex rules, including pair table lookups via
// linebreak_table.ha.
fn lb_complex_case(lb: *line_breaker, prev: line_break) bool = {
	let can_break = false;
	// The pair table is indexed by class relative to OP, the first class
	// it covers; classes outside the table fall through with
	// can_break = false.
	const ucur = lb.cur: uint - line_break::OP: uint;
	const unext = lb.next: uint - line_break::OP: uint;
	if (ucur < len(lb_pairs) && unext < len(lb_pairs[0])) {
		switch (lb_pairs[ucur][unext]) {
		case bo::DI => // Direct break
			can_break = true;
		case bo::IN => // Indirect break opportunity
			can_break = prev == line_break::SP;
		case bo::CI => // Indirect opportunity for combining marks
			can_break = prev == line_break::SP;
			if (!can_break) {
				// NOTE: early return skips the LB8a/LB21a/
				// LB30a bookkeeping and the lb.cur update below
				return false;
			};
		case bo::CP => // Prohibited for combining marks
			if (prev != line_break::SP) {
				// Same early return as above
				return false;
			};
		case bo::PR => void;
		};
	};
	// Rule LB8a: no break after a zero-width joiner.
	if (lb.lb8a) {
		can_break = false;
	};
	// Rule LB21a: no break after HL followed by HY or BA.
	if (lb.lb21a && (lb.cur == line_break::HY || lb.cur == line_break::BA)) {
		can_break = false;
		lb.lb21a = false;
	} else {
		lb.lb21a = lb.cur == line_break::HL;
	};
	// Rule LB30a: allow a break between regional-indicator pairs only
	// after a complete pair (run counter resets at 2).
	if (lb.cur == line_break::RI) {
		lb.lb30a += 1;
		if (lb.lb30a == 2 && lb.next == line_break::RI) {
			can_break = true;
			lb.lb30a = 0;
		};
	} else {
		lb.lb30a = 0;
	};
	lb.cur = lb.next;
	return can_break;
};

View file

@ -0,0 +1,63 @@
// Break opportunity between a pair of line break classes; the cell type of
// the lb_pairs pair table.
type bo = enum {
	// Direct opportunity
	DI,
	// Indirect opportunity (break only when preceded by a space)
	IN,
	// Indirect opportunity for combining marks
	CI,
	// Prohibited break for combining marks
	CP,
	// Prohibited break
	PR,
};
// Based on JavaScript implementation here:
//
// https://github.com/foliojs/linebreak/blob/master/src/pairs.js
//
// This is itself based on the example pair table from Unicode, which was last
// published in revision 37 of the line break algorithm, and has since been
// touched up by the JavaScript maintainers to incorporate later changes to the
// algorithm.
//
// - ZWJ special processing for LB8a of Revision 41
// - CB manually added as per Rule LB20
// - CL, CP, NS, SY, IS, PR, PO, HY, BA, B2 and RI manually adjusted as per LB22 of Revision 45
//
// Rows are the current class and columns the next class, both offset by
// line_break::OP (see lb_complex_case); the enum order of line_break must
// therefore match the order of the header comment below.
const lb_pairs = [
//	 OP     , CL     , CP     , QU     , GL     , NS     , EX     , SY     , IS     , PR     , PO     , NU     , AL     , HL     , ID     , IN     , HY     , BA     , BB     , B2     , ZW     , CM     , WJ     , H2     , H3     , JL     , JV     , JT     , RI     , EB     , EM     , ZWJ    , CB
	[bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::CP, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR, bo::PR], // OP
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::PR, bo::IN, bo::IN, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // CL
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // CP
	[bo::PR, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::PR, bo::CI, bo::PR, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN], // QU
	[bo::IN, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::PR, bo::CI, bo::PR, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN], // GL
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // NS
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // EX
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::DI, bo::IN, bo::DI, bo::IN, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // SY
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // IS
	[bo::IN, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI], // PR
	[bo::IN, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // PO
	[bo::IN, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // NU
	[bo::IN, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // AL
	[bo::IN, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // HL
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::IN, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // ID
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // IN
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::DI, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::DI, bo::IN, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // HY
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::DI, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // BA
	[bo::IN, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::PR, bo::CI, bo::PR, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::DI], // BB
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::PR, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // B2
	[bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI], // ZW
	[bo::IN, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // CM
	[bo::IN, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::PR, bo::CI, bo::PR, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN], // WJ
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::IN, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // H2
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::IN, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // H3
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::IN, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::IN, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // JL
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::IN, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // JV
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::IN, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // JT
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI, bo::DI, bo::IN, bo::DI], // RI
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::IN, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::DI], // EB
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::DI, bo::IN, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // EM
	[bo::IN, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::PR, bo::PR, bo::PR, bo::IN, bo::IN, bo::IN, bo::IN, bo::IN, bo::DI, bo::IN, bo::IN, bo::IN, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI], // ZWJ
	[bo::DI, bo::PR, bo::PR, bo::IN, bo::IN, bo::DI, bo::PR, bo::PR, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::PR, bo::CI, bo::PR, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::DI, bo::IN, bo::DI] // CB
];

654
vendor/hare-unicode/unicode/ucd.ha vendored Normal file
View file

@ -0,0 +1,654 @@
// Packed per-codepoint record as stored in the generated ucd_records table:
// (category, combining, bidirectional, mirrored, east_asian_width, script,
// line_break).
type ucd_encodedrec = (u8, u8, u8, u8, u8, u16, u8);

// Field-wise view of [[ucd_encodedrec]]. get_ucdrecord casts a pointer to
// the tuple to this struct, so the field order and widths here must match
// the tuple exactly.
type ucd_record = struct {
	category: u8,
	combining: u8,
	bidirectional: u8,
	mirrored: u8,
	east_asian_width: u8,
	script: u16,
	line_break: u8,
};
// Looks up the UCD record for a rune through a two-level trie: index1
// selects a block from the rune's high bits, index2 resolves the final
// record index within that block. Runes at or above 0x110000 keep index 0.
// NOTE(review): record 0 is presumably a default/unassigned record —
// confirm against the table generator.
fn get_ucdrecord(rn: rune) *ucd_record = {
	const code = rn: u32;
	let index = 0u16;
	if (code < 0x110000) {
		index = index1[(code>>UCD_RECORD_SHIFT)];
		index = index2[(index<<UCD_RECORD_SHIFT)+(code&((1<<UCD_RECORD_SHIFT)-1))];
	};
	// Reinterpret the packed tuple as a ucd_record; relies on the two
	// types sharing a layout (see the type definitions above).
	return &ucd_records[index]: *ucd_record;
};
// Unicode character General_Category attribute.
// The declaration order defines the numeric encoding stored in the generated
// tables ([[rune_gc]] casts the stored byte directly to this enum), so do not
// reorder these values.
export type gc = enum u8 {
	Cc, // Control
	Cf, // Format
	Cn, // Unassigned
	Co, // Private use
	Cs, // Surrogate
	Ll, // Lowercase letter
	Lm, // Modifier letter
	Lo, // Other letter
	Lt, // Titlecase letter
	Lu, // Uppercase letter
	Mc, // Spacing mark
	Me, // Enclosing mark
	Mn, // Non-spacing mark
	Nd, // Decimal number
	Nl, // Letter number
	No, // Other number
	Pc, // Connect punctuation
	Pd, // Dash punctuation
	Pe, // Close punctuation
	Pf, // Final punctuation
	Pi, // Initial punctuation
	Po, // Other punctuation
	Ps, // Open punctuation
	Sc, // Currency symbol
	Sk, // Modifier symbol
	Sm, // Math symbol
	So, // Other symbol
	Zl, // Line separator
	Zp, // Paragraph separator
	Zs, // Space separator
};
// Returns the [[gc]] (General_Category) corresponding to this rune.
export fn rune_gc(rn: rune) gc = {
	const rec = get_ucdrecord(rn);
	return rec.category: gc;
};
// Returns the human-readable English name associated with a [[gc]] value.
// See [[gc_code]] for the two-character code instead.
export fn gc_name(v: gc) const str = {
	switch (v) {
	case gc::Cc => return "Control";
	case gc::Cf => return "Format";
	case gc::Cn => return "Unassigned";
	case gc::Co => return "Private use";
	case gc::Cs => return "Surrogate";
	case gc::Ll => return "Lowercase letter";
	case gc::Lm => return "Modifier letter";
	case gc::Lo => return "Other letter";
	case gc::Lt => return "Titlecase letter";
	case gc::Lu => return "Uppercase letter";
	case gc::Mc => return "Spacing mark";
	case gc::Me => return "Enclosing mark";
	case gc::Mn => return "Non-spacing mark";
	case gc::Nd => return "Decimal number";
	case gc::Nl => return "Letter number";
	case gc::No => return "Other number";
	case gc::Pc => return "Connect punctuation";
	case gc::Pd => return "Dash punctuation";
	case gc::Pe => return "Close punctuation";
	case gc::Pf => return "Final punctuation";
	case gc::Pi => return "Initial punctuation";
	case gc::Po => return "Other punctuation";
	case gc::Ps => return "Open punctuation";
	case gc::Sc => return "Currency symbol";
	case gc::Sk => return "Modifier symbol";
	case gc::Sm => return "Math symbol";
	case gc::So => return "Other symbol";
	case gc::Zl => return "Line separator";
	case gc::Zp => return "Paragraph separator";
	case gc::Zs => return "Space separator";
	};
};
// Returns the two-character General_Category code (e.g. "Lu") associated
// with a [[gc]] value. See [[gc_name]] for a human-readable name instead.
export fn gc_code(v: gc) const str = {
	switch (v) {
	case gc::Cc => return "Cc";
	case gc::Cf => return "Cf";
	case gc::Cn => return "Cn";
	case gc::Co => return "Co";
	case gc::Cs => return "Cs";
	case gc::Ll => return "Ll";
	case gc::Lm => return "Lm";
	case gc::Lo => return "Lo";
	case gc::Lt => return "Lt";
	case gc::Lu => return "Lu";
	case gc::Mc => return "Mc";
	case gc::Me => return "Me";
	case gc::Mn => return "Mn";
	case gc::Nd => return "Nd";
	case gc::Nl => return "Nl";
	case gc::No => return "No";
	case gc::Pc => return "Pc";
	case gc::Pd => return "Pd";
	case gc::Pe => return "Pe";
	case gc::Pf => return "Pf";
	case gc::Pi => return "Pi";
	case gc::Po => return "Po";
	case gc::Ps => return "Ps";
	case gc::Sc => return "Sc";
	case gc::Sk => return "Sk";
	case gc::Sm => return "Sm";
	case gc::So => return "So";
	case gc::Zl => return "Zl";
	case gc::Zp => return "Zp";
	case gc::Zs => return "Zs";
	};
};
// Bidirectional classification (Unicode Bidi_Class). The declaration order
// defines the numeric encoding stored in the generated tables ([[rune_bidi]]
// casts the stored byte directly to this enum), so do not reorder.
export type bidi = enum u8 {
	// NOTE(review): not a standard Bidi_Class value; presumably a
	// placeholder for unassigned records — confirm against the generator.
	UNKNOWN,
	L, // Left-to-Right
	LRE, // Left-to-Right Embedding
	LRO, // Left-to-Right Override
	R, // Right-to-Left
	AL, // Arabic Letter
	RLE, // Right-to-Left Embedding
	RLO, // Right-to-Left Override
	PDF, // Pop Directional Format
	EN, // European Number
	ES, // European Separator
	ET, // European Terminator
	AN, // Arabic Number
	CS, // Common Separator
	NSM, // Nonspacing Mark
	BN, // Boundary Neutral
	B, // Paragraph Separator
	S, // Segment Separator
	WS, // Whitespace
	ON, // Other Neutral
	LRI, // Left-to-Right Isolate
	RLI, // Right-to-Left Isolate
	FSI, // First Strong Isolate
	PDI, // Pop Directional Isolate
};
// Returns the [[bidi]] classification corresponding to this rune.
export fn rune_bidi(rn: rune) bidi = {
	const rec = get_ucdrecord(rn);
	return rec.bidirectional: bidi;
};
// Unicode character Script attribute. Each value's trailing comment is its
// four-letter ISO 15924 code. The declaration order defines the numeric
// encoding stored in the generated tables ([[rune_script]] casts the stored
// u16 directly to this enum), so do not reorder these values.
export type script = enum u16 {
	COMMON, // Zyyy
	INHERITED, // Zinh
	UNKNOWN, // Zzzz
	ADLAM, // Adlm
	CAUCASIAN_ALBANIAN, // Aghb
	AHOM, // Ahom
	ARABIC, // Arab
	IMPERIAL_ARAMAIC, // Armi
	ARMENIAN, // Armn
	AVESTAN, // Avst
	BALINESE, // Bali
	BAMUM, // Bamu
	BASSA_VAH, // Bass
	BATAK, // Batk
	BENGALI, // Beng
	BHAIKSUKI, // Bhks
	BOPOMOFO, // Bopo
	BRAHMI, // Brah
	BRAILLE, // Brai
	BUGINESE, // Bugi
	BUHID, // Buhd
	CHAKMA, // Cakm
	CANADIAN_SYLLABICS, // Cans
	CARIAN, // Cari
	CHAM, // Cham
	CHEROKEE, // Cher
	CHORASMIAN, // Chrs
	COPTIC, // Copt
	CYPRO_MINOAN, // Cpmn
	CYPRIOT, // Cprt
	CYRILLIC, // Cyrl
	DEVANAGARI, // Deva
	DIVES_AKURU, // Diak
	DOGRA, // Dogr
	DESERET, // Dsrt
	DUPLOYAN, // Dupl
	EGYPTIAN_HIEROGLYPHS, // Egyp
	ELBASAN, // Elba
	ELYMAIC, // Elym
	ETHIOPIC, // Ethi
	GEORGIAN, // Geor
	GLAGOLITIC, // Glag
	GUNJALA_GONDI, // Gong
	MASARAM_GONDI, // Gonm
	GOTHIC, // Goth
	GRANTHA, // Gran
	GREEK, // Grek
	GUJARATI, // Gujr
	GURMUKHI, // Guru
	HANGUL, // Hang
	HAN, // Hani
	HANUNOO, // Hano
	HATRAN, // Hatr
	HEBREW, // Hebr
	HIRAGANA, // Hira
	ANATOLIAN_HIEROGLYPHS, // Hluw
	PAHAWH_HMONG, // Hmng
	NYIAKENG_PUACHUE_HMONG, // Hmnp
	OLD_HUNGARIAN, // Hung
	OLD_ITALIC, // Ital
	JAVANESE, // Java
	KAYAH_LI, // Kali
	KATAKANA, // Kana
	KAWI, // Kawi
	KHAROSHTHI, // Khar
	KHMER, // Khmr
	KHOJKI, // Khoj
	KHITAN_SMALL_SCRIPT, // Kits
	KANNADA, // Knda
	KAITHI, // Kthi
	TAI_THAM, // Lana
	LAO, // Laoo
	LATIN, // Latn
	LEPCHA, // Lepc
	LIMBU, // Limb
	LINEAR_A, // Lina
	LINEAR_B, // Linb
	LISU, // Lisu
	LYCIAN, // Lyci
	LYDIAN, // Lydi
	MAHAJANI, // Mahj
	MAKASAR, // Maka
	MANDAIC, // Mand
	MANICHAEAN, // Mani
	MARCHEN, // Marc
	MEDEFAIDRIN, // Medf
	MENDE_KIKAKUI, // Mend
	MEROITIC_CURSIVE, // Merc
	MEROITIC_HIEROGLYPHS, // Mero
	MALAYALAM, // Mlym
	MODI, // Modi
	MONGOLIAN, // Mong
	MRO, // Mroo
	MEETEI_MAYEK, // Mtei
	MULTANI, // Mult
	MYANMAR, // Mymr
	NAG_MUNDARI, // Nagm
	NANDINAGARI, // Nand
	OLD_NORTH_ARABIAN, // Narb
	NABATAEAN, // Nbat
	NEWA, // Newa
	NKO, // Nkoo
	NUSHU, // Nshu
	OGHAM, // Ogam
	OL_CHIKI, // Olck
	OLD_TURKIC, // Orkh
	ORIYA, // Orya
	OSAGE, // Osge
	OSMANYA, // Osma
	OLD_UYGHUR, // Ougr
	PALMYRENE, // Palm
	PAU_CIN_HAU, // Pauc
	OLD_PERMIC, // Perm
	PHAGS_PA, // Phag
	INSCRIPTIONAL_PAHLAVI, // Phli
	PSALTER_PAHLAVI, // Phlp
	PHOENICIAN, // Phnx
	MIAO, // Plrd
	INSCRIPTIONAL_PARTHIAN, // Prti
	REJANG, // Rjng
	HANIFI_ROHINGYA, // Rohg
	RUNIC, // Runr
	SAMARITAN, // Samr
	OLD_SOUTH_ARABIAN, // Sarb
	SAURASHTRA, // Saur
	SIGNWRITING, // Sgnw
	SHAVIAN, // Shaw
	SHARADA, // Shrd
	SIDDHAM, // Sidd
	KHUDAWADI, // Sind
	SINHALA, // Sinh
	SOGDIAN, // Sogd
	OLD_SOGDIAN, // Sogo
	SORA_SOMPENG, // Sora
	SOYOMBO, // Soyo
	SUNDANESE, // Sund
	SYLOTI_NAGRI, // Sylo
	SYRIAC, // Syrc
	TAGBANWA, // Tagb
	TAKRI, // Takr
	TAI_LE, // Tale
	NEW_TAI_LUE, // Talu
	TAMIL, // Taml
	TANGUT, // Tang
	TAI_VIET, // Tavt
	TELUGU, // Telu
	TIFINAGH, // Tfng
	TAGALOG, // Tglg
	THAANA, // Thaa
	THAI, // Thai
	TIBETAN, // Tibt
	TIRHUTA, // Tirh
	TANGSA, // Tnsa
	TOTO, // Toto
	UGARITIC, // Ugar
	VAI, // Vaii
	VITHKUQI, // Vith
	WARANG_CITI, // Wara
	WANCHO, // Wcho
	OLD_PERSIAN, // Xpeo
	CUNEIFORM, // Xsux
	YEZIDI, // Yezi
	YI, // Yiii
	ZANABAZAR_SQUARE, // Zanb
	MATH, // Zmth
};
// Returns the [[script]] corresponding to this rune.
export fn rune_script(rn: rune) script = {
	const rec = get_ucdrecord(rn);
	return rec.script: script;
};
// Returns the four-character code associated with a [[script]] value.
// The code is the script's ISO 15924 identifier (e.g. "Latn" for
// script::LATIN, "Zyyy" for script::COMMON). The returned string is a
// static constant and must not be freed. The switch is exhaustive over
// [[script]], so every value maps to exactly one code.
export fn script_code(sc: script) const str = {
switch (sc) {
case script::COMMON => return "Zyyy";
case script::INHERITED => return "Zinh";
case script::UNKNOWN => return "Zzzz";
case script::ARABIC => return "Arab";
case script::ARMENIAN => return "Armn";
case script::BENGALI => return "Beng";
case script::CYRILLIC => return "Cyrl";
case script::DEVANAGARI => return "Deva";
case script::GEORGIAN => return "Geor";
case script::GREEK => return "Grek";
case script::GUJARATI => return "Gujr";
case script::GURMUKHI => return "Guru";
case script::HANGUL => return "Hang";
case script::HAN => return "Hani";
case script::HEBREW => return "Hebr";
case script::HIRAGANA => return "Hira";
case script::KANNADA => return "Knda";
case script::KATAKANA => return "Kana";
case script::LAO => return "Laoo";
case script::LATIN => return "Latn";
case script::MALAYALAM => return "Mlym";
case script::ORIYA => return "Orya";
case script::TAMIL => return "Taml";
case script::TELUGU => return "Telu";
case script::THAI => return "Thai";
case script::TIBETAN => return "Tibt";
case script::BOPOMOFO => return "Bopo";
case script::BRAILLE => return "Brai";
case script::CANADIAN_SYLLABICS => return "Cans";
case script::CHEROKEE => return "Cher";
case script::ETHIOPIC => return "Ethi";
case script::KHMER => return "Khmr";
case script::MONGOLIAN => return "Mong";
case script::MYANMAR => return "Mymr";
case script::OGHAM => return "Ogam";
case script::RUNIC => return "Runr";
case script::SINHALA => return "Sinh";
case script::SYRIAC => return "Syrc";
case script::THAANA => return "Thaa";
case script::YI => return "Yiii";
case script::DESERET => return "Dsrt";
case script::GOTHIC => return "Goth";
case script::OLD_ITALIC => return "Ital";
case script::BUHID => return "Buhd";
case script::HANUNOO => return "Hano";
case script::TAGALOG => return "Tglg";
case script::TAGBANWA => return "Tagb";
case script::CYPRIOT => return "Cprt";
case script::LIMBU => return "Limb";
case script::LINEAR_B => return "Linb";
case script::OSMANYA => return "Osma";
case script::SHAVIAN => return "Shaw";
case script::TAI_LE => return "Tale";
case script::UGARITIC => return "Ugar";
case script::BUGINESE => return "Bugi";
case script::COPTIC => return "Copt";
case script::GLAGOLITIC => return "Glag";
case script::KHAROSHTHI => return "Khar";
case script::NEW_TAI_LUE => return "Talu";
case script::OLD_PERSIAN => return "Xpeo";
case script::SYLOTI_NAGRI => return "Sylo";
case script::TIFINAGH => return "Tfng";
case script::BALINESE => return "Bali";
case script::CUNEIFORM => return "Xsux";
case script::NKO => return "Nkoo";
case script::PHAGS_PA => return "Phag";
case script::PHOENICIAN => return "Phnx";
case script::CARIAN => return "Cari";
case script::CHAM => return "Cham";
case script::KAYAH_LI => return "Kali";
case script::LEPCHA => return "Lepc";
case script::LYCIAN => return "Lyci";
case script::LYDIAN => return "Lydi";
case script::OL_CHIKI => return "Olck";
case script::REJANG => return "Rjng";
case script::SAURASHTRA => return "Saur";
case script::SUNDANESE => return "Sund";
case script::VAI => return "Vaii";
case script::AVESTAN => return "Avst";
case script::BAMUM => return "Bamu";
case script::EGYPTIAN_HIEROGLYPHS => return "Egyp";
case script::IMPERIAL_ARAMAIC => return "Armi";
case script::INSCRIPTIONAL_PAHLAVI => return "Phli";
case script::INSCRIPTIONAL_PARTHIAN => return "Prti";
case script::JAVANESE => return "Java";
case script::KAITHI => return "Kthi";
case script::LISU => return "Lisu";
case script::MEETEI_MAYEK => return "Mtei";
case script::OLD_SOUTH_ARABIAN => return "Sarb";
case script::OLD_TURKIC => return "Orkh";
case script::SAMARITAN => return "Samr";
case script::TAI_THAM => return "Lana";
case script::TAI_VIET => return "Tavt";
case script::BATAK => return "Batk";
case script::BRAHMI => return "Brah";
case script::MANDAIC => return "Mand";
case script::CHAKMA => return "Cakm";
case script::MEROITIC_CURSIVE => return "Merc";
case script::MEROITIC_HIEROGLYPHS => return "Mero";
case script::MIAO => return "Plrd";
case script::SHARADA => return "Shrd";
case script::SORA_SOMPENG => return "Sora";
case script::TAKRI => return "Takr";
case script::BASSA_VAH => return "Bass";
case script::CAUCASIAN_ALBANIAN => return "Aghb";
case script::DUPLOYAN => return "Dupl";
case script::ELBASAN => return "Elba";
case script::GRANTHA => return "Gran";
case script::KHOJKI => return "Khoj";
case script::KHUDAWADI => return "Sind";
case script::LINEAR_A => return "Lina";
case script::MAHAJANI => return "Mahj";
case script::MANICHAEAN => return "Mani";
case script::MENDE_KIKAKUI => return "Mend";
case script::MODI => return "Modi";
case script::MRO => return "Mroo";
case script::NABATAEAN => return "Nbat";
case script::OLD_NORTH_ARABIAN => return "Narb";
case script::OLD_PERMIC => return "Perm";
case script::PAHAWH_HMONG => return "Hmng";
case script::PALMYRENE => return "Palm";
case script::PAU_CIN_HAU => return "Pauc";
case script::PSALTER_PAHLAVI => return "Phlp";
case script::SIDDHAM => return "Sidd";
case script::TIRHUTA => return "Tirh";
case script::WARANG_CITI => return "Wara";
case script::AHOM => return "Ahom";
case script::ANATOLIAN_HIEROGLYPHS => return "Hluw";
case script::HATRAN => return "Hatr";
case script::MULTANI => return "Mult";
case script::OLD_HUNGARIAN => return "Hung";
case script::SIGNWRITING => return "Sgnw";
case script::ADLAM => return "Adlm";
case script::BHAIKSUKI => return "Bhks";
case script::MARCHEN => return "Marc";
case script::OSAGE => return "Osge";
case script::TANGUT => return "Tang";
case script::NEWA => return "Newa";
case script::MASARAM_GONDI => return "Gonm";
case script::NUSHU => return "Nshu";
case script::SOYOMBO => return "Soyo";
case script::ZANABAZAR_SQUARE => return "Zanb";
case script::DOGRA => return "Dogr";
case script::GUNJALA_GONDI => return "Gong";
case script::HANIFI_ROHINGYA => return "Rohg";
case script::MAKASAR => return "Maka";
case script::MEDEFAIDRIN => return "Medf";
case script::OLD_SOGDIAN => return "Sogo";
case script::SOGDIAN => return "Sogd";
case script::ELYMAIC => return "Elym";
case script::NANDINAGARI => return "Nand";
case script::NYIAKENG_PUACHUE_HMONG => return "Hmnp";
case script::WANCHO => return "Wcho";
case script::CHORASMIAN => return "Chrs";
case script::DIVES_AKURU => return "Diak";
case script::KHITAN_SMALL_SCRIPT => return "Kits";
case script::YEZIDI => return "Yezi";
case script::CYPRO_MINOAN => return "Cpmn";
case script::OLD_UYGHUR => return "Ougr";
case script::TANGSA => return "Tnsa";
case script::TOTO => return "Toto";
case script::VITHKUQI => return "Vith";
case script::MATH => return "Zmth";
case script::KAWI => return "Kawi";
case script::NAG_MUNDARI => return "Nagm";
};
};
// Line break classification. Members are the short names of the Unicode
// Line_Break property (UAX #14).
// NOTE(review): declaration order defines each member's u8 value, which
// appears to be what the generated UCD tables store (rune_line_break
// casts the raw record field to this enum) — do not reorder.
export type line_break = enum u8 {
XX, // Unknown
AI, // Ambiguous (alphabetic or ideographic)
BK, // Mandatory break
CJ, // Conditional Japanese starter
CR, // Carriage return
LF, // Line feed
NL, // Next line
SA, // Complex-context dependent (South East Asian)
SG, // Surrogate
SP, // Space
OP, // Open punctuation
CL, // Close punctuation
CP, // Closing parenthesis
QU, // Quotation
GL, // Non-breaking ("glue")
NS, // Nonstarter
EX, // Exclamation/interrogation
SY, // Symbols allowing break after
IS, // Infix numeric separator
PR, // Prefix numeric
PO, // Postfix numeric
NU, // Numeric
AL, // Alphabetic
HL, // Hebrew letter
ID, // Ideographic
IN, // Inseparable
HY, // Hyphen
BA, // Break after
BB, // Break before
B2, // Break opportunity before and after
ZW, // Zero width space
CM, // Combining mark
WJ, // Word joiner
H2, // Hangul LV syllable
H3, // Hangul LVT syllable
JL, // Hangul L jamo
JV, // Hangul V jamo
JT, // Hangul T jamo
RI, // Regional indicator
EB, // Emoji base
EM, // Emoji modifier
ZWJ, // Zero width joiner
CB, // Contingent break opportunity
};
// Returns the [[line_break]] classification recorded for this rune in
// the Unicode character database.
export fn rune_line_break(rn: rune) line_break = {
	const record = get_ucdrecord(rn);
	return record.line_break: line_break;
};
// Returns the two-character code associated with a [[line_break]]
// value, i.e. the UAX #14 short name (the code spells out the enum
// member's own identifier). The returned string is a static constant.
export fn line_break_code(lb: line_break) const str = {
	switch (lb) {
	case line_break::XX => return "XX";
	case line_break::AI => return "AI";
	case line_break::AL => return "AL";
	case line_break::B2 => return "B2";
	case line_break::BA => return "BA";
	case line_break::BB => return "BB";
	case line_break::BK => return "BK";
	case line_break::CB => return "CB";
	case line_break::CJ => return "CJ";
	case line_break::CL => return "CL";
	case line_break::CM => return "CM";
	case line_break::CP => return "CP";
	case line_break::CR => return "CR";
	case line_break::EB => return "EB";
	case line_break::EM => return "EM";
	case line_break::EX => return "EX";
	case line_break::GL => return "GL";
	case line_break::H2 => return "H2";
	case line_break::H3 => return "H3";
	case line_break::HL => return "HL";
	case line_break::HY => return "HY";
	case line_break::ID => return "ID";
	case line_break::IN => return "IN";
	case line_break::IS => return "IS";
	case line_break::JL => return "JL";
	case line_break::JT => return "JT";
	case line_break::JV => return "JV";
	case line_break::LF => return "LF";
	case line_break::NL => return "NL";
	case line_break::NS => return "NS";
	case line_break::NU => return "NU";
	case line_break::OP => return "OP";
	case line_break::PO => return "PO";
	case line_break::PR => return "PR";
	case line_break::QU => return "QU";
	case line_break::RI => return "RI";
	case line_break::SA => return "SA";
	case line_break::SG => return "SG";
	case line_break::SP => return "SP";
	case line_break::SY => return "SY";
	case line_break::WJ => return "WJ";
	case line_break::ZW => return "ZW";
	case line_break::ZWJ => return "ZWJ";
	};
};

4122
vendor/hare-unicode/unicode/ucd_gen.ha vendored Normal file

File diff suppressed because it is too large Load diff