Lobo Torres
c70ec9f648
git-subtree-dir: vendor/hare-unicode git-subtree-mainline:57979aa6fc
git-subtree-split:1488c26f46
860 lines
26 KiB
Python
Executable file
860 lines
26 KiB
Python
Executable file
#!/usr/bin/python3
|
|
# Based on CPython's unicodedata generation script,
|
|
# Tools/unicode/makeunicodedata.py, forked and adapted for Hare
|
|
#
|
|
# PSF License
|
|
#
|
|
# (re)generate unicode property and type databases
|
|
#
|
|
# This script converts Unicode database files to unicode/ucd_gen.ha
# (the CPython original generated Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h)
|
|
#
|
|
# history:
|
|
# 2000-09-24 fl created (based on bits and pieces from unidb)
|
|
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
|
|
# 2000-09-25 fl added character type table
|
|
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
|
|
# 2000-11-03 fl expand first/last ranges
|
|
# 2001-01-19 fl added character name tables (2.1)
|
|
# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
|
|
# 2002-09-11 wd use string methods
|
|
# 2002-10-18 mvl update to Unicode 3.2
|
|
# 2002-10-22 mvl generate NFC tables
|
|
# 2002-11-24 mvl expand all ranges, sort names version-independently
|
|
# 2002-11-25 mvl add UNIDATA_VERSION
|
|
# 2004-05-29 perky add east asian width information
|
|
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
|
|
# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
|
|
# 2011-10-21 ezio add support for name aliases and named sequences
|
|
# 2012-01 benjamin add full case mappings
|
|
#
|
|
# written by Fredrik Lundh (fredrik@pythonware.com)
|
|
#
|
|
|
|
import dataclasses
|
|
import os
|
|
import sys
|
|
import zipfile
|
|
|
|
from functools import partial
|
|
from textwrap import dedent
|
|
from typing import Iterator, List, Optional, Set, Tuple
|
|
|
|
SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst, and
#   * Doc/library/unicodedata.rst
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "13.0.0"

# File-name templates for the UCD data files.  open_data() fills the single
# %s placeholder: with "-<version>" for the locally cached copy and with ""
# (or "-<version>" for 3.2.0) for the download URL.
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"
SCRIPTS = "Scripts%s.txt"
|
|
|
|
# Private Use Areas.  PUA_1 is the block U+E000..U+F8FF (which lies in
# plane 0); PUA_15 and PUA_16 are the private-use planes 15 and 16.  The
# upper bounds are written inclusively and turned into range() stops.
PUA_1 = range(0xE000, 0xF8FF + 1)
PUA_15 = range(0xF0000, 0xFFFFD + 1)
PUA_16 = range(0x100000, 0x10FFFD + 1)

# Slices of PUA_15 that UnicodeData uses to store name aliases and named
# sequences.
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

old_versions = []
|
|
|
|
# Order must match ucd.ha: makeunicodedata() writes the list position of
# each name into the generated records.
CATEGORY_NAMES = (
    "Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me "
    "Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc "
    "Sk Sm So Zl Zp Zs"
).split()

# Index 0 is the empty bidi class.
BIDIRECTIONAL_NAMES = [""] + (
    "L LRE LRO R AL RLE RLO "
    "PDF EN ES ET AN CS NSM BN B S WS "
    "ON LRI RLI FSI PDI"
).split()

EASTASIANWIDTH_NAMES = "F H W Na A N".split()

# Line-break classes that set the 'Line_Break' binary property.
MANDATORY_LINE_BREAKS = "BK CR LF NL".split()

LINE_BREAKS = (
    "XX AI BK CJ CR LF NL SA SG SP OP CL "
    "CP QU GL NS EX SY IS PR PO NU AL HL "
    "ID IN HY BA BB B2 ZW CM WJ H2 H3 JL "
    "JV JT RI EB EM ZWJ CB"
).split()
|
|
|
|
# Script property values.  makeunicodedata() writes the list position of a
# character's script into the generated records, so entries must not be
# reordered; "Unknown" is used for characters without a script value.
SCRIPT_NAMES = """
    Common Inherited Unknown Adlam Caucasian_Albanian Ahom Arabic
    Imperial_Aramaic Armenian Avestan Balinese Bamum Bassa_Vah Batak
    Bengali Bhaiksuki Bopomofo Brahmi Braille Buginese Buhid Chakma
    Canadian_Aboriginal Carian Cham Cherokee Chorasmian Coptic
    Cypro_Minoan Cypriot Cyrillic Devanagari Dives_Akuru Dogra Deseret
    Duployan Egyptian_Hieroglyphs Elbasan Elymaic Ethiopic Georgian
    Glagolitic Gunjala_Gondi Masaram_Gondi Gothic Grantha Greek
    Gujarati Gurmukhi Hangul Han Hanunoo Hatran Hebrew Hiragana
    Anatolian_Hieroglyphs Pahawh_Hmong Nyiakeng_Puachue_Hmong
    Old_Hungarian Old_Italic Javanese Kayah_Li Katakana Kawi Kharoshthi
    Khmer Khojki Khitan_Small_Script Kannada Kaithi Tai_Tham Lao Latin
    Lepcha Limbu Linear_A Linear_B Lisu Lycian Lydian Mahajani Makasar
    Mandaic Manichaean Marchen Medefaidrin Mende_Kikakui
    Meroitic_Cursive Meroitic_Hieroglyphs Malayalam Modi Mongolian Mro
    Meetei_Mayek Multani Myanmar Nag_Mundari Nandinagari
    Old_North_Arabian Nabataean Newa Nko Nushu Ogham Ol_Chiki
    Old_Turkic Oriya Osage Osmanya Old_Uyghur Palmyrene Pau_Cin_Hau
    Old_Permic Phags_Pa Inscriptional_Pahlavi Psalter_Pahlavi
    Phoenician Miao Inscriptional_Parthian Rejang Hanifi_Rohingya Runic
    Samaritan Old_South_Arabian Saurashtra SignWriting Shavian Sharada
    Siddham Khudawadi Sinhala Sogdian Old_Sogdian Sora_Sompeng Soyombo
    Sundanese Syloti_Nagri Syriac Tagbanwa Takri Tai_Le New_Tai_Lue
    Tamil Tangut Tai_Viet Telugu Tifinagh Tagalog Thaana Thai Tibetan
    Tirhuta Tangsa Toto Ugaritic Vai Vithkuqi Warang_Citi Wancho
    Old_Persian Cuneiform Yezidi Yi Zanabazar_Square
""".split()
|
|
|
|
# note: should match definitions in Objects/unicodectype.c
# Each mask occupies one bit, written here as its bit position.
ALPHA_MASK = 1 << 0
DECIMAL_MASK = 1 << 1
DIGIT_MASK = 1 << 2
LOWER_MASK = 1 << 3
LINEBREAK_MASK = 1 << 4
SPACE_MASK = 1 << 5
TITLE_MASK = 1 << 6
UPPER_MASK = 1 << 7
XID_START_MASK = 1 << 8
XID_CONTINUE_MASK = 1 << 9
PRINTABLE_MASK = 1 << 10
NUMERIC_MASK = 1 << 11
CASE_IGNORABLE_MASK = 1 << 12
CASED_MASK = 1 << 13
EXTENDED_CASE_MASK = 1 << 14
|
|
|
|
# these ranges need to match unicodedata.c:is_unified_ideograph
# Each entry is a (first, last) pair of upper-case hex strings, because
# UnicodeData compares them against the code point column of the
# "<CJK Ideograph..., First>" / "..., Last>" rows as read from the file.
cjk_ranges = [
    tuple(span.split('..'))
    for span in (
        '3400..4DBF',
        '4E00..9FFC',
        '20000..2A6DD',
        '2A700..2B734',
        '2B740..2B81D',
        '2B820..2CEA1',
        '2CEB0..2EBE0',
        '30000..3134A',
    )
]
|
|
|
|
def maketables(trace=0):
    """Load the Unicode database and generate the output tables.

    Reads the UCD files for UNIDATA_VERSION through UnicodeData, reports
    how many code points have a record, and hands the database to
    makeunicodedata().  `trace` is forwarded unchanged.
    """
    print("--- Reading", UNICODE_DATA % "", "...")

    database = UnicodeData(UNIDATA_VERSION)

    # Unassigned code points are stored as None in the table.
    assigned = sum(1 for entry in database.table if entry)
    print(assigned, "characters")

    makeunicodedata(database, trace)
|
|
|
|
|
|
# --------------------------------------------------------------------
|
|
# unicode character properties
|
|
|
|
def makeunicodedata(unicode, trace):
    """Write the character property records and their index tables.

    For every code point in ``unicode.chars`` that has a record, the
    general category, combining class, bidi class, mirrored flag, East
    Asian width, script and line-break class are reduced to a tuple of
    small integers.  Identical tuples are stored once; a per-code-point
    index selects the tuple.  The index is compressed with splitbins() and
    everything is written as Hare source to ``unicode/ucd_gen.ha``
    (relative to the current working directory).

    unicode -- a UnicodeData instance
    trace   -- verbosity, forwarded to splitbins() and Array.dump()

    Raises ValueError (from list.index) if a record carries a property
    value that is missing from the corresponding *_NAMES table.
    """

    # Record 0 is the all-zero placeholder that every code point without a
    # record points at.
    dummy = (0, 0, 0, 0, 0, 0, 0)
    table = [dummy]
    # Maps a property tuple to its position in `table`.  The placeholder is
    # registered under its own tuple so the cache is keyed consistently and
    # a record equal to the placeholder would reuse slot 0 instead of being
    # appended a second time.
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)

    FILE = "unicode/ucd_gen.ha"

    print("--- Preparing", FILE, "...")

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record.general_category)
            combining = int(record.canonical_combining_class)
            bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
            mirrored = record.bidi_mirrored == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
            # record.script may be None when the scripts file has no entry
            script = SCRIPT_NAMES.index(record.script or "Unknown")
            line_break = LINE_BREAKS.index(record.line_break)
            item = (
                category, combining, bidirectional,
                mirrored, eastasianwidth, script,
                line_break,
            )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique properties")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("// Generated by scripts/gen-ucd.py")
        fprint()
        fprint('// Unicode database version supported by this module')
        fprint('export def UNIDATA_VERSION: str = "%s";' % UNIDATA_VERSION)
        fprint('')
        fprint("// List of unique database records")
        fprint("const ucd_records: [_]ucd_encodedrec = [")
        for item in table:
            # `mirrored` is a bool; %d renders it as 0 or 1
            fprint(" (%d, %d, %d, %d, %d, %d, %d)," % item)
        fprint("];")
        fprint()

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("// index tables for the database records")
        fprint("def UCD_RECORD_SHIFT: size = %d;" % shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)
|
|
|
|
|
|
# Directory (relative to the working directory) where downloaded UCD files
# are cached.
DATA_DIR = '.data'
|
|
|
|
def open_data(template, version):
    """Open a UCD data file, downloading it into DATA_DIR if necessary.

    template -- file name template with one %s placeholder
    version  -- UCD version string, e.g. "13.0.0"

    The cached copy is named with "-<version>" substituted into the
    template.  Returns an open file object: text mode (UTF-8) for names
    ending in '.txt', binary mode for everything else.  The caller is
    responsible for closing it.
    """
    suffix = '-' + version
    local = os.path.join(DATA_DIR, template % (suffix,))

    if not os.path.exists(local):
        import urllib.request

        if version == '3.2.0':
            # irregular url structure
            url = ('https://www.unicode.org/Public/3.2-Update/'+template) % (suffix,)
        else:
            url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        os.makedirs(DATA_DIR, exist_ok=True)
        urllib.request.urlretrieve(url, filename=local)

    if not local.endswith('.txt'):
        # Unihan.zip
        return open(local, 'rb')
    return open(local, encoding='utf-8')
|
|
|
|
|
|
def expand_range(char_range: str) -> Iterator[int]:
    '''
    Yield every code point of a UCD code point range.

    Accepts either a single hexadecimal code point ("0041") or an
    inclusive range ("0041..005A"), as described in UAX #44:
    https://www.unicode.org/reports/tr44/#Code_Point_Ranges
    '''
    bounds = [int(part, 16) for part in char_range.split('..')]
    if len(bounds) == 1:
        first = last = bounds[0]
    else:
        first, last = bounds
    yield from range(first, last + 1)
|
|
|
|
|
|
class UcdFile:
    '''
    Reader for a data file in the standard UCD format.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    The Unihan data files use their own, different format and cannot be
    read with this class.
    '''

    def __init__(self, template: str, version: str) -> None:
        # Nothing is opened here; the file is opened each time records()
        # is iterated.
        self.version = version
        self.template = template

    def records(self) -> Iterator[List[str]]:
        '''Yield each data line as a list of stripped, ';'-separated fields.

        '#' comments are removed first; lines that are empty afterwards
        are skipped.
        '''
        with open_data(self.template, self.version) as file:
            for raw in file:
                content = raw.split('#', 1)[0].strip()
                if content:
                    yield [field.strip() for field in content.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        return self.records()

    def expanded(self) -> Iterator[Tuple[int, List[str]]]:
        '''Yield (code point, remaining fields) for every code point.

        The first field of each record is treated as a code point or code
        point range and expanded with expand_range(); the same list of
        remaining fields is yielded for every code point of a range.
        '''
        for record in self.records():
            rest = record[1:]
            for char in expand_range(record[0]):
                yield char, rest
|
|
|
|
|
|
@dataclasses.dataclass
class UcdRecord:
    # The first 15 fields are filled positionally from one row of
    # UnicodeData.txt.  See:
    #   https://www.unicode.org/reports/tr44/#UnicodeData.txt
    codepoint: str
    name: str
    general_category: str
    canonical_combining_class: str
    bidi_class: str
    decomposition_type: str
    decomposition_mapping: str
    numeric_type: str
    numeric_value: str
    bidi_mirrored: str
    unicode_1_name: str  # obsolete
    iso_comment: str  # obsolete
    simple_uppercase_mapping: str
    simple_lowercase_mapping: str
    simple_titlecase_mapping: str

    # Width class from EastAsianWidth.txt; None until it has been merged in.
    #   https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
    east_asian_width: Optional[str]

    # Names of the binary properties that are true for this character.
    # Collected from several files:
    #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
    #   https://www.unicode.org/reports/tr44/#LineBreak.txt
    binary_properties: Set[str]

    # Normalization Quick_Check values, packed into one bitmask:
    #   https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
    quick_check: int

    # Script name, from Scripts.txt
    script: str

    # Line-break class, from LineBreak.txt
    line_break: str
|
|
|
|
|
|
def from_row(row: List[str]) -> UcdRecord:
    """Build a UcdRecord from the 15 fields of a UnicodeData.txt row.

    The fields that come from other files start out with placeholder
    values: no East Asian width, no binary properties, quick-check mask 0,
    script "Unknown" and line-break class "XX".
    """
    return UcdRecord(
        *row,
        east_asian_width=None,
        binary_properties=set(),
        quick_check=0,
        script="Unknown",
        line_break="XX",
    )
|
|
|
|
|
|
# --------------------------------------------------------------------
|
|
# the following support code is taken from the unidb utilities
|
|
# Copyright (c) 1999-2000 by Secret Labs AB
|
|
|
|
# load a unicode-data file from disk
|
|
|
|
class UnicodeData:
    """The Unicode Character Database for one UCD version, held in memory.

    Constructing an instance reads (and, through open_data, downloads if
    missing) the UCD files and merges them into one record per code point.

    Attributes set by __init__:
      table           -- list of 0x110000 entries indexed by code point;
                         a UcdRecord, or None for code points without one
      chars           -- list of the code points to process
      filename        -- UNICODE_DATA with an empty version suffix
      changed         -- empty list
      aliases         -- list of (name, code point); not set for '3.2.0'
      named_sequences -- list of (name, tuple of code points); not set for
                         '3.2.0'
      exclusions      -- dict mapping composition-excluded code points to 1
      special_casing  -- dict: code point -> (lower, title, upper) lists
      case_folding    -- dict: code point -> list of folded code points
    """
    # table: List[Optional[UcdRecord]] # index is codepoint; None means unassigned

    def __init__(self, version, cjk_check=True):
        """Load all data files for UCD `version`.

        version   -- UCD version string, e.g. "13.0.0"
        cjk_check -- when true, raise ValueError if the CJK ideograph
                     ranges found in UnicodeData.txt differ from the
                     module-level cjk_ranges
        """
        self.changed = []
        table = [None] * 0x110000
        for s in UcdFile(UNICODE_DATA, version):
            char = int(s[0], 16)
            table[char] = from_row(s)

        cjk_ranges_found = []

        # expand first-last ranges
        # `field` holds the 15 UnicodeData fields of the most recent
        # "..., First>" row while we are inside its range, else None.
        field = None
        for i in range(0, 0x110000):
            # The file UnicodeData.txt has its own distinct way of
            # expressing ranges.  See:
            #   https://www.unicode.org/reports/tr44/#Code_Point_Ranges
            s = table[i]
            if s:
                if s.name[-6:] == "First>":
                    s.name = ""
                    field = dataclasses.astuple(s)[:15]
                elif s.name[-5:] == "Last>":
                    if s.name.startswith("<CJK Ideograph"):
                        cjk_ranges_found.append((field[0],
                                                 s.codepoint))
                    s.name = ""
                    field = None
            elif field:
                # code point inside a First/Last range: copy the range's
                # properties, substituting its own code point
                table[i] = from_row(('%X' % i,) + field[1:])
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, starting at
            # NAME_ALIASES_START (U+F0000), in order to take advantage of
            # the compression and lookup algorithms used for the other
            # characters
            pua_index = NAME_ALIASES_START
            for char, name, abbrev in UcdFile(NAME_ALIASES, version):
                char = int(char, 16)
                self.aliases.append((name, char))
                # also store the name in the PUA 15
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 15, starting at
            # NAMED_SEQUENCES_START (U+F0200), in order to take advantage
            # of the compression and lookup algorithms used for the other
            # characters.

            # the alias slots must not run into the named-sequence slots
            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            for name, chars in UcdFile(NAMED_SEQUENCES, version):
                chars = tuple(int(char, 16) for char in chars.split())
                # check that the structure defined in makeunicodename is OK
                assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                    "the NamedSequence struct and in unicodedata_lookup")
                self.named_sequences.append((name, chars))
                # also store these in the PUA 15
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        self.exclusions = {}
        for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
            char = int(char, 16)
            self.exclusions[char] = 1

        # East Asian width; stays None for code points the file omits
        widths = [None] * 0x110000
        for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
            widths[char] = width

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].east_asian_width = widths[i]

        # Script; overwrites the "Unknown" default with None for code
        # points the file omits (makeunicodedata maps None back to
        # "Unknown")
        scripts = [None] * 0x110000
        for char, (script,) in UcdFile(SCRIPTS, version).expanded():
            scripts[char] = script

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].script = scripts[i]

        for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
            if table[char]:
                # Some properties (e.g. Default_Ignorable_Code_Point)
                # apply to unassigned code points; ignore them
                table[char].binary_properties.add(p)

        for char_range, value in UcdFile(LINE_BREAK, version):
            for char in expand_range(char_range):
                if not table[char]:
                    continue
                # mandatory breaks are additionally flagged as a binary
                # property; the class itself is stored for every character
                if value in MANDATORY_LINE_BREAKS:
                    table[char].binary_properties.add('Line_Break')
                table[char].line_break = value

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
            if len(s) < 2 or s[1] not in qc_order:
                continue
            # two bits per normalization form: 1 = Maybe, 2 = No, placed
            # at bit offset 2 * (position of the form in qc_order)
            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
            quickcheck_shift = qc_order.index(s[1])*2
            quickcheck <<= quickcheck_shift
            for char in expand_range(s[0]):
                # each form may be set at most once per code point
                assert not (quickchecks[char]>>quickcheck_shift)&3
                quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].quick_check = quickchecks[i]

        # Numeric values of ideographs come from the Unihan archive.
        # NOTE(review): `zip` shadows the builtin and the ZipFile object is
        # never closed explicitly; only the underlying file is.
        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i].numeric_value = value

        sc = self.special_casing = {}
        for data in UcdFile(SPECIAL_CASING, version):
            if data[4]:
                # We ignore all conditionals (since they depend on
                # languages) except for one, which is hardcoded. See
                # handle_capital_sigma in unicodeobject.c.
                continue
            c = int(data[0], 16)
            lower = [int(char, 16) for char in data[1].split()]
            title = [int(char, 16) for char in data[2].split()]
            upper = [int(char, 16) for char in data[3].split()]
            sc[c] = (lower, title, upper)

        cf = self.case_folding = {}
        if version != '3.2.0':
            for data in UcdFile(CASE_FOLDING, version):
                # NOTE(review): this is a substring test, so an empty
                # status field (or "CF") would also be accepted; the
                # intent looks like status "C" or "F" -- confirm.
                if data[1] in "CF":
                    c = int(data[0], 16)
                    cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        """Restrict the code points to process to 0..255."""
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))
|
|
|
|
|
|
# hash table tools
|
|
|
|
# this is a straight-forward reimplementation of Python's built-in
|
|
# dictionary type, using a static data structure, and a custom string
|
|
# hash algorithm.
|
|
|
|
def myhash(s, magic):
    """Return the case-insensitive string hash of `s`.

    The string is upper-cased, then each character is folded in as
    ``h = h * magic + ord(c)``.  Whenever bits 24..31 of the running value
    are non-zero, that byte is XOR-ed into the low bits and the value is
    cut back to 24 bits.
    """
    acc = 0
    for ch in s.upper():
        acc = acc * magic + ord(ch)
        overflow = acc & 0xff000000
        if overflow:
            acc = (acc ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return acc
|
|
|
|
|
|
# (table size, polynomial) pairs for Hash, in increasing size order.  Hash
# picks the first pair whose size exceeds the number of entries and uses
# size + polynomial when an increment overflows the table mask.
SIZES = [
    (4, 3),
    (8, 3),
    (16, 3),
    (32, 5),
    (64, 3),
    (128, 3),
    (256, 29),
    (512, 17),
    (1024, 9),
    (2048, 5),
    (4096, 83),
    (8192, 27),
    (16384, 43),
    (32768, 3),
    (65536, 45),
    (131072, 9),
    (262144, 39),
    (524288, 39),
    (1048576, 9),
    (2097152, 5),
    (4194304, 3),
    (8388608, 33),
    (16777216, 27),
]
|
|
|
|
|
|
class Hash:
    """Static open-addressing hash table built from (key, value) pairs.

    Keys are hashed with myhash(); only the values are stored, and unused
    slots are filled with 0.
    """

    def __init__(self, name, data, magic):
        """Build the table.

        name  -- base name used for the dumped array and constants
        data  -- sequence of (key, value) pairs
        magic -- multiplier passed to myhash()

        Raises AssertionError if `data` has too many entries for every
        size in SIZES.
        """
        # pick the first table size that is larger than the entry count
        for size, poly in SIZES:
            if size > len(data):
                poly += size
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        mask = size - 1
        slots = [None] * size
        collisions = 0

        # place every value, probing on collision
        for key, value in data:
            h = myhash(key, magic)
            pos = ~h & mask
            if slots[pos] is not None:
                # a zero increment would never move, so fall back to mask
                step = ((h ^ (h >> 3)) & mask) or mask
                while True:
                    collisions += 1
                    pos = (pos + step) & mask
                    if slots[pos] is None:
                        break
                    step <<= 1
                    if step > mask:
                        step ^= poly
            slots[pos] = value

        print(collisions, "collisions")
        self.collisions = collisions

        # empty slots are emitted as 0
        table = [0 if entry is None else entry for entry in slots]

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        """Write the table array followed by its magic/size/poly defines."""
        self.data.dump(file, trace)
        for label, number in (
            ("magic", self.magic),
            ("size", self.size),
            ("poly", self.poly),
        ):
            file.write("#define %s_" % self.name + label + " %d\n" % number)
|
|
|
|
|
|
# stuff to deal with arrays of unsigned integers
|
|
|
|
class Array:
    """A named sequence of unsigned integers that can be written as source."""

    def __init__(self, name, data):
        self.data = data
        self.name = name

    def dump(self, file, trace=0):
        """Write the array to `file` as a constant array definition.

        The element type (u8, u16 or u32) is chosen from getsize().  Items
        are wrapped so that a line does not exceed 78 characters before
        trailing whitespace is removed.  With a true `trace`, the byte
        size is reported on stderr.
        """
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)

        element_type = {1: "u8", 2: "u16"}.get(size, "u32")
        file.write("const " + self.name + ": [_]")
        file.write(element_type)
        file.write(" = [\n")

        if self.data:
            row = " "
            for item in self.data:
                cell = str(item) + ", "
                if len(row) + len(cell) > 78:
                    # current row is full: flush it and start a new one
                    file.write(row.rstrip() + "\n")
                    row = " " + cell
                else:
                    row += cell
            if row.strip():
                file.write(row.rstrip() + "\n")

        file.write("];\n\n")
|
|
|
|
|
|
def getsize(data):
    """Return the smallest integer size in bytes (1, 2 or 4) for `data`.

    `data` is an iterable of non-negative ints.  An empty iterable needs
    no storage per item and is reported as size 1, so that callers which
    accept empty arrays (such as Array.dump) do not fail.
    """
    # default=0 keeps max() from raising ValueError on empty input
    maxdata = max(data, default=0)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
|
|
|
|
|
|
def splitbins(t, trace=0):
    """Split table `t` into a two-level lookup: returns (t1, t2, shift).

    t is a sequence of ints.  The split saves space when many of the ints
    are the same.  t1 and t2 are lists of ints and shift is an int, chosen
    to minimize the combined size of t1 and t2 as measured by getsize(),
    such that for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If trace is non-zero, progress info is printed to sys.stderr; a value
    above 1 also reports every candidate shift.
    """

    def report(t1, t2, shift, nbytes):
        print("%d+%d bins at shift %d; %d bytes" % (
            len(t1), len(t2), shift, nbytes), file=sys.stderr)

    if trace:
        print("Size of original table:", len(t)*getsize(t), "bytes",
              file=sys.stderr)

    # the most we can shift the last valid index and still have something
    # left
    last = len(t) - 1
    maxshift = last.bit_length() - 1 if last > 0 else 0

    t = tuple(t)  # so slices can be dict keys
    smallest = sys.maxsize  # smallest total size so far
    for shift in range(maxshift + 1):
        width = 1 << shift
        t1, t2 = [], []
        seen = {}
        for start in range(0, len(t), width):
            chunk = t[start:start + width]
            offset = seen.get(chunk)
            if offset is None:
                # first occurrence of this bin: append it to t2
                offset = seen[chunk] = len(t2)
                t2.extend(chunk)
            t1.append(offset >> shift)
        # determine memory size
        cost = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            report(t1, t2, shift, cost)
        if cost < smallest:
            best = t1, t2, shift
            smallest = cost

    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        report(t1, t2, shift, smallest)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = (1 << shift) - 1  # low-bit mask of shift bits
        for i, expected in enumerate(t):
            assert expected == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
|
|
|
|
|
|
# Script entry point: generate the tables with tracing enabled.
if __name__ == "__main__":
    maketables(trace=1)
|