Better unicode data generation
This commit is contained in:
parent
8cda5af9fe
commit
4af5478f60
2 changed files with 969 additions and 859 deletions
|
@ -1,83 +1,143 @@
|
||||||
import regex
|
import array
|
||||||
import ctypes
|
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
import requests
|
||||||
|
|
||||||
class CoodepointFlags (ctypes.Structure):
|
|
||||||
_fields_ = [ # see definition in unicode.h
|
|
||||||
("is_undefined", ctypes.c_uint16, 1),
|
|
||||||
("is_number", ctypes.c_uint16, 1), # regex: \p{N}
|
|
||||||
("is_letter", ctypes.c_uint16, 1), # regex: \p{L}
|
|
||||||
("is_separator", ctypes.c_uint16, 1), # regex: \p{Z}
|
|
||||||
("is_accent_mark", ctypes.c_uint16, 1), # regex: \p{M}
|
|
||||||
("is_punctuation", ctypes.c_uint16, 1), # regex: \p{P}
|
|
||||||
("is_symbol", ctypes.c_uint16, 1), # regex: \p{S}
|
|
||||||
("is_control", ctypes.c_uint16, 1), # regex: \p{C}
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
assert (ctypes.sizeof(CoodepointFlags) == 2)
|
|
||||||
|
|
||||||
|
|
||||||
MAX_CODEPOINTS = 0x110000
|
MAX_CODEPOINTS = 0x110000
|
||||||
|
|
||||||
regex_number = regex.compile(r'\p{N}')
|
UNICODE_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
|
||||||
regex_letter = regex.compile(r'\p{L}')
|
|
||||||
regex_separator = regex.compile(r'\p{Z}')
|
|
||||||
regex_accent_mark = regex.compile(r'\p{M}')
|
|
||||||
regex_punctuation = regex.compile(r'\p{P}')
|
|
||||||
regex_symbol = regex.compile(r'\p{S}')
|
|
||||||
regex_control = regex.compile(r'\p{C}')
|
|
||||||
regex_whitespace = regex.compile(r'\s')
|
|
||||||
|
|
||||||
codepoint_flags = (CoodepointFlags * MAX_CODEPOINTS)()
|
|
||||||
|
# see https://www.unicode.org/L2/L1999/UnicodeData.html
|
||||||
|
def unicode_data_iter():
    """Download UnicodeData.txt and yield one record per listed codepoint.

    Each yielded record is a tuple
    ``(cpt, cpt_lower, cpt_upper, categ, bidir)`` where:

    * ``cpt`` is the codepoint,
    * ``cpt_lower`` / ``cpt_upper`` are the simple lowercase / uppercase
      mappings, or 0 when the corresponding field is empty,
    * ``categ`` is the two-letter general category (e.g. "Cc"),
    * ``bidir`` is the bidirectional class abbreviation (e.g. "BN").

    Ranges that the file encodes as a ``<..., First>`` / ``<..., Last>`` pair
    of lines are expanded, so every codepoint inside such a range is yielded
    individually.

    Raises whatever ``requests`` raises on a failed download
    (``raise_for_status``) and ``AssertionError`` when a line does not have
    the expected shape.
    """
    res = requests.get(UNICODE_DATA_URL)
    res.raise_for_status()
    data = res.content.decode()

    # Fields of the pending "<..., First>" line, kept until the matching
    # "<..., Last>" line is seen.
    prev = []

    for line in data.splitlines():
        # e.g.: 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
        line = line.split(";")

        cpt = int(line[0], base=16)
        assert cpt < MAX_CODEPOINTS

        # Second-to-last field: simple lowercase mapping (may be empty).
        cpt_lower = int(line[-2] or "0", base=16)
        assert cpt_lower < MAX_CODEPOINTS

        # Third-to-last field: simple uppercase mapping (may be empty).
        cpt_upper = int(line[-3] or "0", base=16)
        assert cpt_upper < MAX_CODEPOINTS

        categ = line[2].strip()
        assert len(categ) == 2

        bidir = line[4].strip()
        # Bidirectional class abbreviations are one to three characters long
        # (e.g. "L", "BN", "NSM"). The previous check re-tested `categ` here,
        # leaving `bidir` unvalidated.
        assert len(bidir) in (1, 2, 3)

        name = line[1]
        if name.endswith(", First>"):
            # Start of a range: remember it and wait for the "Last" line.
            prev = (cpt, cpt_lower, cpt_upper, categ, bidir)
            continue
        if name.endswith(", Last>"):
            # Both ends of the range must agree and carry no case mappings.
            assert prev[1:] == (0, 0, categ, bidir)
            # Emit every codepoint from the range start up to (not including)
            # the end; the end itself is emitted by the common yield below.
            for c in range(prev[0], cpt):
                yield (c, cpt_lower, cpt_upper, categ, bidir)

        yield (cpt, cpt_lower, cpt_upper, categ, bidir)
|
||||||
|
|
||||||
|
|
||||||
|
# see definition in unicode.h
|
||||||
|
CODEPOINT_FLAG_UNDEFINED = 0x0001 #
|
||||||
|
CODEPOINT_FLAG_NUMBER = 0x0002 # \p{N}
|
||||||
|
CODEPOINT_FLAG_LETTER = 0x0004 # \p{L}
|
||||||
|
CODEPOINT_FLAG_SEPARATOR = 0x0008 # \p{Z}
|
||||||
|
CODEPOINT_FLAG_MARK = 0x0010 # \p{M}
|
||||||
|
CODEPOINT_FLAG_PUNCTUATION = 0x0020 # \p{P}
|
||||||
|
CODEPOINT_FLAG_SYMBOL = 0x0040 # \p{S}
|
||||||
|
CODEPOINT_FLAG_CONTROL = 0x0080 # \p{C}
|
||||||
|
|
||||||
|
UNICODE_CATEGORY_TO_FLAG = {
|
||||||
|
"Cn": CODEPOINT_FLAG_UNDEFINED, # Undefined
|
||||||
|
"Cc": CODEPOINT_FLAG_CONTROL, # Control
|
||||||
|
"Cf": CODEPOINT_FLAG_CONTROL, # Format
|
||||||
|
"Co": CODEPOINT_FLAG_CONTROL, # Private Use
|
||||||
|
"Cs": CODEPOINT_FLAG_CONTROL, # Surrogate
|
||||||
|
"Ll": CODEPOINT_FLAG_LETTER, # Lowercase Letter
|
||||||
|
"Lm": CODEPOINT_FLAG_LETTER, # Modifier Letter
|
||||||
|
"Lo": CODEPOINT_FLAG_LETTER, # Other Letter
|
||||||
|
"Lt": CODEPOINT_FLAG_LETTER, # Titlecase Letter
|
||||||
|
"Lu": CODEPOINT_FLAG_LETTER, # Uppercase Letter
|
||||||
|
"L&": CODEPOINT_FLAG_LETTER, # Cased Letter
|
||||||
|
"Mc": CODEPOINT_FLAG_MARK, # Spacing Mark
|
||||||
|
"Me": CODEPOINT_FLAG_MARK, # Enclosing Mark
|
||||||
|
"Mn": CODEPOINT_FLAG_MARK, # Nonspacing Mark
|
||||||
|
"Nd": CODEPOINT_FLAG_NUMBER, # Decimal Number
|
||||||
|
"Nl": CODEPOINT_FLAG_NUMBER, # Letter Number
|
||||||
|
"No": CODEPOINT_FLAG_NUMBER, # Other Number
|
||||||
|
"Pc": CODEPOINT_FLAG_PUNCTUATION, # Connector Punctuation
|
||||||
|
"Pd": CODEPOINT_FLAG_PUNCTUATION, # Dash Punctuation
|
||||||
|
"Pe": CODEPOINT_FLAG_PUNCTUATION, # Close Punctuation
|
||||||
|
"Pf": CODEPOINT_FLAG_PUNCTUATION, # Final Punctuation
|
||||||
|
"Pi": CODEPOINT_FLAG_PUNCTUATION, # Initial Punctuation
|
||||||
|
"Po": CODEPOINT_FLAG_PUNCTUATION, # Other Punctuation
|
||||||
|
"Ps": CODEPOINT_FLAG_PUNCTUATION, # Open Punctuation
|
||||||
|
"Sc": CODEPOINT_FLAG_SYMBOL, # Currency Symbol
|
||||||
|
"Sk": CODEPOINT_FLAG_SYMBOL, # Modifier Symbol
|
||||||
|
"Sm": CODEPOINT_FLAG_SYMBOL, # Math Symbol
|
||||||
|
"So": CODEPOINT_FLAG_SYMBOL, # Other Symbol
|
||||||
|
"Zl": CODEPOINT_FLAG_SEPARATOR, # Line Separator
|
||||||
|
"Zp": CODEPOINT_FLAG_SEPARATOR, # Paragraph Separator
|
||||||
|
"Zs": CODEPOINT_FLAG_SEPARATOR, # Space Separator
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS
|
||||||
table_whitespace = []
|
table_whitespace = []
|
||||||
table_lowercase = []
|
table_lowercase = []
|
||||||
table_uppercase = []
|
table_uppercase = []
|
||||||
table_nfd = []
|
table_nfd = []
|
||||||
|
|
||||||
for codepoint in range(MAX_CODEPOINTS):
|
for (cpt, cpt_lower, cpt_upper, categ, bidir) in unicode_data_iter():
|
||||||
# convert codepoint to unicode character
|
# convert codepoint to unicode character
|
||||||
char = chr(codepoint)
|
char = chr(cpt)
|
||||||
|
|
||||||
# regex categories
|
# codepoint category flags
|
||||||
flags = codepoint_flags[codepoint]
|
codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ]
|
||||||
flags.is_number = bool(regex_number.match(char))
|
|
||||||
flags.is_letter = bool(regex_letter.match(char))
|
|
||||||
flags.is_separator = bool(regex_separator.match(char))
|
|
||||||
flags.is_accent_mark = bool(regex_accent_mark.match(char))
|
|
||||||
flags.is_punctuation = bool(regex_punctuation.match(char))
|
|
||||||
flags.is_symbol = bool(regex_symbol.match(char))
|
|
||||||
flags.is_control = bool(regex_control.match(char))
|
|
||||||
flags.is_undefined = bytes(flags)[0] == 0
|
|
||||||
assert (not flags.is_undefined)
|
|
||||||
|
|
||||||
# whitespaces
|
|
||||||
if bool(regex_whitespace.match(char)):
|
|
||||||
table_whitespace.append(codepoint)
|
|
||||||
|
|
||||||
# lowercase conversion
|
# lowercase conversion
|
||||||
lower = ord(char.lower()[0])
|
if cpt_lower:
|
||||||
if codepoint != lower:
|
table_lowercase.append((cpt, cpt_lower))
|
||||||
table_lowercase.append((codepoint, lower))
|
|
||||||
|
|
||||||
# uppercase conversion
|
# uppercase conversion
|
||||||
upper = ord(char.upper()[0])
|
if cpt_upper:
|
||||||
if codepoint != upper:
|
table_uppercase.append((cpt, cpt_upper))
|
||||||
table_uppercase.append((codepoint, upper))
|
|
||||||
|
|
||||||
# NFD normalization
|
# NFD normalization
|
||||||
norm = ord(unicodedata.normalize('NFD', char)[0])
|
norm = ord(unicodedata.normalize('NFD', char)[0])
|
||||||
if codepoint != norm:
|
if cpt != norm:
|
||||||
table_nfd.append((codepoint, norm))
|
table_nfd.append((cpt, norm))
|
||||||
|
|
||||||
|
|
||||||
|
# whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
|
||||||
|
table_whitespace.extend(range(0x0009, 0x000D + 1))
|
||||||
|
table_whitespace.extend(range(0x2000, 0x200A + 1))
|
||||||
|
table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000])
|
||||||
|
|
||||||
|
|
||||||
|
# sort by codepoint
|
||||||
|
table_whitespace.sort()
|
||||||
|
table_lowercase.sort()
|
||||||
|
table_uppercase.sort()
|
||||||
|
table_nfd.sort()
|
||||||
|
|
||||||
|
|
||||||
# group ranges with same flags
|
# group ranges with same flags
|
||||||
ranges_flags = [(0, codepoint_flags[0])] # start, flags
|
ranges_flags = [(0, codepoint_flags[0])] # start, flags
|
||||||
for codepoint, flags in enumerate(codepoint_flags):
|
for codepoint, flags in enumerate(codepoint_flags):
|
||||||
if bytes(flags) != bytes(ranges_flags[-1][1]):
|
if flags != ranges_flags[-1][1]:
|
||||||
ranges_flags.append((codepoint, flags))
|
ranges_flags.append((codepoint, flags))
|
||||||
ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags()))
|
ranges_flags.append((MAX_CODEPOINTS, 0x0000))
|
||||||
|
|
||||||
|
|
||||||
# group ranges with same nfd
|
# group ranges with same nfd
|
||||||
|
@ -90,8 +150,8 @@ for codepoint, norm in table_nfd:
|
||||||
ranges_nfd[-1] = (start, codepoint, norm)
|
ranges_nfd[-1] = (start, codepoint, norm)
|
||||||
|
|
||||||
|
|
||||||
# Generate 'unicode-data.cpp'
|
# Generate 'unicode-data.cpp':
|
||||||
|
# python ./scripts/gen-unicode-data.py > unicode-data.cpp
|
||||||
|
|
||||||
def out(line=""):
|
def out(line=""):
|
||||||
print(line, end='\n') # noqa
|
print(line, end='\n') # noqa
|
||||||
|
@ -110,12 +170,12 @@ out("""\
|
||||||
|
|
||||||
out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1")
|
out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1")
|
||||||
for codepoint, flags in ranges_flags:
|
for codepoint, flags in ranges_flags:
|
||||||
flags = int.from_bytes(bytes(flags), "little")
|
|
||||||
out("{0x%06X, 0x%04X}," % (codepoint, flags))
|
out("{0x%06X, 0x%04X}," % (codepoint, flags))
|
||||||
out("};\n")
|
out("};\n")
|
||||||
|
|
||||||
out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
|
out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
|
||||||
out(", ".join("0x%06X" % cpt for cpt in table_whitespace))
|
for codepoint in table_whitespace:
|
||||||
|
out("0x%06X," % codepoint)
|
||||||
out("};\n")
|
out("};\n")
|
||||||
|
|
||||||
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
|
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
|
||||||
|
|
1652
unicode-data.cpp
1652
unicode-data.cpp
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue