Better unicode data generation

2024-06-14 23:50:14 +02:00 · 2024-06-14 23:50:14 +02:00 · 4af5478f60
commit 4af5478f60
parent 8cda5af9fe
2 changed files with 969 additions and 859 deletions
--- a/scripts/gen-unicode-data.py
+++ b/scripts/gen-unicode-data.py
@ -1,83 +1,143 @@
-import regex
+import array
 import ctypes
 import unicodedata
-
+import requests
 class CoodepointFlags (ctypes.Structure):
    _fields_ = [  # see definition in unicode.h
        ("is_undefined",   ctypes.c_uint16, 1),
        ("is_number",      ctypes.c_uint16, 1),  # regex: \p{N}
        ("is_letter",      ctypes.c_uint16, 1),  # regex: \p{L}
        ("is_separator",   ctypes.c_uint16, 1),  # regex: \p{Z}
        ("is_accent_mark", ctypes.c_uint16, 1),  # regex: \p{M}
        ("is_punctuation", ctypes.c_uint16, 1),  # regex: \p{P}
        ("is_symbol",      ctypes.c_uint16, 1),  # regex: \p{S}
        ("is_control",     ctypes.c_uint16, 1),  # regex: \p{C}
    ]
 assert (ctypes.sizeof(CoodepointFlags) == 2)
 MAX_CODEPOINTS = 0x110000
-regex_number      = regex.compile(r'\p{N}')
+UNICODE_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
 regex_letter      = regex.compile(r'\p{L}')
 regex_separator   = regex.compile(r'\p{Z}')
 regex_accent_mark = regex.compile(r'\p{M}')
 regex_punctuation = regex.compile(r'\p{P}')
 regex_symbol      = regex.compile(r'\p{S}')
 regex_control     = regex.compile(r'\p{C}')
 regex_whitespace  = regex.compile(r'\s')
-codepoint_flags = (CoodepointFlags * MAX_CODEPOINTS)()
+
 # see https://www.unicode.org/L2/L1999/UnicodeData.html
 def unicode_data_iter():
    res = requests.get(UNICODE_DATA_URL)
    res.raise_for_status()
    data = res.content.decode()
    prev = []
    for line in data.splitlines():
        # ej: 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
        line = line.split(";")
        cpt = int(line[0], base=16)
        assert cpt < MAX_CODEPOINTS
        cpt_lower = int(line[-2] or "0", base=16)
        assert cpt_lower < MAX_CODEPOINTS
        cpt_upper = int(line[-3] or "0", base=16)
        assert cpt_upper < MAX_CODEPOINTS
        categ = line[2].strip()
        assert len(categ) == 2
        bidir = line[4].strip()
        assert len(categ) == 2
        name = line[1]
        if name.endswith(", First>"):
            prev = (cpt, cpt_lower, cpt_upper, categ, bidir)
            continue
        if name.endswith(", Last>"):
            assert prev[1:] == (0, 0, categ, bidir)
            for c in range(prev[0], cpt):
                yield (c, cpt_lower, cpt_upper, categ, bidir)
        yield (cpt, cpt_lower, cpt_upper, categ, bidir)
 # see definition in unicode.h
 CODEPOINT_FLAG_UNDEFINED   = 0x0001  #
 CODEPOINT_FLAG_NUMBER      = 0x0002  # \p{N}
 CODEPOINT_FLAG_LETTER      = 0x0004  # \p{L}
 CODEPOINT_FLAG_SEPARATOR   = 0x0008  # \p{Z}
 CODEPOINT_FLAG_MARK        = 0x0010  # \p{M}
 CODEPOINT_FLAG_PUNCTUATION = 0x0020  # \p{P}
 CODEPOINT_FLAG_SYMBOL      = 0x0040  # \p{S}
 CODEPOINT_FLAG_CONTROL     = 0x0080  # \p{C}
 UNICODE_CATEGORY_TO_FLAG = {
    "Cn": CODEPOINT_FLAG_UNDEFINED,    # Undefined
    "Cc": CODEPOINT_FLAG_CONTROL,      # Control
    "Cf": CODEPOINT_FLAG_CONTROL,      # Format
    "Co": CODEPOINT_FLAG_CONTROL,      # Private Use
    "Cs": CODEPOINT_FLAG_CONTROL,      # Surrrogate
    "Ll": CODEPOINT_FLAG_LETTER,       # Lowercase Letter
    "Lm": CODEPOINT_FLAG_LETTER,       # Modifier Letter
    "Lo": CODEPOINT_FLAG_LETTER,       # Other Letter
    "Lt": CODEPOINT_FLAG_LETTER,       # Titlecase Letter
    "Lu": CODEPOINT_FLAG_LETTER,       # Uppercase Letter
    "L&": CODEPOINT_FLAG_LETTER,       # Cased Letter
    "Mc": CODEPOINT_FLAG_MARK,         # Spacing Mark
    "Me": CODEPOINT_FLAG_MARK,         # Enclosing Mark
    "Mn": CODEPOINT_FLAG_MARK,         # Nonspacing Mark
    "Nd": CODEPOINT_FLAG_NUMBER,       # Decimal Number
    "Nl": CODEPOINT_FLAG_NUMBER,       # Letter Number
    "No": CODEPOINT_FLAG_NUMBER,       # Other Number
    "Pc": CODEPOINT_FLAG_PUNCTUATION,  # Connector Punctuation
    "Pd": CODEPOINT_FLAG_PUNCTUATION,  # Dash Punctuation
    "Pe": CODEPOINT_FLAG_PUNCTUATION,  # Close Punctuation
    "Pf": CODEPOINT_FLAG_PUNCTUATION,  # Final Punctuation
    "Pi": CODEPOINT_FLAG_PUNCTUATION,  # Initial Punctuation
    "Po": CODEPOINT_FLAG_PUNCTUATION,  # Other Punctuation
    "Ps": CODEPOINT_FLAG_PUNCTUATION,  # Open Punctuation
    "Sc": CODEPOINT_FLAG_SYMBOL,       # Currency Symbol
    "Sk": CODEPOINT_FLAG_SYMBOL,       # Modifier Symbol
    "Sm": CODEPOINT_FLAG_SYMBOL,       # Math Symbol
    "So": CODEPOINT_FLAG_SYMBOL,       # Other Symbol
    "Zl": CODEPOINT_FLAG_SEPARATOR,    # Line Separator
    "Zp": CODEPOINT_FLAG_SEPARATOR,    # Paragraph Separator
    "Zs": CODEPOINT_FLAG_SEPARATOR,    # Space Separator
 }
 codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS
 table_whitespace = []
 table_lowercase = []
 table_uppercase = []
 table_nfd = []
-for codepoint in range(MAX_CODEPOINTS):
+for (cpt, cpt_lower, cpt_upper, categ, bidir) in unicode_data_iter():
    # convert codepoint to unicode character
-    char = chr(codepoint)
+    char = chr(cpt)
-    # regex categories
+    # codepoint category flags
-    flags = codepoint_flags[codepoint]
+    codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ]
    flags.is_number      = bool(regex_number.match(char))
    flags.is_letter      = bool(regex_letter.match(char))
    flags.is_separator   = bool(regex_separator.match(char))
    flags.is_accent_mark = bool(regex_accent_mark.match(char))
    flags.is_punctuation = bool(regex_punctuation.match(char))
    flags.is_symbol      = bool(regex_symbol.match(char))
    flags.is_control     = bool(regex_control.match(char))
    flags.is_undefined   = bytes(flags)[0] == 0
    assert (not flags.is_undefined)
    # whitespaces
    if bool(regex_whitespace.match(char)):
        table_whitespace.append(codepoint)
    # lowercase conversion
-    lower = ord(char.lower()[0])
+    if cpt_lower:
-    if codepoint != lower:
+        table_lowercase.append((cpt, cpt_lower))
        table_lowercase.append((codepoint, lower))
    # uppercase conversion
-    upper = ord(char.upper()[0])
+    if cpt_upper:
-    if codepoint != upper:
+        table_uppercase.append((cpt, cpt_upper))
        table_uppercase.append((codepoint, upper))
    # NFD normalization
    norm = ord(unicodedata.normalize('NFD', char)[0])
-    if codepoint != norm:
+    if cpt != norm:
-        table_nfd.append((codepoint, norm))
+        table_nfd.append((cpt, norm))
 # whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
 table_whitespace.extend(range(0x0009, 0x000D + 1))
 table_whitespace.extend(range(0x2000, 0x200A + 1))
 table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000])
 # sort by codepoint
 table_whitespace.sort()
 table_lowercase.sort()
 table_uppercase.sort()
 table_nfd.sort()
 # group ranges with same flags
 ranges_flags = [(0, codepoint_flags[0])]  # start, flags
 for codepoint, flags in enumerate(codepoint_flags):
-    if bytes(flags) != bytes(ranges_flags[-1][1]):
+    if flags != ranges_flags[-1][1]:
        ranges_flags.append((codepoint, flags))
-ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags()))
+ranges_flags.append((MAX_CODEPOINTS, 0x0000))
 # group ranges with same nfd
@ -90,8 +150,8 @@ for codepoint, norm in table_nfd:
    ranges_nfd[-1] = (start, codepoint, norm)
-# Generate 'unicode-data.cpp'
+# Generate 'unicode-data.cpp':
-
+#   python ./scripts//gen-unicode-data.py > unicode-data.cpp
 def out(line=""):
    print(line, end='\n')  # noqa
@ -110,12 +170,12 @@ out("""\
 out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1")
 for codepoint, flags in ranges_flags:
    flags = int.from_bytes(bytes(flags), "little")
    out("{0x%06X, 0x%04X}," % (codepoint, flags))
 out("};\n")
 out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
-out(", ".join("0x%06X" % cpt for cpt in table_whitespace))
+for codepoint in table_whitespace:
    out("0x%06X," % codepoint)
 out("};\n")
 out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
--- a/unicode-data.cpp
+++ b/unicode-data.cpp