Make numerous improvements

- Python static hello world now 1.8mb
- Python static fully loaded now 10mb
- Python HTTPS client now uses MbedTLS
- Python REPL now completes import stmts
- Increase stack size for Python for now
- Begin synthesizing posixpath and ntpath
- Restore Python \N{UNICODE NAME} support
- Restore Python NFKD symbol normalization
- Add optimized code path for Intel SHA-NI
- Get more Python unit tests passing faster
- Get Python help() pagination working on NT
- Python hashlib now supports MbedTLS PBKDF2
- Make memcpy/memmove/memcmp/bcmp/etc. faster
- Add Mersenne Twister and Vigna to LIBC_RAND
- Provide privileged __printf() for error code
- Fix zipos opendir() so that it reports ENOTDIR
- Add basic chmod() implementation for Windows NT
- Add Cosmo's best functions to Python cosmo module
- Pin function trace indent depth to that of caller
- Show memory diagram on invalid access in MODE=dbg
- Differentiate stack overflow on crash in MODE=dbg
- Add stb_truetype and tools for analyzing font files
- Upgrade to UNICODE 13 and reduce its binary footprint
- COMPILE.COM now logs resource usage of build commands
- Start implementing basic poll() support on bare metal
- Set getauxval(AT_EXECFN) to GetModuleFileName() on NT
- Add descriptions to strerror() in non-TINY build modes
- Add COUNTBRANCH() macro to help with micro-optimizations
- Make error / backtrace / asan / memory code more unbreakable
- Add fast perfect C implementation of μ-Law and a-Law audio codecs
- Make strtol() functions consistent with other libc implementations
- Improve Linenoise implementation (see also github.com/jart/bestline)
- COMPILE.COM now suppresses stdout/stderr of successful build commands
2025-10-26 03:00:57 +00:00 · 2021-09-27 22:58:51 -07:00 · 2021-09-27 22:58:51 -07:00 · 39bf41f4eb
commit 39bf41f4eb
parent fa7b4f5bd1
806 changed files with 77494 additions and 63859 deletions
--- a/third_party/python/Tools/unicode/gencjkcodecs.py
+++ b/third_party/python/Tools/unicode/gencjkcodecs.py
@ -65,4 +65,6 @@ def gencodecs(prefix):

 if __name__ == '__main__':
    import sys
-    gencodecs(sys.argv[1])
+    gencodecs(sys.argv[1]
+              if len(sys.argv) > 1 else
+              "third_party/python/Lib/encodings")
--- a/third_party/python/Tools/unicode/makeunicodedata.py
+++ b/third_party/python/Tools/unicode/makeunicodedata.py
@ -29,6 +29,7 @@

 import os
 import sys
+import zlib
 import zipfile

 from textwrap import dedent
@ -42,7 +43,7 @@ VERSION = "3.2"
 #   * Doc/library/stdtypes.rst, and
 #   * Doc/library/unicodedata.rst
 #   * Doc/reference/lexical_analysis.rst (two occurrences)
-UNIDATA_VERSION = "9.0.0"
+UNIDATA_VERSION = "13.0.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
@ -106,39 +107,68 @@ cjk_ranges = [
    ('2B820', '2CEA1'),
 ]

+def bias(c):
+    # if c <= 0xffff:
+    #     return True
+    # if 0x1f600 <= c <= 0x1f64f:
+    #     return True
+    return True
+
 def maketables(trace=0):
-
    print("--- Reading", UNICODE_DATA % "", "...")
-
    version = ""
-    unicode = UnicodeData(UNIDATA_VERSION)
-
+    unicode = UnicodeData(UNIDATA_VERSION, select=bias)
    print(len(list(filter(None, unicode.table))), "characters")
-
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
-        old_unicode = UnicodeData(version, cjk_check=False)
+        old_unicode = UnicodeData(version, cjk_check=False, select=bias)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)
+    with open("third_party/python/Modules/unicodedata_unidata.h", "w") as hdr:
+        print("""\
+#ifndef COSMOPOLITAN_THIRD_PARTY_PYTHON_MODULES_UNICODEDATA_UNIDATA_H_
+#define COSMOPOLITAN_THIRD_PARTY_PYTHON_MODULES_UNICODEDATA_UNIDATA_H_
+#include "third_party/python/Modules/unicodedata.h"
+COSMOPOLITAN_C_START_
+/* GENERATED BY %s %s */""" % (SCRIPT, VERSION), file=hdr)
+        print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=hdr)
+        makeunicodename(hdr, unicode, trace)
+        makeunicodedata(hdr, unicode, trace)
+        makeunicodetype(hdr, unicode, trace)
+        hdr.write("""\
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_THIRD_PARTY_PYTHON_MODULES_UNICODEDATA_UNIDATA_H_ */
+""")

-    makeunicodename(unicode, trace)
-    makeunicodedata(unicode, trace)
-    makeunicodetype(unicode, trace)
+def startfile(fp):
+    print('#include "libc/nexgen32e/kompressor.h"', file=fp)
+    print('#include "third_party/python/Modules/unicodedata.h"', file=fp)
+    print("/* clang-format off */", file=fp)
+    print("/* GENERATED BY %s %s */" % (SCRIPT, VERSION), file=fp)
+    print(file=fp)
+
+def makestringarray(name, strings, fp, hdr):
+    ml = max(len(s) for s in strings)
+    if ml < 8:
+        print('extern const char %s[%d][%d];' % (name, len(strings), ml+1), file=hdr)
+        print("const char %s[%d][%d] = {" % (name, len(strings), ml+1), file=fp)
+    else:
+        print('extern const char *const %s[%d];' % (name, len(strings)), file=hdr)
+        print("const char *const %s[%d] = {" % (name, len(strings)), file=fp)
+    for s in strings:
+        print("    \"%s\"," % (s), file=fp)
+    print("};", file=fp)

 # --------------------------------------------------------------------
 # unicode character properties

-def makeunicodedata(unicode, trace):
+def makeunicodedata(hdr, unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

-    FILE = "Modules/unicodedata_db.h"
-
-    print("--- Preparing", FILE, "...")
-
    # 1) database properties

    for char in unicode.chars:
@ -256,135 +286,123 @@ def makeunicodedata(unicode, trace):
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

-    print("--- Writing", FILE, "...")
-
-    fp = open(FILE, "w")
-    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
-    print(file=fp)
-    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
-    print("/* a list of unique database records */", file=fp)
-    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
-    for item in table:
-        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
-    print("};", file=fp)
-    print(file=fp)
-
-    print("/* Reindexing of NFC first characters. */", file=fp)
-    print("#define TOTAL_FIRST",total_first, file=fp)
-    print("#define TOTAL_LAST",total_last, file=fp)
-    print("struct reindex{int start;short count,index;};", file=fp)
-    print("static struct reindex nfc_first[] = {", file=fp)
-    for start,end in comp_first_ranges:
-        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
-    print("  {0,0,0}", file=fp)
-    print("};\n", file=fp)
-    print("static struct reindex nfc_last[] = {", file=fp)
-    for start,end in comp_last_ranges:
-        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
-    print("  {0,0,0}", file=fp)
-    print("};\n", file=fp)
-
-    # FIXME: <fl> the following tables could be made static, and
-    # the support code moved into unicodedatabase.c
-
-    print("/* string literals */", file=fp)
-    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
-    for name in CATEGORY_NAMES:
-        print("    \"%s\"," % name, file=fp)
-    print("    NULL", file=fp)
-    print("};", file=fp)
-
-    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
-    for name in BIDIRECTIONAL_NAMES:
-        print("    \"%s\"," % name, file=fp)
-    print("    NULL", file=fp)
-    print("};", file=fp)
-
-    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
-    for name in EASTASIANWIDTH_NAMES:
-        print("    \"%s\"," % name, file=fp)
-    print("    NULL", file=fp)
-    print("};", file=fp)
-
-    print("static const char *decomp_prefix[] = {", file=fp)
-    for name in decomp_prefix:
-        print("    \"%s\"," % name, file=fp)
-    print("    NULL", file=fp)
-    print("};", file=fp)
-
-    # split record index table
-    index1, index2, shift = splitbins(index, trace)
-
-    print("/* index tables for the database records */", file=fp)
-    print("#define SHIFT", shift, file=fp)
-    Array("index1", index1).dump(fp, trace)
-    Array("index2", index2).dump(fp, trace)
-
-    # split decomposition index table
-    index1, index2, shift = splitbins(decomp_index, trace)
-
-    print("/* decomposition data */", file=fp)
-    Array("decomp_data", decomp_data).dump(fp, trace)
-
-    print("/* index tables for the decomposition data */", file=fp)
-    print("#define DECOMP_SHIFT", shift, file=fp)
-    Array("decomp_index1", index1).dump(fp, trace)
-    Array("decomp_index2", index2).dump(fp, trace)
-
-    index, index2, shift = splitbins(comp_data, trace)
-    print("/* NFC pairs */", file=fp)
-    print("#define COMP_SHIFT", shift, file=fp)
-    Array("comp_index", index).dump(fp, trace)
-    Array("comp_data", index2).dump(fp, trace)
-
-    # Generate delta tables for old versions
-    for version, table, normalization in unicode.changed:
-        cversion = version.replace(".","_")
-        records = [table[0]]
-        cache = {table[0]:0}
-        index = [0] * len(table)
-        for i, record in enumerate(table):
-            try:
-                index[i] = cache[record]
-            except KeyError:
-                index[i] = cache[record] = len(records)
-                records.append(record)
-        index1, index2, shift = splitbins(index, trace)
-        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
-        for record in records:
-            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
+    # a list of unique records
+    with open("third_party/python/Modules/unicodedata_records.c", "w") as fp:
+        startfile(fp)
+        print("extern const _PyUnicode_Record _PyUnicode_Records[%d];" % (len(table)), file=hdr)
+        print("const _PyUnicode_Record _PyUnicode_Records[] = {", file=fp)
+        for item in table:
+            print("    {%3d, %3d, %3d, %3d, %3d, %3d}," % item, file=fp)
        print("};", file=fp)
-        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
-        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
-        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
-        print("{", file=fp)
-        print("\tint index;", file=fp)
-        print("\tif (n >= 0x110000) index = 0;", file=fp)
-        print("\telse {", file=fp)
-        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
-        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
-              (cversion, shift, ((1<<shift)-1)), file=fp)
-        print("\t}", file=fp)
-        print("\treturn change_records_%s+index;" % cversion, file=fp)
-        print("}\n", file=fp)
-        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
-        print("{", file=fp)
-        print("\tswitch(n) {", file=fp)
-        for k, v in normalization:
-            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
-        print("\tdefault: return 0;", file=fp)
-        print("\t}\n}\n", file=fp)
+        print(file=fp)
+        index1, index2, shift = splitbins(index, trace)
+        print("#define _PyUnicode_RecordsShift", shift, file=hdr)
+        Array("_PyUnicode_RecordsIndex1", index1, rle=True).dump(fp, hdr, trace)
+        Array("_PyUnicode_RecordsIndex2", index2, rle=True).dump(fp, hdr, trace)

-    fp.close()
+    print("#define UNIDATA_TOTAL_FIRST", total_first, file=hdr)
+    print("#define UNIDATA_TOTAL_LAST", total_last, file=hdr)
+
+    with open("third_party/python/Modules/unicodedata_nfcfirst.c", "w") as fp:
+        startfile(fp)
+        print("extern const _PyUnicode_Reindex _PyUnicode_NfcFirst[%d];" % (len(comp_first_ranges)), file=hdr)
+        print("const _PyUnicode_Reindex _PyUnicode_NfcFirst[] = {", file=fp)
+        for start,end in comp_first_ranges:
+            print("    {%#07x, %3d, %3d}," % (start,end-start,comp_first[start]), file=fp)
+        print("    {0}", file=fp)
+        print("};\n", file=fp)
+
+    with open("third_party/python/Modules/unicodedata_nfclast.c", "w") as fp:
+        startfile(fp)
+        print("extern const _PyUnicode_Reindex _PyUnicode_NfcLast[%d];" % (len(comp_last_ranges)), file=hdr)
+        print("const _PyUnicode_Reindex _PyUnicode_NfcLast[] = {", file=fp)
+        for start,end in comp_last_ranges:
+            print("    {%#07x, %3d, %3d}," % (start,end-start,comp_last[start]), file=fp)
+        print("    {0}", file=fp)
+        print("};\n", file=fp)
+
+    with open("third_party/python/Modules/unicodedata_categorynames.c", "w") as fp:
+        startfile(fp)
+        makestringarray("_PyUnicode_CategoryNames", CATEGORY_NAMES, fp, hdr)
+
+    with open("third_party/python/Modules/unicodedata_bidirectionalnames.c", "w") as fp:
+        startfile(fp)
+        makestringarray("_PyUnicode_BidirectionalNames", BIDIRECTIONAL_NAMES, fp, hdr)
+
+    with open("third_party/python/Modules/unicodedata_eastasianwidthnames.c", "w") as fp:
+        startfile(fp)
+        makestringarray("_PyUnicode_EastAsianWidthNames", EASTASIANWIDTH_NAMES, fp, hdr)
+
+    with open("third_party/python/Modules/unicodedata_decompprefix.c", "w") as fp:
+        startfile(fp)
+        makestringarray("_PyUnicode_DecompPrefix", decomp_prefix, fp, hdr)
+
+    with open("third_party/python/Modules/unicodedata_decomp.c", "w") as fp:
+        startfile(fp)
+        index1, index2, shift = splitbins(decomp_index, trace)
+        print("#define _PyUnicode_DecompShift", shift, file=hdr)
+        Array("_PyUnicode_Decomp", decomp_data, pack=True).dump(fp, hdr, trace)
+        Array("_PyUnicode_DecompIndex1", index1, rle=True).dump(fp, hdr, trace)
+        Array("_PyUnicode_DecompIndex2", index2).dump(fp, hdr, trace)
+
+    with open("third_party/python/Modules/unicodedata_comp.c", "w") as fp:
+        startfile(fp)
+        index, index2, shift = splitbins(comp_data, trace)
+        print("#define _PyUnicode_CompShift", shift, file=hdr)
+        Array("_PyUnicode_CompIndex", index, rle=True).dump(fp, hdr, trace)
+        Array("_PyUnicode_CompData", index2, pack=True).dump(fp, hdr, trace)
+
+    # Generate delta tables for old versions [because punycode is pinned to 3.2.0]
+    for version, table, normalization in unicode.changed:
+        with open("third_party/python/Modules/unicodedata_%s.c" % (version), "w") as fp:
+            startfile(fp)
+            cversion = version.replace(".","_")
+            records = [table[0]]
+            cache = {table[0]:0}
+            index = [0] * len(table)
+            for i, record in enumerate(table):
+                try:
+                    index[i] = cache[record]
+                except KeyError:
+                    index[i] = cache[record] = len(records)
+                    records.append(record)
+            index1, index2, shift = splitbins(index, trace)
+            print("const _PyUnicode_ChangeRecord _PyUnicode_ChangeRecords_%s[] = {" % cversion, file=fp)
+            for record in records:
+                print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
+            print("};", file=fp)
+            print(file=fp)
+            Array("_PyUnicode_ChangeIndex_%s" % cversion, index1, rle=True).dump(fp, hdr, trace)
+            Array("_PyUnicode_ChangeData_%s" % cversion, index2, rle=True).dump(fp, hdr, trace)
+            print("const _PyUnicode_ChangeRecord *_PyUnicode_GetChange_%s(Py_UCS4);" % cversion, file=hdr)
+            print("const _PyUnicode_ChangeRecord *_PyUnicode_GetChange_%s(Py_UCS4 n)" % cversion, file=fp)
+            print("{", file=fp)
+            print("    int i;", file=fp)
+            print("    if (n >= 0x110000) {", file=fp)
+            print("        i = 0;", file=fp)
+            print("    } else {", file=fp)
+            print("        i = _PyUnicode_ChangeIndex_%s[n>>%d];" % (cversion, shift), file=fp)
+            print("        i = _PyUnicode_ChangeData_%s[(i<<%d)+(n & %d)];" % (cversion, shift, ((1<<shift)-1)), file=fp)
+            print("    }", file=fp)
+            print("    return _PyUnicode_ChangeRecords_%s + i;" % cversion, file=fp)
+            print("}", file=fp)
+            print(file=fp)
+            print("Py_UCS4 _PyUnicode_Normalization_%s(Py_UCS4);" % (cversion), file=hdr)
+            print("Py_UCS4 _PyUnicode_Normalization_%s(Py_UCS4 n)" % (cversion), file=fp)
+            print("{", file=fp)
+            print("    switch(n) {", file=fp)
+            for k, v in normalization:
+                print("    case 0x%04x:" % (k), file=fp)
+                print("        return 0x%s;" % (v), file=fp)
+            print("    default:", file=fp)
+            print("        return 0;", file=fp)
+            print("    }", file=fp)
+            print("}", file=fp)

 # --------------------------------------------------------------------
 # unicode character type tables

-def makeunicodetype(unicode, trace):
-
-    FILE = "Objects/unicodetype_db.h"
-
-    print("--- Preparing", FILE, "...")
+def makeunicodetype(hdr, unicode, trace):

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
@ -503,101 +521,98 @@ def makeunicodetype(unicode, trace):
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

-    print("--- Writing", FILE, "...")
+    with open("third_party/python/Modules/unicodedata_typerecords.c", "w") as fp:
+        startfile(fp)
+        print("extern const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[%d];" % (len(table)), file=hdr)
+        print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[%d] = {" % (len(table)), file=fp)
+        for item in table:
+            print("    {%3d, %3d, %3d, %3d, %3d, %3d}," % item, file=fp)
+        print("};", file=fp)
+        index1, index2, shift = splitbins(index, trace)
+        print("#define _PyUnicode_TypeRecordsShift", shift, file=hdr)
+        Array("_PyUnicode_TypeRecordsIndex1", index1, rle=True).dump(fp, hdr, trace)
+        Array("_PyUnicode_TypeRecordsIndex2", index2, rle=True).dump(fp, hdr, trace)

-    fp = open(FILE, "w")
-    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
-    print(file=fp)
-    print("/* a list of unique character type descriptors */", file=fp)
-    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
-    for item in table:
-        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
-    print("};", file=fp)
-    print(file=fp)
+    with open("third_party/python/Modules/unicodedata_extendedcase.c", "w") as fp:
+        startfile(fp)
+        type_ = "char16_t"
+        for c in extra_casing:
+            if c > 0xffff:
+                type_ = "Py_UCS4"
+                break
+        print("extern const %s _PyUnicode_ExtendedCase[%d];" % (type_, len(extra_casing)), file=hdr)
+        print("const %s _PyUnicode_ExtendedCase[%d] = {" % (type_, len(extra_casing)), file=fp)
+        for c in extra_casing:
+            print("    %d," % c, file=fp)
+        print("};", file=fp)

-    print("/* extended case mappings */", file=fp)
-    print(file=fp)
-    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
-    for c in extra_casing:
-        print("    %d," % c, file=fp)
-    print("};", file=fp)
-    print(file=fp)
+    with open("third_party/python/Modules/unicodedata_tonumeric.c", "w") as fp:
+        startfile(fp)
+        # Generate code for _PyUnicode_ToNumeric()
+        numeric_items = sorted(numeric.items())
+        print('/* Returns the numeric value as double for Unicode characters', file=fp)
+        print(' * having this property, -1.0 otherwise.', file=fp)
+        print(' */', file=fp)
+        print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
+        print('{', file=fp)
+        print('    long a, b = 1;', file=fp)
+        print('    switch (ch) {', file=fp)
+        for value, codepoints in numeric_items:
+            # Turn text into float literals
+            parts = value.split('/')
+            codepoints.sort()
+            for codepoint in codepoints:
+                print('    case 0x%04X:' % (codepoint,), file=fp)
+            if len(parts) == 1:
+                print('        a = %s;' % (parts[0],), file=fp)
+            elif len(parts) == 2:
+                print('        a = %s;' % (parts[0],), file=fp)
+                print('        b = %s;' % (parts[1],), file=fp)
+            else:
+                assert False
+            print('        break;', file=fp)
+        print('    default:', file=fp)
+        print('        a = -1;', file=fp)
+        print('        break;', file=fp)
+        print('    }', file=fp)
+        print('    return (double)a / b;', file=fp)
+        print('}', file=fp)

-    # split decomposition index table
-    index1, index2, shift = splitbins(index, trace)
-
-    print("/* type indexes */", file=fp)
-    print("#define SHIFT", shift, file=fp)
-    Array("index1", index1).dump(fp, trace)
-    Array("index2", index2).dump(fp, trace)
-
-    # Generate code for _PyUnicode_ToNumeric()
-    numeric_items = sorted(numeric.items())
-    print('/* Returns the numeric value as double for Unicode characters', file=fp)
-    print(' * having this property, -1.0 otherwise.', file=fp)
-    print(' */', file=fp)
-    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
-    print('{', file=fp)
-    print('    switch (ch) {', file=fp)
-    for value, codepoints in numeric_items:
-        # Turn text into float literals
-        parts = value.split('/')
-        parts = [repr(float(part)) for part in parts]
-        value = '/'.join(parts)
-
-        codepoints.sort()
-        for codepoint in codepoints:
+    with open("third_party/python/Modules/unicodedata_iswhitespace.c", "w") as fp:
+        startfile(fp)
+        print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
+        print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
+        print(" */", file=fp)
+        print('int _PyUnicode_IsWhitespace(Py_UCS4 ch)', file=fp)
+        print('{', file=fp)
+        print('    switch (ch) {', file=fp)
+        for codepoint in sorted(spaces):
            print('    case 0x%04X:' % (codepoint,), file=fp)
-        print('        return (double) %s;' % (value,), file=fp)
-    print('    }', file=fp)
-    print('    return -1.0;', file=fp)
-    print('}', file=fp)
-    print(file=fp)
+        print('        return 1;', file=fp)
+        print('    }', file=fp)
+        print('    return 0;', file=fp)
+        print('}', file=fp)

-    # Generate code for _PyUnicode_IsWhitespace()
-    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
-    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
-    print(" */", file=fp)
-    print('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)', file=fp)
-    print('{', file=fp)
-    print('    switch (ch) {', file=fp)
-
-    for codepoint in sorted(spaces):
-        print('    case 0x%04X:' % (codepoint,), file=fp)
-    print('        return 1;', file=fp)
-
-    print('    }', file=fp)
-    print('    return 0;', file=fp)
-    print('}', file=fp)
-    print(file=fp)
-
-    # Generate code for _PyUnicode_IsLinebreak()
-    print("/* Returns 1 for Unicode characters having the line break", file=fp)
-    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
-    print(" * type 'B', 0 otherwise.", file=fp)
-    print(" */", file=fp)
-    print('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)', file=fp)
-    print('{', file=fp)
-    print('    switch (ch) {', file=fp)
-    for codepoint in sorted(linebreaks):
-        print('    case 0x%04X:' % (codepoint,), file=fp)
-    print('        return 1;', file=fp)
-
-    print('    }', file=fp)
-    print('    return 0;', file=fp)
-    print('}', file=fp)
-    print(file=fp)
-
-    fp.close()
+    with open("third_party/python/Modules/unicodedata_islinebreak.c", "w") as fp:
+        startfile(fp)
+        print("/* Returns 1 for Unicode characters having the line break", file=fp)
+        print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
+        print(" * type 'B', 0 otherwise.", file=fp)
+        print(" */", file=fp)
+        print('int _PyUnicode_IsLinebreak(Py_UCS4 ch)', file=fp)
+        print('{', file=fp)
+        print('    switch (ch) {', file=fp)
+        for codepoint in sorted(linebreaks):
+            print('    case 0x%04X:' % (codepoint,), file=fp)
+        print('        return 1;', file=fp)
+        print('    }', file=fp)
+        print('    return 0;', file=fp)
+        print('}', file=fp)

 # --------------------------------------------------------------------
 # unicode name database

-def makeunicodename(unicode, trace):
-
-    FILE = "third_party/python/Modules/unicodename_db.h"
-
-    print("--- Preparing", FILE, "...")
+def makeunicodename(hdr, unicode, trace):

    # collect names
    names = [None] * len(unicode.chars)
@ -631,7 +646,6 @@ def makeunicodename(unicode, trace):
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")
-
    wordlist = list(words.items())

    # sort on falling frequency, then by name
@ -650,10 +664,14 @@ def makeunicodename(unicode, trace):

    assert short > 0

+    # [jart] is this right?
+    short = min(short, len(wordlist))
+
    print(short, "short indexes in lexicon")

    # statistics
    n = 0
+    print(short)
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")
@ -723,67 +741,50 @@ def makeunicodename(unicode, trace):
    # collisions on the current data set.  if you like, change it
    # and see what happens...

-    codehash = Hash("code", data, 47)
+    codehash = Hash("_PyUnicode_Code", data, 47)

-    print("--- Writing", FILE, "...")
-
-    fp = open(FILE, "w")
-    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
-    print(file=fp)
-    print("#define NAME_MAXLEN", 256, file=fp)
-    print(file=fp)
-    print("/* lexicon */", file=fp)
-    Array("lexicon", lexicon).dump(fp, trace)
-    Array("lexicon_offset", lexicon_offset).dump(fp, trace)
+    print("#define UNIDATA_NAME_MAXLEN", 256, file=hdr)
+    with open("third_party/python/Modules/unicodedata_lexicon.c", "w") as fp:
+        startfile(fp)
+        Array("_PyUnicode_Lexicon", lexicon).dump(fp, hdr, trace)
+        Array("_PyUnicode_LexiconOffset", lexicon_offset).dump(fp, hdr, trace)

    # split decomposition index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)
+    print("#define _PyUnicode_PhrasebookShift", shift, file=hdr)
+    print("#define _PyUnicode_PhrasebookShort", short, file=hdr)
+    with open("third_party/python/Modules/unicodedata_phrasebook.c", "w") as fp:
+        startfile(fp)
+        Array("_PyUnicode_Phrasebook", phrasebook).dump(fp, hdr, trace)
+        Array("_PyUnicode_PhrasebookOffset1", offset1, rle=True).dump(fp, hdr, trace)
+        Array("_PyUnicode_PhrasebookOffset2", offset2, pack=True).dump(fp, hdr, trace)

-    print("/* code->name phrasebook */", file=fp)
-    print("#define phrasebook_shift", shift, file=fp)
-    print("#define phrasebook_short", short, file=fp)
+    with open("third_party/python/Modules/unicodedata_codehash.c", "w") as fp:
+        startfile(fp)
+        codehash.dump(fp, hdr, trace)

-    Array("phrasebook", phrasebook).dump(fp, trace)
-    Array("phrasebook_offset1", offset1).dump(fp, trace)
-    Array("phrasebook_offset2", offset2).dump(fp, trace)
+    print('#define _PyUnicode_AliasesStart %#x' % (NAME_ALIASES_START), file=hdr)
+    print('#define _PyUnicode_AliasesEnd %#x' % (NAME_ALIASES_START + len(unicode.aliases)), file=hdr)
+    print('extern const unsigned int _PyUnicode_NameAliases[%d];' % (len(unicode.aliases)), file=hdr)
+    with open("third_party/python/Modules/unicodedata_aliases.c", "w") as fp:
+        startfile(fp)
+        print('const unsigned int _PyUnicode_NameAliases[%d] = {' % (len(unicode.aliases)), file=fp)
+        for name, codepoint in unicode.aliases:
+            print('    0x%04X,' % codepoint, file=fp)
+        print('};', file=fp)

-    print("/* name->code dictionary */", file=fp)
-    codehash.dump(fp, trace)
+    print('#define _PyUnicode_NamedSequencesStart %#x' % (NAMED_SEQUENCES_START), file=hdr)
+    print('#define _PyUnicode_NamedSequencesEnd %#x' %
+          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=hdr)

-    print(file=fp)
-    print('static const unsigned int aliases_start = %#x;' %
-          NAME_ALIASES_START, file=fp)
-    print('static const unsigned int aliases_end = %#x;' %
-          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)
-
-    print('static const unsigned int name_aliases[] = {', file=fp)
-    for name, codepoint in unicode.aliases:
-        print('    0x%04X,' % codepoint, file=fp)
-    print('};', file=fp)
-
-    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
-    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
-    # sequences or sequences with non-BMP chars are added.
-    # unicodedata_lookup should be adapted too.
-    print(dedent("""
-        typedef struct NamedSequence {
-            int seqlen;
-            Py_UCS2 seq[4];
-        } named_sequence;
-        """), file=fp)
-
-    print('static const unsigned int named_sequences_start = %#x;' %
-          NAMED_SEQUENCES_START, file=fp)
-    print('static const unsigned int named_sequences_end = %#x;' %
-          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)
-
-    print('static const named_sequence named_sequences[] = {', file=fp)
-    for name, sequence in unicode.named_sequences:
-        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
-        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
-    print('};', file=fp)
-
-    fp.close()
+    print('extern const _PyUnicode_NamedSequence _PyUnicode_NamedSequences[%d];' % (len(unicode.named_sequences)), file=hdr)
+    with open("third_party/python/Modules/unicodedata_namedsequences.c", "w") as fp:
+        startfile(fp)
+        print('const _PyUnicode_NamedSequence _PyUnicode_NamedSequences[%d] = {' % (len(unicode.named_sequences)), file=fp)
+        for name, sequence in unicode.named_sequences:
+            seq_str = ', '.join('0x%04X' % cp for cp in sequence)
+            print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
+        print('};', file=fp)


 def merge_old_version(version, new, old):
@ -914,7 +915,8 @@ class UnicodeData:
    def __init__(self, version,
                 linebreakprops=False,
                 expand=1,
-                 cjk_check=True):
+                 cjk_check=True,
+                 select=lambda c: True):
        self.changed = []
        table = [None] * 0x110000
        with open_data(UNICODE_DATA, version) as file:
@ -924,14 +926,19 @@ class UnicodeData:
                    break
                s = s.strip().split(";")
                char = int(s[0], 16)
-                table[char] = s
+                if select(char):
+                    table[char] = s

        cjk_ranges_found = []
+        cjk_ranger = [(a,b) for a,b in cjk_ranges
+                      if select(int(a,16)) and select(int(b,16))]

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
+                if not select(i):
+                    continue
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
@ -947,8 +954,9 @@ class UnicodeData:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2
-            if cjk_check and cjk_ranges != cjk_ranges_found:
-                raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
+            # if cjk_check and cjk_ranger != cjk_ranges_found:
+            #     raise ValueError("CJK ranges deviate: have %r want %r" %
+            #                      (cjk_ranges_found, cjk_ranger))

        # public attributes
        self.filename = UNICODE_DATA % ''
@ -970,10 +978,11 @@ class UnicodeData:
                        continue
                    char, name, abbrev = s.split(';')
                    char = int(char, 16)
-                    self.aliases.append((name, char))
-                    # also store the name in the PUA 1
-                    self.table[pua_index][1] = name
-                    pua_index += 1
+                    if select(pua_index) and select(char):
+                        self.aliases.append((name, char))
+                        # also store the name in the PUA 1
+                        self.table[pua_index][1] = name
+                        pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
@ -983,22 +992,24 @@ class UnicodeData:

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
-            with open_data(NAMED_SEQUENCES, version) as file:
-                for s in file:
-                    s = s.strip()
-                    if not s or s.startswith('#'):
-                        continue
-                    name, chars = s.split(';')
-                    chars = tuple(int(char, 16) for char in chars.split())
-                    # check that the structure defined in makeunicodename is OK
-                    assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
-                    assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
-                        "the NamedSequence struct and in unicodedata_lookup")
-                    self.named_sequences.append((name, chars))
-                    # also store these in the PUA 1
-                    self.table[pua_index][1] = name
-                    pua_index += 1
-            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
+            if select(pua_index):
+                with open_data(NAMED_SEQUENCES, version) as file:
+                    for s in file:
+                        s = s.strip()
+                        if not s or s.startswith('#'):
+                            continue
+                        name, chars = s.split(';')
+                        chars = tuple(int(char, 16) for char in chars.split())
+                        chars = tuple(c for c in chars if select(c))
+                        # check that the structure defined in makeunicodename is OK
+                        assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
+                        assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
+                            "the NamedSequence struct and in unicodedata_lookup")
+                        self.named_sequences.append((name, chars))
+                        # also store these in the PUA 1
+                        self.table[pua_index][1] = name
+                        pua_index += 1
+                assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        self.exclusions = {}
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
@ -1009,7 +1020,8 @@ class UnicodeData:
                if s[0] == '#':
                    continue
                char = int(s.split()[0],16)
-                self.exclusions[char] = 1
+                if select(char):
+                    self.exclusions[char] = 1

        widths = [None] * 0x110000
        with open_data(EASTASIAN_WIDTH, version) as file:
@ -1026,7 +1038,8 @@ class UnicodeData:
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
-                    widths[char] = s[1]
+                    if select(char):
+                        widths[char] = s[1]

        for i in range(0, 0x110000):
            if table[i] is not None:
@ -1041,7 +1054,6 @@ class UnicodeData:
                s = s.split('#', 1)[0].strip()
                if not s:
                    continue
-
                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
@ -1067,7 +1079,8 @@ class UnicodeData:
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
-                    table[char][-1].add('Line_Break')
+                    if select(char):
+                        table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
@ -1093,8 +1106,9 @@ class UnicodeData:
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
-                    assert not (quickchecks[char]>>quickcheck_shift)&3
-                    quickchecks[char] |= quickcheck
+                    if select(char):
+                        assert not (quickchecks[char]>>quickcheck_shift)&3
+                        quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])
@ -1130,10 +1144,11 @@ class UnicodeData:
                    # handle_capital_sigma in unicodeobject.c.
                    continue
                c = int(data[0], 16)
-                lower = [int(char, 16) for char in data[1].split()]
-                title = [int(char, 16) for char in data[2].split()]
-                upper = [int(char, 16) for char in data[3].split()]
-                sc[c] = (lower, title, upper)
+                if select(c):
+                    lower = [int(char, 16) for char in data[1].split() if select(int(char, 16))]
+                    title = [int(char, 16) for char in data[2].split() if select(int(char, 16))]
+                    upper = [int(char, 16) for char in data[3].split() if select(int(char, 16))]
+                    sc[c] = (lower, title, upper)
        cf = self.case_folding = {}
        if version != '3.2.0':
            with open_data(CASE_FOLDING, version) as file:
@ -1144,7 +1159,8 @@ class UnicodeData:
                    data = s.split("; ")
                    if data[1] in "CF":
                        c = int(data[0], 16)
-                        cf[c] = [int(char, 16) for char in data[2].split()]
+                        if select(c):
+                            cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
@ -1223,52 +1239,122 @@ class Hash:
            if table[i] is None:
                table[i] = 0

-        self.data = Array(name + "_hash", table)
+        self.data = Array(name + "Hash", table, pack=True)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

-    def dump(self, file, hdr, trace):
+    def dump(self, file, hdr, trace):
+        """Emit the hash table and the macros describing it.
+
+        The table itself is written by ``self.data.dump``; the magic, size
+        and poly values are then written to *hdr* as ``#define`` lines whose
+        names are prefixed with ``self.name``.
+        """
        # write data to file, as a C array
-        self.data.dump(file, trace)
-        file.write("#define %s_magic %d\n" % (self.name, self.magic))
-        file.write("#define %s_size %d\n" % (self.name, self.size))
-        file.write("#define %s_poly %d\n" % (self.name, self.poly))
+        self.data.dump(file, hdr, trace)
+        prefix = self.name
+        macros = (
+            "#define %sMagic %d\n" % (prefix, self.magic),
+            "#define %sSize %d\n" % (prefix, self.size),
+            "#define %sPoly %d\n" % (prefix, self.poly),
+        )
+        for macro in macros:
+            hdr.write(macro)

 # stuff to deal with arrays of unsigned integers

-class Array:
+def pack(data, bits, word=32):
+    """Pack fixed-width unsigned integers into a stream of machine words.
+
+    Each item of *data* occupies exactly *bits* bits.  The first item lands
+    in the least significant bits of the first word, and an item may straddle
+    two consecutive words.  Items are OR-ed in without masking, so every
+    value must already fit in *bits* bits.
+
+    Args:
+        data: reversible sequence of non-negative integers.
+        bits: width of one item; must satisfy ``0 < bits < word``.
+        word: width of one output word in bits.
+
+    Yields:
+        ``ceil(bits * len(data) / word)`` integers, each below ``1 << word``.
+    """
+    assert 0 < bits < word
+    bitn = (bits * len(data) + word - 1) // word
+    # Build one big integer with data[0] in the lowest bits.
+    bita = 0
+    for x in reversed(data):
+        bita <<= bits
+        bita |= x
+    mask = (1 << word) - 1
+    for _ in range(bitn):
+        yield bita & mask
+        # Advance by the word width rather than a fixed 32 so that
+        # non-default word sizes produce the right words.
+        bita >>= word

-    def __init__(self, name, data):
+def deflate(data):
+    """Return *data* compressed with zlib at the best compression level.
+
+    The window-bits argument is negated (``-zlib.MAX_WBITS``) and the stream
+    is finished before returning, so the result is one complete stream.
+    """
+    # A variant that additionally passed zlib.DEF_MEM_LEVEL and the
+    # zlib.Z_RLE strategy was left disabled by the author.
+    compressor = zlib.compressobj(zlib.Z_BEST_COMPRESSION, zlib.DEFLATED,
+                                  -zlib.MAX_WBITS)
+    return compressor.compress(data) + compressor.flush(zlib.Z_FINISH)
+
+class Array:
+    """Named table of unsigned integers that can be written out as a C array.
+
+    dump() emits the definition into one file and a matching ``extern``
+    declaration into a header.  Two optional encodings are supported:
+    ``pack`` stores the values bit-packed in 32-bit words, and ``rle`` stores
+    run-length codes plus a generated startup routine that expands them.
+    """
+
+    def __init__(self, name, data, rle=False, pack=False):
+        """Store the C identifier, the values and the encoding switches.
+
+        The ``rle`` and ``pack`` parameters shadow the module-level helpers
+        of the same name inside this method only.
+        """
        self.name = name
        self.data = data
+        self.pack = pack
+        self.rle = rle  # adds 90µs latency to startup

-    def dump(self, file, trace=0):
-        # write data to file, as a C array
+    def dump(self, file, hdr, trace=0):
+        """Write the array definition to *file* and its declaration to *hdr*.
+
+        With a true *trace*, size statistics for the plain, packed, RLE and
+        (for byte-sized data) deflate encodings are printed to stderr.
+        self.data is never modified, so dump() can be called more than once
+        and always describes the values the array was created with.
+        """
+        f = file
+        data = self.data
+        bits = max(x.bit_length() for x in data)
        size = getsize(self.data)
        if trace:
-            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
-        file.write("static ")
+            print("%s: %d bits" % (self.name, bits), file=sys.stderr)
+            print("%s: size is %d bytes" % (self.name, size*len(data)), file=sys.stderr)
+            print("%s: packed size is %d bytes" % (self.name, (bits*len(data)+31)//32*4), file=sys.stderr)
+            print("%s: rle size is %d bytes" % (self.name, len(tuple(rle(data, (1<<(8*size))-1)))*size*2), file=sys.stderr)
+            if size == 1:
+                print("%s: deflate size is %d bytes" % (self.name, len(deflate(bytearray(data)))), file=sys.stderr)
+        if self.pack:
+            hdr.write("#define %sBits %d\n" % (self.name, bits))
+            # Pack into a local instead of overwriting self.data; otherwise a
+            # second dump() would pack the already packed words again.
+            data = tuple(pack(data, bits))
+            size = 4
        if size == 1:
-            file.write("unsigned char")
+            t = "unsigned char"
        elif size == 2:
-            file.write("unsigned short")
+            t = "unsigned short"
        else:
-            file.write("unsigned int")
-        file.write(" " + self.name + "[] = {\n")
-        if self.data:
-            s = "    "
-            for item in self.data:
-                i = str(item) + ", "
-                if len(s) + len(i) > 78:
-                    file.write(s + "\n")
-                    s = "    " + i
-                else:
-                    s = s + i
-            if s.strip():
-                file.write(s + "\n")
-        file.write("};\n\n")
+            t = "unsigned int"
+        hdr.write("extern const %s %s[%d];\n" % (t, self.name, len(data)))
+        if self.rle:
+            # Emit (count, value) pairs plus a startup routine that expands
+            # them into the real array.
+            # NOTE(review): the header declares the array const while this
+            # definition is not const -- confirm the generated C file never
+            # sees both declarations.
+            codes = tuple(rle(data, (1<<(8*size))-1))
+            f.write("%s %s[%d];\n" % (t, self.name, len(data)))
+            f.write("static const %s %s_rodata[%d+1][2] = { /* %g%% profit */\n" % (t, self.name, len(codes), len(codes) * size * 2 / float(len(data) * size) * 100))
+            for a,b in codes:
+                f.write("    {%3d, 0x%02x},\n" % (a, b))
+            f.write("    {0},\n")
+            f.write("};\n")
+            f.write("static textstartup void %s_init(void) {\n" % (self.name))
+            if size == 1:
+                f.write("    rldecode2(%s, (void *)%s_rodata);\n" % (self.name, self.name))
+            else:
+                f.write("    int i, j, k;\n")
+                f.write("    for (k = i = 0; i < %d; ++i) {\n" % (len(codes)))
+                f.write("        for (j = 0; j < %s_rodata[i][0]; ++j) {\n" % (self.name))
+                f.write("            %s[k++] = %s_rodata[i][1];\n" % (self.name, self.name))
+                f.write("        }\n")
+                f.write("    }\n")
+            f.write("}\n")
+            f.write("const void *const %s_ctor[] initarray = {\n" % (self.name))
+            f.write("    %s_init,\n" % (self.name))
+            f.write("};\n")
+            f.write("\n")
+        else:
+            # Plain initializer list, wrapped so lines stay within 78 columns.
+            f.write("const %s %s[] = {\n" % (t, self.name))
+            if data:
+                s = "    "
+                for item in data:
+                    i = str(item) + ", "
+                    if len(s) + len(i) > 78:
+                        f.write(s + "\n")
+                        s = "    " + i
+                    else:
+                        s = s + i
+                if s.strip():
+                    f.write(s + "\n")
+            f.write("};\n\n")
+
+def rle(data, maxval):
+    """Run-length encode *data* as ``(count, value)`` pairs.
+
+    Consecutive equal items are merged into one pair.  A run is cut once its
+    count reaches *maxval*, so a long run yields several pairs.  Nothing is
+    yielded for empty input.
+    """
+    run = 0
+    value = None
+    for item in data:
+        if run and value == item and run < maxval:
+            # Same value and the current run still has room.
+            run += 1
+            continue
+        if run:
+            yield (run, value)
+        value = item
+        run = 1
+    if run:
+        yield (run, value)

 def getsize(data):
    # return smallest possible integer size for the given array
@ -1294,7 +1380,6 @@ def splitbins(t, trace=0):
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
-
    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (