Make numerous improvements

- Python static hello world now 1.8MB
- Python static fully loaded now 10MB
- Python HTTPS client now uses MbedTLS
- Python REPL now completes import stmts
- Increase stack size for Python for now
- Begin synthesizing posixpath and ntpath
- Restore Python \N{UNICODE NAME} support
- Restore Python NFKD symbol normalization
- Add optimized code path for Intel SHA-NI
- Get more Python unit tests passing faster
- Get Python help() pagination working on NT
- Python hashlib now supports MbedTLS PBKDF2
- Make memcpy/memmove/memcmp/bcmp/etc. faster
- Add Mersenne Twister and Vigna to LIBC_RAND
- Provide privileged __printf() for error code
- Fix zipos opendir() so that it reports ENOTDIR
- Add basic chmod() implementation for Windows NT
- Add Cosmo's best functions to Python cosmo module
- Pin function trace indent depth to that of caller
- Show memory diagram on invalid access in MODE=dbg
- Differentiate stack overflow on crash in MODE=dbg
- Add stb_truetype and tools for analyzing font files
- Upgrade to UNICODE 13 and reduce its binary footprint
- COMPILE.COM now logs resource usage of build commands
- Start implementing basic poll() support on bare metal
- Set getauxval(AT_EXECFN) to GetModuleFileName() on NT
- Add descriptions to strerror() in non-TINY build modes
- Add COUNTBRANCH() macro to help with micro-optimizations
- Make error / backtrace / asan / memory code more unbreakable
- Add fast perfect C implementation of μ-Law and A-Law audio codecs (sketch below)
- Make strtol() functions consistent with other libc implementations
- Improve Linenoise implementation (see also github.com/jart/bestline)
- COMPILE.COM now suppresses stdout/stderr of successful build commands
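
The μ-Law codec item above lands as C in the tree; purely as a reference for the algorithm itself, here is a minimal Python sketch of G.711 μ-Law encoding using the classic Sun g711.c constants. It is an illustration of the technique, not the commit's code, and the names are made up.

BIAS = 0x84    # standard G.711 encoder bias
CLIP = 32635   # clip level for 16-bit signed input

def ulaw_encode(sample: int) -> int:
    """Encode one signed 16-bit PCM sample to an 8-bit mu-law byte."""
    sign = 0x80 if sample < 0 else 0x00
    if sample < 0:
        sample = -sample
    sample = min(sample, CLIP) + BIAS
    # exponent = position of the highest set bit among bits 14..7
    exponent, mask = 7, 0x4000
    while exponent > 0 and not (sample & mask):
        exponent -= 1
        mask >>= 1
    mantissa = (sample >> (exponent + 3)) & 0x0F
    return ~(sign | (exponent << 4) | mantissa) & 0xFF
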
Justine Tunney 2021-09-27 22:58:51 -07:00
parent fa7b4f5bd1
commit 39bf41f4eb
806 changed files with 77494 additions and 63859 deletions

@@ -36,6 +36,9 @@ from __future__ import print_function
import time, sys
if __name__ == 'PYOBJ.COM':
import resource
#
# Note: Please keep this module compatible to Python 1.5.2.
#

@@ -65,4 +65,6 @@ def gencodecs(prefix):
if __name__ == '__main__':
import sys
gencodecs(sys.argv[1])
gencodecs(sys.argv[1]
if len(sys.argv) > 1 else
"third_party/python/Lib/encodings")

@@ -29,6 +29,7 @@
import os
import sys
import zlib
import zipfile
from textwrap import dedent
@@ -42,7 +43,7 @@ VERSION = "3.2"
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "9.0.0"
UNIDATA_VERSION = "13.0.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
@@ -106,39 +107,68 @@ cjk_ranges = [
('2B820', '2CEA1'),
]
def bias(c):
# if c <= 0xffff:
# return True
# if 0x1f600 <= c <= 0x1f64f:
# return True
return True
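
For context, bias() is the select predicate threaded through UnicodeData below; as committed it keeps every code point, but the commented-out lines above sketch the intended pruning. Re-enabled, the filter would read:

def bias(c):
    # keep the BMP plus the Emoticons block, drop everything else
    return c <= 0xffff or 0x1f600 <= c <= 0x1f64f

Every loader below guards table writes with "if select(char):", so pruned code points simply never enter the generated tables.
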
def maketables(trace=0):
print("--- Reading", UNICODE_DATA % "", "...")
version = ""
unicode = UnicodeData(UNIDATA_VERSION)
unicode = UnicodeData(UNIDATA_VERSION, select=bias)
print(len(list(filter(None, unicode.table))), "characters")
for version in old_versions:
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
old_unicode = UnicodeData(version, cjk_check=False)
old_unicode = UnicodeData(version, cjk_check=False, select=bias)
print(len(list(filter(None, old_unicode.table))), "characters")
merge_old_version(version, unicode, old_unicode)
with open("third_party/python/Modules/unicodedata_unidata.h", "w") as hdr:
print("""\
#ifndef COSMOPOLITAN_THIRD_PARTY_PYTHON_MODULES_UNICODEDATA_UNIDATA_H_
#define COSMOPOLITAN_THIRD_PARTY_PYTHON_MODULES_UNICODEDATA_UNIDATA_H_
#include "third_party/python/Modules/unicodedata.h"
COSMOPOLITAN_C_START_
/* GENERATED BY %s %s */""" % (SCRIPT, VERSION), file=hdr)
print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=hdr)
makeunicodename(hdr, unicode, trace)
makeunicodedata(hdr, unicode, trace)
makeunicodetype(hdr, unicode, trace)
hdr.write("""\
COSMOPOLITAN_C_END_
#endif /* COSMOPOLITAN_THIRD_PARTY_PYTHON_MODULES_UNICODEDATA_UNIDATA_H_ */
""")
makeunicodename(unicode, trace)
makeunicodedata(unicode, trace)
makeunicodetype(unicode, trace)
def startfile(fp):
print('#include "libc/nexgen32e/kompressor.h"', file=fp)
print('#include "third_party/python/Modules/unicodedata.h"', file=fp)
print("/* clang-format off */", file=fp)
print("/* GENERATED BY %s %s */" % (SCRIPT, VERSION), file=fp)
print(file=fp)
def makestringarray(name, strings, fp, hdr):
ml = max(len(s) for s in strings)
if ml < 8:
print('extern const char %s[%d][%d];' % (name, len(strings), ml+1), file=hdr)
print("const char %s[%d][%d] = {" % (name, len(strings), ml+1), file=fp)
else:
print('extern const char *const %s[%d];' % (name, len(strings)), file=hdr)
print("const char *const %s[%d] = {" % (name, len(strings)), file=fp)
for s in strings:
print(" \"%s\"," % (s), file=fp)
print("};", file=fp)
# --------------------------------------------------------------------
# unicode character properties
def makeunicodedata(unicode, trace):
def makeunicodedata(hdr, unicode, trace):
dummy = (0, 0, 0, 0, 0, 0)
table = [dummy]
cache = {0: dummy}
index = [0] * len(unicode.chars)
FILE = "Modules/unicodedata_db.h"
print("--- Preparing", FILE, "...")
# 1) database properties
for char in unicode.chars:
@@ -256,135 +286,123 @@ def makeunicodedata(unicode, trace):
print(total_last, "last characters in NFC")
print(len(comp_pairs), "NFC pairs")
print("--- Writing", FILE, "...")
fp = open(FILE, "w")
print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
print(file=fp)
print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
print("/* a list of unique database records */", file=fp)
print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
for item in table:
print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
print("};", file=fp)
print(file=fp)
print("/* Reindexing of NFC first characters. */", file=fp)
print("#define TOTAL_FIRST",total_first, file=fp)
print("#define TOTAL_LAST",total_last, file=fp)
print("struct reindex{int start;short count,index;};", file=fp)
print("static struct reindex nfc_first[] = {", file=fp)
for start,end in comp_first_ranges:
print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
print(" {0,0,0}", file=fp)
print("};\n", file=fp)
print("static struct reindex nfc_last[] = {", file=fp)
for start,end in comp_last_ranges:
print(" { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
print(" {0,0,0}", file=fp)
print("};\n", file=fp)
# FIXME: <fl> the following tables could be made static, and
# the support code moved into unicodedatabase.c
print("/* string literals */", file=fp)
print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
for name in CATEGORY_NAMES:
print(" \"%s\"," % name, file=fp)
print(" NULL", file=fp)
print("};", file=fp)
print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
for name in BIDIRECTIONAL_NAMES:
print(" \"%s\"," % name, file=fp)
print(" NULL", file=fp)
print("};", file=fp)
print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
for name in EASTASIANWIDTH_NAMES:
print(" \"%s\"," % name, file=fp)
print(" NULL", file=fp)
print("};", file=fp)
print("static const char *decomp_prefix[] = {", file=fp)
for name in decomp_prefix:
print(" \"%s\"," % name, file=fp)
print(" NULL", file=fp)
print("};", file=fp)
# split record index table
index1, index2, shift = splitbins(index, trace)
print("/* index tables for the database records */", file=fp)
print("#define SHIFT", shift, file=fp)
Array("index1", index1).dump(fp, trace)
Array("index2", index2).dump(fp, trace)
# split decomposition index table
index1, index2, shift = splitbins(decomp_index, trace)
print("/* decomposition data */", file=fp)
Array("decomp_data", decomp_data).dump(fp, trace)
print("/* index tables for the decomposition data */", file=fp)
print("#define DECOMP_SHIFT", shift, file=fp)
Array("decomp_index1", index1).dump(fp, trace)
Array("decomp_index2", index2).dump(fp, trace)
index, index2, shift = splitbins(comp_data, trace)
print("/* NFC pairs */", file=fp)
print("#define COMP_SHIFT", shift, file=fp)
Array("comp_index", index).dump(fp, trace)
Array("comp_data", index2).dump(fp, trace)
# Generate delta tables for old versions
for version, table, normalization in unicode.changed:
cversion = version.replace(".","_")
records = [table[0]]
cache = {table[0]:0}
index = [0] * len(table)
for i, record in enumerate(table):
try:
index[i] = cache[record]
except KeyError:
index[i] = cache[record] = len(records)
records.append(record)
index1, index2, shift = splitbins(index, trace)
print("static const change_record change_records_%s[] = {" % cversion, file=fp)
for record in records:
print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
# a list of unique records
with open("third_party/python/Modules/unicodedata_records.c", "w") as fp:
startfile(fp)
print("extern const _PyUnicode_Record _PyUnicode_Records[%d];" % (len(table)), file=hdr)
print("const _PyUnicode_Record _PyUnicode_Records[] = {", file=fp)
for item in table:
print(" {%3d, %3d, %3d, %3d, %3d, %3d}," % item, file=fp)
print("};", file=fp)
Array("changes_%s_index" % cversion, index1).dump(fp, trace)
Array("changes_%s_data" % cversion, index2).dump(fp, trace)
print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
print("{", file=fp)
print("\tint index;", file=fp)
print("\tif (n >= 0x110000) index = 0;", file=fp)
print("\telse {", file=fp)
print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
(cversion, shift, ((1<<shift)-1)), file=fp)
print("\t}", file=fp)
print("\treturn change_records_%s+index;" % cversion, file=fp)
print("}\n", file=fp)
print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
print("{", file=fp)
print("\tswitch(n) {", file=fp)
for k, v in normalization:
print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
print("\tdefault: return 0;", file=fp)
print("\t}\n}\n", file=fp)
print(file=fp)
index1, index2, shift = splitbins(index, trace)
print("#define _PyUnicode_RecordsShift", shift, file=hdr)
Array("_PyUnicode_RecordsIndex1", index1, rle=True).dump(fp, hdr, trace)
Array("_PyUnicode_RecordsIndex2", index2, rle=True).dump(fp, hdr, trace)
fp.close()
print("#define UNIDATA_TOTAL_FIRST", total_first, file=hdr)
print("#define UNIDATA_TOTAL_LAST", total_last, file=hdr)
with open("third_party/python/Modules/unicodedata_nfcfirst.c", "w") as fp:
startfile(fp)
print("extern const _PyUnicode_Reindex _PyUnicode_NfcFirst[%d];" % (len(comp_first_ranges)), file=hdr)
print("const _PyUnicode_Reindex _PyUnicode_NfcFirst[] = {", file=fp)
for start,end in comp_first_ranges:
print(" {%#07x, %3d, %3d}," % (start,end-start,comp_first[start]), file=fp)
print(" {0}", file=fp)
print("};\n", file=fp)
with open("third_party/python/Modules/unicodedata_nfclast.c", "w") as fp:
startfile(fp)
print("extern const _PyUnicode_Reindex _PyUnicode_NfcLast[%d];" % (len(comp_last_ranges)), file=hdr)
print("const _PyUnicode_Reindex _PyUnicode_NfcLast[] = {", file=fp)
for start,end in comp_last_ranges:
print(" {%#07x, %3d, %3d}," % (start,end-start,comp_last[start]), file=fp)
print(" {0}", file=fp)
print("};\n", file=fp)
with open("third_party/python/Modules/unicodedata_categorynames.c", "w") as fp:
startfile(fp)
makestringarray("_PyUnicode_CategoryNames", CATEGORY_NAMES, fp, hdr)
with open("third_party/python/Modules/unicodedata_bidirectionalnames.c", "w") as fp:
startfile(fp)
makestringarray("_PyUnicode_BidirectionalNames", BIDIRECTIONAL_NAMES, fp, hdr)
with open("third_party/python/Modules/unicodedata_eastasianwidthnames.c", "w") as fp:
startfile(fp)
makestringarray("_PyUnicode_EastAsianWidthNames", EASTASIANWIDTH_NAMES, fp, hdr)
with open("third_party/python/Modules/unicodedata_decompprefix.c", "w") as fp:
startfile(fp)
makestringarray("_PyUnicode_DecompPrefix", decomp_prefix, fp, hdr)
with open("third_party/python/Modules/unicodedata_decomp.c", "w") as fp:
startfile(fp)
index1, index2, shift = splitbins(decomp_index, trace)
print("#define _PyUnicode_DecompShift", shift, file=hdr)
Array("_PyUnicode_Decomp", decomp_data, pack=True).dump(fp, hdr, trace)
Array("_PyUnicode_DecompIndex1", index1, rle=True).dump(fp, hdr, trace)
Array("_PyUnicode_DecompIndex2", index2).dump(fp, hdr, trace)
with open("third_party/python/Modules/unicodedata_comp.c", "w") as fp:
startfile(fp)
index, index2, shift = splitbins(comp_data, trace)
print("#define _PyUnicode_CompShift", shift, file=hdr)
Array("_PyUnicode_CompIndex", index, rle=True).dump(fp, hdr, trace)
Array("_PyUnicode_CompData", index2, pack=True).dump(fp, hdr, trace)
# Generate delta tables for old versions [because punycode is pinned to 3.2.0]
for version, table, normalization in unicode.changed:
with open("third_party/python/Modules/unicodedata_%s.c" % (version), "w") as fp:
startfile(fp)
cversion = version.replace(".","_")
records = [table[0]]
cache = {table[0]:0}
index = [0] * len(table)
for i, record in enumerate(table):
try:
index[i] = cache[record]
except KeyError:
index[i] = cache[record] = len(records)
records.append(record)
index1, index2, shift = splitbins(index, trace)
print("const _PyUnicode_ChangeRecord _PyUnicode_ChangeRecords_%s[] = {" % cversion, file=fp)
for record in records:
print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
print("};", file=fp)
print(file=fp)
Array("_PyUnicode_ChangeIndex_%s" % cversion, index1, rle=True).dump(fp, hdr, trace)
Array("_PyUnicode_ChangeData_%s" % cversion, index2, rle=True).dump(fp, hdr, trace)
print("const _PyUnicode_ChangeRecord *_PyUnicode_GetChange_%s(Py_UCS4);" % cversion, file=hdr)
print("const _PyUnicode_ChangeRecord *_PyUnicode_GetChange_%s(Py_UCS4 n)" % cversion, file=fp)
print("{", file=fp)
print(" int i;", file=fp)
print(" if (n >= 0x110000) {", file=fp)
print(" i = 0;", file=fp)
print(" } else {", file=fp)
print(" i = _PyUnicode_ChangeIndex_%s[n>>%d];" % (cversion, shift), file=fp)
print(" i = _PyUnicode_ChangeData_%s[(i<<%d)+(n & %d)];" % (cversion, shift, ((1<<shift)-1)), file=fp)
print(" }", file=fp)
print(" return _PyUnicode_ChangeRecords_%s + i;" % cversion, file=fp)
print("}", file=fp)
print(file=fp)
print("Py_UCS4 _PyUnicode_Normalization_%s(Py_UCS4);" % (cversion), file=hdr)
print("Py_UCS4 _PyUnicode_Normalization_%s(Py_UCS4 n)" % (cversion), file=fp)
print("{", file=fp)
print(" switch(n) {", file=fp)
for k, v in normalization:
print(" case 0x%04x:" % (k), file=fp)
print(" return 0x%s;" % (v), file=fp)
print(" default:", file=fp)
print(" return 0;", file=fp)
print(" }", file=fp)
print("}", file=fp)
# --------------------------------------------------------------------
# unicode character type tables
def makeunicodetype(unicode, trace):
FILE = "Objects/unicodetype_db.h"
print("--- Preparing", FILE, "...")
def makeunicodetype(hdr, unicode, trace):
# extract unicode types
dummy = (0, 0, 0, 0, 0, 0)
@@ -503,101 +521,98 @@ def makeunicodetype(unicode, trace):
print(len(linebreaks), "linebreak code points")
print(len(extra_casing), "extended case array")
print("--- Writing", FILE, "...")
with open("third_party/python/Modules/unicodedata_typerecords.c", "w") as fp:
startfile(fp)
print("extern const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[%d];" % (len(table)), file=hdr)
print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[%d] = {" % (len(table)), file=fp)
for item in table:
print(" {%3d, %3d, %3d, %3d, %3d, %3d}," % item, file=fp)
print("};", file=fp)
index1, index2, shift = splitbins(index, trace)
print("#define _PyUnicode_TypeRecordsShift", shift, file=hdr)
Array("_PyUnicode_TypeRecordsIndex1", index1, rle=True).dump(fp, hdr, trace)
Array("_PyUnicode_TypeRecordsIndex2", index2, rle=True).dump(fp, hdr, trace)
fp = open(FILE, "w")
print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
print(file=fp)
print("/* a list of unique character type descriptors */", file=fp)
print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
for item in table:
print(" {%d, %d, %d, %d, %d, %d}," % item, file=fp)
print("};", file=fp)
print(file=fp)
with open("third_party/python/Modules/unicodedata_extendedcase.c", "w") as fp:
startfile(fp)
type_ = "char16_t"
for c in extra_casing:
if c > 0xffff:
type_ = "Py_UCS4"
break
print("extern const %s _PyUnicode_ExtendedCase[%d];" % (type_, len(extra_casing)), file=hdr)
print("const %s _PyUnicode_ExtendedCase[%d] = {" % (type_, len(extra_casing)), file=fp)
for c in extra_casing:
print(" %d," % c, file=fp)
print("};", file=fp)
print("/* extended case mappings */", file=fp)
print(file=fp)
print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
for c in extra_casing:
print(" %d," % c, file=fp)
print("};", file=fp)
print(file=fp)
with open("third_party/python/Modules/unicodedata_tonumeric.c", "w") as fp:
startfile(fp)
# Generate code for _PyUnicode_ToNumeric()
numeric_items = sorted(numeric.items())
print('/* Returns the numeric value as double for Unicode characters', file=fp)
print(' * having this property, -1.0 otherwise.', file=fp)
print(' */', file=fp)
print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
print('{', file=fp)
print(' long a, b = 1;', file=fp)
print(' switch (ch) {', file=fp)
for value, codepoints in numeric_items:
# Turn text into float literals
parts = value.split('/')
codepoints.sort()
for codepoint in codepoints:
print(' case 0x%04X:' % (codepoint,), file=fp)
if len(parts) == 1:
print(' a = %s;' % (parts[0],), file=fp)
elif len(parts) == 2:
print(' a = %s;' % (parts[0],), file=fp)
print(' b = %s;' % (parts[1],), file=fp)
else:
assert False
print(' break;', file=fp)
print(' default:', file=fp)
print(' a = -1;', file=fp)
print(' break;', file=fp)
print(' }', file=fp)
print(' return (double)a / b;', file=fp)
print('}', file=fp)
# split decomposition index table
index1, index2, shift = splitbins(index, trace)
print("/* type indexes */", file=fp)
print("#define SHIFT", shift, file=fp)
Array("index1", index1).dump(fp, trace)
Array("index2", index2).dump(fp, trace)
# Generate code for _PyUnicode_ToNumeric()
numeric_items = sorted(numeric.items())
print('/* Returns the numeric value as double for Unicode characters', file=fp)
print(' * having this property, -1.0 otherwise.', file=fp)
print(' */', file=fp)
print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
print('{', file=fp)
print(' switch (ch) {', file=fp)
for value, codepoints in numeric_items:
# Turn text into float literals
parts = value.split('/')
parts = [repr(float(part)) for part in parts]
value = '/'.join(parts)
codepoints.sort()
for codepoint in codepoints:
with open("third_party/python/Modules/unicodedata_iswhitespace.c", "w") as fp:
startfile(fp)
print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
print(" */", file=fp)
print('int _PyUnicode_IsWhitespace(Py_UCS4 ch)', file=fp)
print('{', file=fp)
print(' switch (ch) {', file=fp)
for codepoint in sorted(spaces):
print(' case 0x%04X:' % (codepoint,), file=fp)
print(' return (double) %s;' % (value,), file=fp)
print(' }', file=fp)
print(' return -1.0;', file=fp)
print('}', file=fp)
print(file=fp)
print(' return 1;', file=fp)
print(' }', file=fp)
print(' return 0;', file=fp)
print('}', file=fp)
# Generate code for _PyUnicode_IsWhitespace()
print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
print(" */", file=fp)
print('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)', file=fp)
print('{', file=fp)
print(' switch (ch) {', file=fp)
for codepoint in sorted(spaces):
print(' case 0x%04X:' % (codepoint,), file=fp)
print(' return 1;', file=fp)
print(' }', file=fp)
print(' return 0;', file=fp)
print('}', file=fp)
print(file=fp)
# Generate code for _PyUnicode_IsLinebreak()
print("/* Returns 1 for Unicode characters having the line break", file=fp)
print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
print(" * type 'B', 0 otherwise.", file=fp)
print(" */", file=fp)
print('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)', file=fp)
print('{', file=fp)
print(' switch (ch) {', file=fp)
for codepoint in sorted(linebreaks):
print(' case 0x%04X:' % (codepoint,), file=fp)
print(' return 1;', file=fp)
print(' }', file=fp)
print(' return 0;', file=fp)
print('}', file=fp)
print(file=fp)
fp.close()
with open("third_party/python/Modules/unicodedata_islinebreak.c", "w") as fp:
startfile(fp)
print("/* Returns 1 for Unicode characters having the line break", file=fp)
print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
print(" * type 'B', 0 otherwise.", file=fp)
print(" */", file=fp)
print('int _PyUnicode_IsLinebreak(Py_UCS4 ch)', file=fp)
print('{', file=fp)
print(' switch (ch) {', file=fp)
for codepoint in sorted(linebreaks):
print(' case 0x%04X:' % (codepoint,), file=fp)
print(' return 1;', file=fp)
print(' }', file=fp)
print(' return 0;', file=fp)
print('}', file=fp)
# --------------------------------------------------------------------
# unicode name database
def makeunicodename(unicode, trace):
FILE = "third_party/python/Modules/unicodename_db.h"
print("--- Preparing", FILE, "...")
def makeunicodename(hdr, unicode, trace):
# collect names
names = [None] * len(unicode.chars)
@@ -631,7 +646,6 @@ def makeunicodename(unicode, trace):
words[w] = [len(words)]
print(n, "words in text;", b, "bytes")
wordlist = list(words.items())
# sort on falling frequency, then by name
@@ -650,10 +664,14 @@ def makeunicodename(unicode, trace):
assert short > 0
# [jart] is this right?
short = min(short, len(wordlist))
print(short, "short indexes in lexicon")
# statistics
n = 0
print(short)
for i in range(short):
n = n + len(wordlist[i][1])
print(n, "short indexes in phrasebook")
@@ -723,67 +741,50 @@ def makeunicodename(unicode, trace):
# collisions on the current data set. if you like, change it
# and see what happens...
codehash = Hash("code", data, 47)
codehash = Hash("_PyUnicode_Code", data, 47)
print("--- Writing", FILE, "...")
fp = open(FILE, "w")
print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
print(file=fp)
print("#define NAME_MAXLEN", 256, file=fp)
print(file=fp)
print("/* lexicon */", file=fp)
Array("lexicon", lexicon).dump(fp, trace)
Array("lexicon_offset", lexicon_offset).dump(fp, trace)
print("#define UNIDATA_NAME_MAXLEN", 256, file=hdr)
with open("third_party/python/Modules/unicodedata_lexicon.c", "w") as fp:
startfile(fp)
Array("_PyUnicode_Lexicon", lexicon).dump(fp, hdr, trace)
Array("_PyUnicode_LexiconOffset", lexicon_offset).dump(fp, hdr, trace)
# split decomposition index table
offset1, offset2, shift = splitbins(phrasebook_offset, trace)
print("#define _PyUnicode_PhrasebookShift", shift, file=hdr)
print("#define _PyUnicode_PhrasebookShort", short, file=hdr)
with open("third_party/python/Modules/unicodedata_phrasebook.c", "w") as fp:
startfile(fp)
Array("_PyUnicode_Phrasebook", phrasebook).dump(fp, hdr, trace)
Array("_PyUnicode_PhrasebookOffset1", offset1, rle=True).dump(fp, hdr, trace)
Array("_PyUnicode_PhrasebookOffset2", offset2, pack=True).dump(fp, hdr, trace)
print("/* code->name phrasebook */", file=fp)
print("#define phrasebook_shift", shift, file=fp)
print("#define phrasebook_short", short, file=fp)
with open("third_party/python/Modules/unicodedata_codehash.c", "w") as fp:
startfile(fp)
codehash.dump(fp, hdr, trace)
Array("phrasebook", phrasebook).dump(fp, trace)
Array("phrasebook_offset1", offset1).dump(fp, trace)
Array("phrasebook_offset2", offset2).dump(fp, trace)
print('#define _PyUnicode_AliasesStart %#x' % (NAME_ALIASES_START), file=hdr)
print('#define _PyUnicode_AliasesEnd %#x' % (NAME_ALIASES_START + len(unicode.aliases)), file=hdr)
print('extern const unsigned int _PyUnicode_NameAliases[%d];' % (len(unicode.aliases)), file=hdr)
with open("third_party/python/Modules/unicodedata_aliases.c", "w") as fp:
startfile(fp)
print('const unsigned int _PyUnicode_NameAliases[%d] = {' % (len(unicode.aliases)), file=fp)
for name, codepoint in unicode.aliases:
print(' 0x%04X,' % codepoint, file=fp)
print('};', file=fp)
print("/* name->code dictionary */", file=fp)
codehash.dump(fp, trace)
print('#define _PyUnicode_NamedSequencesStart %#x' % (NAMED_SEQUENCES_START), file=hdr)
print('#define _PyUnicode_NamedSequencesEnd %#x' %
(NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=hdr)
print(file=fp)
print('static const unsigned int aliases_start = %#x;' %
NAME_ALIASES_START, file=fp)
print('static const unsigned int aliases_end = %#x;' %
(NAME_ALIASES_START + len(unicode.aliases)), file=fp)
print('static const unsigned int name_aliases[] = {', file=fp)
for name, codepoint in unicode.aliases:
print(' 0x%04X,' % codepoint, file=fp)
print('};', file=fp)
# In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
# so we are using Py_UCS2 seq[4]. This needs to be updated if longer
# sequences or sequences with non-BMP chars are added.
# unicodedata_lookup should be adapted too.
print(dedent("""
typedef struct NamedSequence {
int seqlen;
Py_UCS2 seq[4];
} named_sequence;
"""), file=fp)
print('static const unsigned int named_sequences_start = %#x;' %
NAMED_SEQUENCES_START, file=fp)
print('static const unsigned int named_sequences_end = %#x;' %
(NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)
print('static const named_sequence named_sequences[] = {', file=fp)
for name, sequence in unicode.named_sequences:
seq_str = ', '.join('0x%04X' % cp for cp in sequence)
print(' {%d, {%s}},' % (len(sequence), seq_str), file=fp)
print('};', file=fp)
fp.close()
print('extern const _PyUnicode_NamedSequence _PyUnicode_NamedSequences[%d];' % (len(unicode.named_sequences)), file=hdr)
with open("third_party/python/Modules/unicodedata_namedsequences.c", "w") as fp:
startfile(fp)
print('const _PyUnicode_NamedSequence _PyUnicode_NamedSequences[%d] = {' % (len(unicode.named_sequences)), file=fp)
for name, sequence in unicode.named_sequences:
seq_str = ', '.join('0x%04X' % cp for cp in sequence)
print(' {%d, {%s}},' % (len(sequence), seq_str), file=fp)
print('};', file=fp)
def merge_old_version(version, new, old):
@@ -914,7 +915,8 @@ class UnicodeData:
def __init__(self, version,
linebreakprops=False,
expand=1,
cjk_check=True):
cjk_check=True,
select=lambda c: True):
self.changed = []
table = [None] * 0x110000
with open_data(UNICODE_DATA, version) as file:
@@ -924,14 +926,19 @@ class UnicodeData:
break
s = s.strip().split(";")
char = int(s[0], 16)
table[char] = s
if select(char):
table[char] = s
cjk_ranges_found = []
cjk_ranger = [(a,b) for a,b in cjk_ranges
if select(int(a,16)) and select(int(b,16))]
# expand first-last ranges
if expand:
field = None
for i in range(0, 0x110000):
if not select(i):
continue
s = table[i]
if s:
if s[1][-6:] == "First>":
@@ -947,8 +954,9 @@ class UnicodeData:
f2 = field[:]
f2[0] = "%X" % i
table[i] = f2
if cjk_check and cjk_ranges != cjk_ranges_found:
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
# if cjk_check and cjk_ranger != cjk_ranges_found:
# raise ValueError("CJK ranges deviate: have %r want %r" %
# (cjk_ranges_found, cjk_ranger))
# public attributes
self.filename = UNICODE_DATA % ''
@@ -970,10 +978,11 @@ class UnicodeData:
continue
char, name, abbrev = s.split(';')
char = int(char, 16)
self.aliases.append((name, char))
# also store the name in the PUA 1
self.table[pua_index][1] = name
pua_index += 1
if select(pua_index) and select(char):
self.aliases.append((name, char))
# also store the name in the PUA 1
self.table[pua_index][1] = name
pua_index += 1
assert pua_index - NAME_ALIASES_START == len(self.aliases)
self.named_sequences = []
@@ -983,22 +992,24 @@ class UnicodeData:
assert pua_index < NAMED_SEQUENCES_START
pua_index = NAMED_SEQUENCES_START
with open_data(NAMED_SEQUENCES, version) as file:
for s in file:
s = s.strip()
if not s or s.startswith('#'):
continue
name, chars = s.split(';')
chars = tuple(int(char, 16) for char in chars.split())
# check that the structure defined in makeunicodename is OK
assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
"the NamedSequence struct and in unicodedata_lookup")
self.named_sequences.append((name, chars))
# also store these in the PUA 1
self.table[pua_index][1] = name
pua_index += 1
assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
if select(pua_index):
with open_data(NAMED_SEQUENCES, version) as file:
for s in file:
s = s.strip()
if not s or s.startswith('#'):
continue
name, chars = s.split(';')
chars = tuple(int(char, 16) for char in chars.split())
chars = tuple(c for c in chars if select(c))
# check that the structure defined in makeunicodename is OK
assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
"the NamedSequence struct and in unicodedata_lookup")
self.named_sequences.append((name, chars))
# also store these in the PUA 1
self.table[pua_index][1] = name
pua_index += 1
assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
self.exclusions = {}
with open_data(COMPOSITION_EXCLUSIONS, version) as file:
@@ -1009,7 +1020,8 @@ class UnicodeData:
if s[0] == '#':
continue
char = int(s.split()[0],16)
self.exclusions[char] = 1
if select(char):
self.exclusions[char] = 1
widths = [None] * 0x110000
with open_data(EASTASIAN_WIDTH, version) as file:
@@ -1026,7 +1038,8 @@ class UnicodeData:
else:
chars = [int(s[0], 16)]
for char in chars:
widths[char] = s[1]
if select(char):
widths[char] = s[1]
for i in range(0, 0x110000):
if table[i] is not None:
@@ -1041,7 +1054,6 @@ class UnicodeData:
s = s.split('#', 1)[0].strip()
if not s:
continue
r, p = s.split(";")
r = r.strip()
p = p.strip()
@@ -1067,7 +1079,8 @@ class UnicodeData:
else:
first, last = [int(c, 16) for c in s[0].split('..')]
for char in range(first, last+1):
table[char][-1].add('Line_Break')
if select(char):
table[char][-1].add('Line_Break')
# We only want the quickcheck properties
# Format: NF?_QC; Y(es)/N(o)/M(aybe)
@@ -1093,8 +1106,9 @@ class UnicodeData:
else:
first, last = [int(c, 16) for c in s[0].split('..')]
for char in range(first, last+1):
assert not (quickchecks[char]>>quickcheck_shift)&3
quickchecks[char] |= quickcheck
if select(char):
assert not (quickchecks[char]>>quickcheck_shift)&3
quickchecks[char] |= quickcheck
for i in range(0, 0x110000):
if table[i] is not None:
table[i].append(quickchecks[i])
@@ -1130,10 +1144,11 @@ class UnicodeData:
# handle_capital_sigma in unicodeobject.c.
continue
c = int(data[0], 16)
lower = [int(char, 16) for char in data[1].split()]
title = [int(char, 16) for char in data[2].split()]
upper = [int(char, 16) for char in data[3].split()]
sc[c] = (lower, title, upper)
if select(c):
lower = [int(char, 16) for char in data[1].split() if select(int(char, 16))]
title = [int(char, 16) for char in data[2].split() if select(int(char, 16))]
upper = [int(char, 16) for char in data[3].split() if select(int(char, 16))]
sc[c] = (lower, title, upper)
cf = self.case_folding = {}
if version != '3.2.0':
with open_data(CASE_FOLDING, version) as file:
@@ -1144,7 +1159,8 @@ class UnicodeData:
data = s.split("; ")
if data[1] in "CF":
c = int(data[0], 16)
cf[c] = [int(char, 16) for char in data[2].split()]
if select(c):
cf[c] = [int(char, 16) for char in data[2].split()]
def uselatin1(self):
# restrict character range to ISO Latin 1
@@ -1223,52 +1239,122 @@ class Hash:
if table[i] is None:
table[i] = 0
self.data = Array(name + "_hash", table)
self.data = Array(name + "Hash", table, pack=True)
self.magic = magic
self.name = name
self.size = size
self.poly = poly
def dump(self, file, trace):
def dump(self, file, hdr, trace):
# write data to file, as a C array
self.data.dump(file, trace)
file.write("#define %s_magic %d\n" % (self.name, self.magic))
file.write("#define %s_size %d\n" % (self.name, self.size))
file.write("#define %s_poly %d\n" % (self.name, self.poly))
self.data.dump(file, hdr, trace)
hdr.write("#define %sMagic %d\n" % (self.name, self.magic))
hdr.write("#define %sSize %d\n" % (self.name, self.size))
hdr.write("#define %sPoly %d\n" % (self.name, self.poly))
# stuff to deal with arrays of unsigned integers
class Array:
def pack(data, bits, word=32):
assert 0 < bits < word
bitn = (bits * len(data) + word - 1) // word
bita = 0
for x in reversed(data):
bita <<= bits
bita |= x
for i in range(bitn):
yield bita & ((1 << word) - 1)
bita >>= 32
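
pack() streams fixed-width bit-fields into 32-bit words, least-significant first; here is a hypothetical inverse, handy for sanity-checking the layout (unpack is not in the tree):

def unpack(words, bits, n, word=32):
    """Recover n bit-fields of the given width from pack()'s word stream."""
    mask = (1 << bits) - 1
    acc = avail = 0
    out = []
    it = iter(words)
    for _ in range(n):
        while avail < bits:
            acc |= next(it) << avail   # words arrive low bits first
            avail += word
        out.append(acc & mask)
        acc >>= bits
        avail -= bits
    return out

data = [3, 1, 4, 1, 5, 9, 2, 6]
assert unpack(list(pack(data, 5)), 5, len(data)) == data
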
def __init__(self, name, data):
def deflate(data):
# z = zlib.compressobj(zlib.Z_BEST_COMPRESSION, zlib.DEFLATED, -zlib.MAX_WBITS, zlib.DEF_MEM_LEVEL, zlib.Z_RLE)
z = zlib.compressobj(zlib.Z_BEST_COMPRESSION, zlib.DEFLATED, -zlib.MAX_WBITS)
b = z.compress(data)
b += z.flush(zlib.Z_FINISH)
return b
class Array:
def __init__(self, name, data, rle=False, pack=False):
self.name = name
self.data = data
self.pack = pack
self.rle = rle # adds 90µs latency to startup
def dump(self, file, trace=0):
# write data to file, as a C array
def dump(self, file, hdr, trace=0):
# write data to f, as a C array
f = file
bits = max(x.bit_length() for x in self.data)
size = getsize(self.data)
if trace:
print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
file.write("static ")
print("%s: %d bits" % (self.name, bits), file=sys.stderr)
print("%s: size is %d bytes" % (self.name, size*len(self.data)), file=sys.stderr)
print("%s: packed size is %d bytes" % (self.name, (bits*len(self.data)+31)//32*4), file=sys.stderr)
print("%s: rle size is %d bytes" % (self.name, len(tuple(rle(self.data, (1<<(8*size))-1)))*size*2), file=sys.stderr)
if size == 1:
print("%s: deflate size is %d bytes" % (self.name, len(deflate(bytearray(self.data)))), file=sys.stderr)
if self.pack:
hdr.write("#define %sBits %d\n" % (self.name, bits))
self.data = tuple(pack(self.data, bits))
size = 4
if size == 1:
file.write("unsigned char")
t = "unsigned char"
elif size == 2:
file.write("unsigned short")
t = "unsigned short"
else:
file.write("unsigned int")
file.write(" " + self.name + "[] = {\n")
if self.data:
s = " "
for item in self.data:
i = str(item) + ", "
if len(s) + len(i) > 78:
file.write(s + "\n")
s = " " + i
else:
s = s + i
if s.strip():
file.write(s + "\n")
file.write("};\n\n")
t = "unsigned int"
hdr.write("extern const %s %s[%d];\n" % (t, self.name, len(self.data)))
if self.rle:
codes = tuple(rle(self.data, (1<<(8*size))-1))
f.write("%s %s[%d];\n" % (t, self.name, len(self.data)))
f.write("static const %s %s_rodata[%d+1][2] = { /* %g%% profit */\n" % (t, self.name, len(codes), len(codes) * size * 2 / float(len(self.data) * size) * 100))
for a,b in codes:
f.write(" {%3d, 0x%02x},\n" % (a, b))
f.write(" {0},\n")
f.write("};\n")
f.write("static textstartup void %s_init(void) {\n" % (self.name));
if size == 1:
f.write(" rldecode2(%s, (void *)%s_rodata);\n" % (self.name, self.name));
else:
f.write(" int i, j, k;\n");
f.write(" for (k = i = 0; i < %d; ++i) {\n" % (len(codes)));
f.write(" for (j = 0; j < %s_rodata[i][0]; ++j) {\n" % (self.name));
f.write(" %s[k++] = %s_rodata[i][1];\n" % (self.name, self.name));
f.write(" }\n");
f.write(" }\n");
f.write("}\n");
f.write("const void *const %s_ctor[] initarray = {\n" % (self.name));
f.write(" %s_init,\n" % (self.name));
f.write("};\n");
f.write("\n");
else:
f.write("const %s %s[] = {\n" % (t, self.name))
if self.data:
s = " "
for item in self.data:
i = str(item) + ", "
if len(s) + len(i) > 78:
f.write(s + "\n")
s = " " + i
else:
s = s + i
if s.strip():
f.write(s + "\n")
f.write("};\n\n")
def rle(data, maxval):
i = 0
j = 0
for i,x in enumerate(data):
if j == 0:
y = x
j = 1
elif y == x and j < maxval:
j += 1
else:
yield (j, y)
y = x
j = 1
if j:
yield (j, y)
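
The generated C initializer above expands these (count, value) pairs at startup, via rldecode2() for byte arrays or the nested loop otherwise. A tiny Python sketch of that decode, to make the encoding concrete (rle_decode is illustrative, not in the tree):

def rle_decode(codes):
    """Expand (count, value) pairs back into the flat table."""
    out = []
    for count, value in codes:
        out.extend([value] * count)
    return out

table = [7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 5]
codes = list(rle(table, 255))        # [(4, 7), (8, 0), (1, 5)]
assert rle_decode(codes) == table
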
def getsize(data):
# return smallest possible integer size for the given array
@@ -1294,7 +1380,6 @@ def splitbins(t, trace=0):
is printed to sys.stderr. The higher the value, the more info
you'll get.
"""
if trace:
def dump(t1, t2, shift, bytes):
print("%d+%d bins at shift %d; %d bytes" % (