Make numerous improvements

- Python static hello world now 1.8mb
- Python static fully loaded now 10mb
- Python HTTPS client now uses MbedTLS
- Python REPL now completes import stmts
- Increase stack size for Python for now
- Begin synthesizing posixpath and ntpath
- Restore Python \N{UNICODE NAME} support
- Restore Python NFKD symbol normalization
- Add optimized code path for Intel SHA-NI
- Get more Python unit tests passing faster
- Get Python help() pagination working on NT
- Python hashlib now supports MbedTLS PBKDF2
- Make memcpy/memmove/memcmp/bcmp/etc. faster
- Add Mersenne Twister and Vigna to LIBC_RAND
- Provide privileged __printf() for error code
- Fix zipos opendir() so that it reports ENOTDIR
- Add basic chmod() implementation for Windows NT
- Add Cosmo's best functions to Python cosmo module
- Pin function trace indent depth to that of caller
- Show memory diagram on invalid access in MODE=dbg
- Differentiate stack overflow on crash in MODE=dbg
- Add stb_truetype and tools for analyzing font files
- Upgrade to UNICODE 13 and reduce its binary footprint
- COMPILE.COM now logs resource usage of build commands
- Start implementing basic poll() support on bare metal
- Set getauxval(AT_EXECFN) to GetModuleFileName() on NT
- Add descriptions to strerror() in non-TINY build modes
- Add COUNTBRANCH() macro to help with micro-optimizations
- Make error / backtrace / asan / memory code more unbreakable
- Add fast perfect C implementation of μ-Law and a-Law audio codecs
- Make strtol() functions consistent with other libc implementations
- Improve Linenoise implementation (see also github.com/jart/bestline)
- COMPILE.COM now suppresses stdout/stderr of successful build commands
2025-05-23 13:52:28 +00:00 · 2021-09-27 22:58:51 -07:00 · 2021-09-27 22:58:51 -07:00 · 39bf41f4eb
commit 39bf41f4eb
parent fa7b4f5bd1
806 changed files with 77494 additions and 63859 deletions
--- a/third_party/python/Objects/unicodeobject.c
+++ b/third_party/python/Objects/unicodeobject.c
@ -6,6 +6,8 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #define PY_SSIZE_T_CLEAN
 #include "libc/assert.h"
+#include "libc/bits/likely.h"
+#include "libc/bits/weaken.h"
 #include "libc/errno.h"
 #include "libc/fmt/fmt.h"
 #include "libc/str/str.h"
@ -39,7 +41,7 @@
 #include "third_party/python/Include/unicodeobject.h"
 #include "third_party/python/Include/warnings.h"
 #include "third_party/python/Include/yoink.h"
-#include "third_party/python/Objects/stringlib/eq.inc"
+#include "third_party/python/Modules/unicodedata.h"
 /* clang-format off */

 PYTHON_PROVIDE("_string");
@ -51,6 +53,8 @@ PYTHON_PROVIDE("_string.__spec__");
 PYTHON_PROVIDE("_string.formatter_field_name_split");
 PYTHON_PROVIDE("_string.formatter_parser");

+#include "third_party/python/Objects/stringlib/eq.inc"
+
 /*

 Unicode implementation based on original code by Fredrik Lundh,
@ -1411,12 +1415,10 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
 {
    const wchar_t *iter;
    Py_UCS4 *ucs4_out;
-
    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
-
    for (iter = begin; iter < end; ) {
        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
                           _PyUnicode_GET_LENGTH(unicode)));
@ -1434,7 +1436,6 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
    }
    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
                        _PyUnicode_GET_LENGTH(unicode)));
-
 }
 #endif

@ -2993,7 +2994,6 @@ PyUnicode_FromFormat(const char *format, ...)
 {
    PyObject* ret;
    va_list vargs;
-
 #ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
 #else
@ -3203,9 +3203,7 @@ _Py_normalize_encoding(const char *encoding,
    char *l;
    char *l_end;
    int punct;
-
    assert(encoding != NULL);
-
    e = encoding;
    l = lower;
    l_end = &lower[lower_len - 1];
@ -3215,7 +3213,6 @@ _Py_normalize_encoding(const char *encoding,
        if (c == 0) {
            break;
        }
-
        if (Py_ISALNUM(c) || c == '.') {
            if (punct && l != lower) {
                if (l == l_end) {
@ -3224,7 +3221,6 @@ _Py_normalize_encoding(const char *encoding,
                *l++ = '_';
            }
            punct = 0;
-
            if (l == l_end) {
                return 0;
            }
@ -3233,7 +3229,6 @@ _Py_normalize_encoding(const char *encoding,
        else {
            punct = 1;
        }
-
        e++;
    }
    *l = '\0';
@ -3385,7 +3380,6 @@ PyUnicode_Encode(const Py_UNICODE *s,
                 const char *errors)
 {
    PyObject *v, *unicode;
-
    unicode = PyUnicode_FromUnicode(s, size);
    if (unicode == NULL)
        return NULL;
@ -3613,7 +3607,7 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
 PyObject *
 PyUnicode_EncodeFSDefault(PyObject *unicode)
 {
-#if defined(__APPLE__)
+#if defined(__APPLE__) || defined(__COSMOPOLITAN__)
    return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
 #else
    PyInterpreterState *interp = PyThreadState_GET()->interp;
@ -3930,7 +3924,7 @@ PyUnicode_DecodeFSDefault(const char *s) {
 PyObject*
 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
 {
-#if defined(__APPLE__)
+#if defined(__APPLE__) || defined(__COSMOPOLITAN__)
    return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
 #else
    PyInterpreterState *interp = PyThreadState_GET()->interp;
@ -3955,7 +3949,6 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
 #endif
 }

-
 int
 PyUnicode_FSConverter(PyObject* arg, void* addr)
 {
@ -4071,18 +4064,16 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
 }


-char*
+char *
 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
 {
    PyObject *bytes;
-
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(unicode) == -1)
        return NULL;
-
    if (PyUnicode_UTF8(unicode) == NULL) {
        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
        bytes = _PyUnicode_AsUTF8String(unicode, NULL);
@ -4100,13 +4091,12 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
        Py_DECREF(bytes);
    }
-
    if (psize)
        *psize = PyUnicode_UTF8_LENGTH(unicode);
    return PyUnicode_UTF8(unicode);
 }

-char*
+char *
 PyUnicode_AsUTF8(PyObject *unicode)
 {
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
@ -5989,8 +5979,6 @@ PyUnicode_AsUTF16String(PyObject *unicode)

 /* --- Unicode Escape Codec ----------------------------------------------- */

-static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
-
 PyObject *
 _PyUnicode_DecodeUnicodeEscape(const char *s,
                               Py_ssize_t size,
@ -6142,19 +6130,13 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,

            /* \N{name} */
        case 'N':
-            if (ucnhash_CAPI == NULL) {
-                /* load the unicode data module */
-                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
-                                                PyUnicodeData_CAPSULE_NAME, 1);
-                if (ucnhash_CAPI == NULL) {
-                    PyErr_SetString(
-                        PyExc_UnicodeError,
-                        "\\N escapes not supported (can't load unicodedata module)"
-                        );
-                    goto onError;
-                }
+            if (!weaken(_PyUnicode_GetCode)) {
+                PyErr_SetString(
+                    PyExc_UnicodeError,
+                    "\\N escapes not supported "
+                    "(you must yoink pyc:unicodedata or _PyUnicode_GetCode)");
+                goto onError;
            }
-
            message = "malformed \\N character escape";
            if (s < end && *s == '{') {
                const char *start = ++s;
@ -6168,8 +6150,7 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
                    s++;
                    ch = 0xffffffff; /* in case 'getcode' messes up */
                    if (namelen <= INT_MAX &&
-                        ucnhash_CAPI->getcode(NULL, start, (int)namelen,
-                                              &ch, 0)) {
+                        weaken(_PyUnicode_GetCode)(NULL, start, (int)namelen, &ch, 0)) {
                        assert(ch <= MAX_UNICODE);
                        WRITE_CHAR(ch);
                        continue;
@ -7613,7 +7594,6 @@ encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
    int l2 = (c>>7) & 0xF;
    int l3 = c & 0x7F;
    int i;
-
    if (c > 0xFFFF)
        return -1;
    if (c == 0)
@ -7644,7 +7624,6 @@ charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
 {
    PyObject *w = PyLong_FromLong((long)c);
    PyObject *x;
-
    if (w == NULL)
        return NULL;
    x = PyObject_GetItem(mapping, w);
@ -9006,11 +8985,11 @@ tailmatch(PyObject *self,
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
        /* If both are of the same kind, memcmp is sufficient */
        if (kind_self == kind_sub) {
-            return ! memcmp((char *)data_self +
-                                (offset * PyUnicode_KIND(substring)),
-                            data_sub,
-                            PyUnicode_GET_LENGTH(substring) *
-                                PyUnicode_KIND(substring));
+            return !bcmp((char *)data_self +
+                             (offset * PyUnicode_KIND(substring)),
+                         data_sub,
+                         PyUnicode_GET_LENGTH(substring) *
+                             PyUnicode_KIND(substring));
        }
        /* otherwise we have to compare each character by first accessing it */
        else {
@ -10387,14 +10366,12 @@ unicode_compare(PyObject *str1, PyObject *str2)
 #undef COMPARE
 }

-static int
+static inline int
 unicode_compare_eq(PyObject *str1, PyObject *str2)
 {
    int kind;
    void *data1, *data2;
    Py_ssize_t len;
-    int cmp;
-
    len = PyUnicode_GET_LENGTH(str1);
    if (PyUnicode_GET_LENGTH(str2) != len)
        return 0;
@ -10403,9 +10380,7 @@ unicode_compare_eq(PyObject *str1, PyObject *str2)
        return 0;
    data1 = PyUnicode_DATA(str1);
    data2 = PyUnicode_DATA(str2);
-
-    cmp = memcmp(data1, data2, len * kind);
-    return (cmp == 0);
+    return !bcmp(data1, data2, len * kind);
 }


@ -10509,17 +10484,25 @@ non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
    return 1;
 }

+static int
+IsPureAscii(const char *p)
+{
+    int c;
+    while ((c = *p++)) {
+        if (c & 128) {
+            return 0;
+        }
+    }
+    return 1;
+}
+
 int
 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
 {
    size_t len;
-    assert(_PyUnicode_CHECK(unicode));
    assert(str);
-#ifndef NDEBUG
-    for (const char *p = str; *p; p++) {
-        assert((unsigned char)*p < 128);
-    }
-#endif
+    assert(IsPureAscii(str));
+    assert(_PyUnicode_CHECK(unicode));
    if (PyUnicode_READY(unicode) == -1) {
        /* Memory error or bad data */
        PyErr_Clear();
@ -10529,7 +10512,7 @@ _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
        return 0;
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
    return strlen(str) == len &&
-           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
+           !bcmp(PyUnicode_1BYTE_DATA(unicode), str, len);
 }

 int
@ -10540,11 +10523,7 @@ _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)

    assert(_PyUnicode_CHECK(left));
    assert(right->string);
-#ifndef NDEBUG
-    for (const char *p = right->string; *p; p++) {
-        assert((unsigned char)*p < 128);
-    }
-#endif
+    assert(IsPureAscii(right->string));

    if (PyUnicode_READY(left) == -1) {
        /* memory error or bad data */
@ -10585,11 +10564,12 @@ PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
    int result;
    PyObject *v;

-    if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
+    if (UNLIKELY(!PyUnicode_Check(left) ||
+                 !PyUnicode_Check(right)))
        Py_RETURN_NOTIMPLEMENTED;

-    if (PyUnicode_READY(left) == -1 ||
-        PyUnicode_READY(right) == -1)
+    if (UNLIKELY(PyUnicode_READY(left) == -1 ||
+                 PyUnicode_READY(right) == -1))
        return NULL;

    if (left == right) {