Make numerous improvements

- Python static hello world now 1.8mb
- Python static fully loaded now 10mb
- Python HTTPS client now uses MbedTLS
- Python REPL now completes import stmts
- Increase stack size for Python for now
- Begin synthesizing posixpath and ntpath
- Restore Python \N{UNICODE NAME} support
- Restore Python NFKD symbol normalization
- Add optimized code path for Intel SHA-NI
- Get more Python unit tests passing faster
- Get Python help() pagination working on NT
- Python hashlib now supports MbedTLS PBKDF2
- Make memcpy/memmove/memcmp/bcmp/etc. faster
- Add Mersenne Twister and Vigna to LIBC_RAND
- Provide privileged __printf() for error code
- Fix zipos opendir() so that it reports ENOTDIR
- Add basic chmod() implementation for Windows NT
- Add Cosmo's best functions to Python cosmo module
- Pin function trace indent depth to that of caller
- Show memory diagram on invalid access in MODE=dbg
- Differentiate stack overflow on crash in MODE=dbg
- Add stb_truetype and tools for analyzing font files
- Upgrade to UNICODE 13 and reduce its binary footprint
- COMPILE.COM now logs resource usage of build commands
- Start implementing basic poll() support on bare metal
- Set getauxval(AT_EXECFN) to GetModuleFileName() on NT
- Add descriptions to strerror() in non-TINY build modes
- Add COUNTBRANCH() macro to help with micro-optimizations
- Make error / backtrace / asan / memory code more unbreakable
- Add fast perfect C implementation of μ-Law and a-Law audio codecs
- Make strtol() functions consistent with other libc implementations
- Improve Linenoise implementation (see also github.com/jart/bestline)
- COMPILE.COM now suppresses stdout/stderr of successful build commands
2025-10-06 22:47:20 +00:00 · 2021-09-27 22:58:51 -07:00 · 2021-09-27 22:58:51 -07:00 · 39bf41f4eb
commit 39bf41f4eb
parent fa7b4f5bd1
806 changed files with 77494 additions and 63859 deletions
--- a/third_party/python/Modules/unicodedata_nfdnfkd.c
+++ b/third_party/python/Modules/unicodedata_nfdnfkd.c
@@ -0,0 +1,140 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
+│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8                                :vi│
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Python 3                                                                     │
+│ https://docs.python.org/3/license.html                                       │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "third_party/python/Include/pyerrors.h"
+#include "third_party/python/Include/pymem.h"
+#include "third_party/python/Modules/unicodedata.h"
+#include "third_party/python/Modules/unicodedata_unidata.h"
+/* clang-format off */
+/*
+ * Builds the canonical (NFD, k == 0) or compatibility (NFKD, k != 0)
+ * decomposition of `input` and returns it as the string produced by
+ * PyUnicode_FromKindAndData().
+ *
+ * Phase 1 expands every input code point through a small explicit stack:
+ * Hangul syllables are split arithmetically, other code points are looked
+ * up with _PyUnicode_GetDecompRecord(), and decompositions that carry a
+ * prefix (compatibility mappings) are only expanded when `k` is nonzero.
+ * When `self` passes UCD_Check(), its normalization() hook is consulted
+ * first and any nonzero replacement is processed instead of the original.
+ *
+ * Phase 2 reorders the result in place: a character with a nonzero
+ * combining class is moved left past neighbours whose combining class is
+ * nonzero and strictly greater than its own.
+ *
+ * Returns NULL after PyErr_NoMemory() if the scratch buffer cannot be
+ * allocated or grown, and NULL if the result string cannot be created.
+ *
+ * NOTE(review): `input` is read with PyUnicode_GET_LENGTH/KIND/DATA and no
+ * readiness check is made here -- presumably the caller guarantees a ready
+ * string; confirm.
+ */
+PyObject *
+_PyUnicode_NfdNfkd(PyObject *self, PyObject *input, int k)
+{
+    PyObject *result;
+    Py_UCS4 *output;
+    Py_ssize_t i, o, osize;  /* i: read index, o: write index, osize: capacity of output (i and o are reused by the sort pass) */
+    int kind;
+    void *data;
+    /* Longest decomposition in Unicode 3.2: U+FDFA
+       NOTE(review): pushes onto stack[] below are not bounds-checked, so
+       this size has to cover the deepest expansion present in the bundled
+       decomposition data -- confirm. */
+    Py_UCS4 stack[20];
+    Py_ssize_t space, isize;  /* space: free slots left in output; isize: length of input */
+    int index, prefix, count, stackptr;  /* index/prefix/count are filled in by _PyUnicode_GetDecompRecord() */
+    unsigned char prev, cur;  /* combining classes compared during the sort pass */
+    stackptr = 0;
+    isize = PyUnicode_GET_LENGTH(input);
+    space = isize;
+    /* Overallocate at most 10 characters.
+       Inputs longer than 10 get 10 extra slots (unless that would exceed
+       PY_SSIZE_T_MAX); shorter inputs get twice their length. */
+    if (space > 10) {
+        if (space <= PY_SSIZE_T_MAX - 10)
+            space += 10;
+    }
+    else {
+        space *= 2;
+    }
+    osize = space;
+    output = PyMem_NEW(Py_UCS4, space);
+    if (!output) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    i = o = 0;
+    kind = PyUnicode_KIND(input);
+    data = PyUnicode_DATA(input);
+    while (i < isize) {
+        stack[stackptr++] = PyUnicode_READ(kind, data, i++);  /* seed the stack with the next input code point */
+        while(stackptr) {  /* drain the stack before reading more input */
+            Py_UCS4 code = stack[--stackptr];
+            /* Hangul Decomposition adds three characters in
+               a single step, so we need at least that much room. */
+            if (space < 3) {
+                Py_UCS4 *new_output;
+                osize += 10;
+                space += 10;
+                new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
+                if (new_output == NULL) {
+                    PyMem_Free(output);  /* the old buffer is still ours to release on failure */
+                    PyErr_NoMemory();
+                    return NULL;
+                }
+                output = new_output;
+            }
+            /* Hangul Decomposition. */
+            if (_Hanghoul_SBase <= code && code < (_Hanghoul_SBase + _Hanghoul_SCount)) {
+                int SIndex = code - _Hanghoul_SBase;
+                int L = _Hanghoul_LBase + SIndex / _Hanghoul_NCount;
+                int V = _Hanghoul_VBase + (SIndex % _Hanghoul_NCount) / _Hanghoul_TCount;
+                int T = _Hanghoul_TBase + SIndex % _Hanghoul_TCount;
+                output[o++] = L;
+                output[o++] = V;
+                space -= 2;
+                if (T != _Hanghoul_TBase) {  /* T equal to the base means there is no third component to emit */
+                    output[o++] = T;
+                    space --;
+                }
+                continue;
+            }
+            /* normalization changes */
+            if (self && UCD_Check(self)) {
+                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
+                if (value != 0) {
+                    stack[stackptr++] = value;  /* process the replacement in place of the original code point */
+                    continue;
+                }
+            }
+            /* Other decompositions. */
+            _PyUnicode_GetDecompRecord(self, code, &index, &prefix, &count);
+            /* Copy character if it is not decomposable, or has a
+               compatibility decomposition, but we do NFD. */
+            if (!count || (prefix && !k)) {
+                output[o++] = code;
+                space--;
+                continue;
+            }
+            /* Copy decomposition onto the stack, in reverse
+               order.  The first element of the decomposition is pushed
+               last, so it is the next one popped. */
+            while(count) {
+                code = _PyUnicode_Bextr(_PyUnicode_Decomp,
+                                        index + (--count),
+                                        _PyUnicode_DecompBits);
+                stack[stackptr++] = code;
+            }
+        }
+    }
+    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
+                                       output, o);
+    PyMem_Free(output);  /* scratch buffer is released whether or not the string was created */
+    if (!result)
+        return NULL;
+    /* result is guaranteed to be ready, as it is compact. */
+    kind = PyUnicode_KIND(result);
+    data = PyUnicode_DATA(result);
+    /* Sort canonically.
+       NOTE(review): index 0 is read below even when the result has length
+       zero -- presumably callers never pass an empty string, or the read
+       lands on a terminator; confirm. */
+    i = 0;
+    prev = _PyUnicode_GetRecord(PyUnicode_READ(kind, data, i))->combining;
+    for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
+        cur = _PyUnicode_GetRecord(PyUnicode_READ(kind, data, i))->combining;
+        if (prev == 0 || cur == 0 || prev <= cur) {  /* this adjacent pair is already in order */
+            prev = cur;
+            continue;
+        }
+        /* Non-canonical order. Need to switch *i with previous. */
+        o = i - 1;
+        while (1) {  /* swap the character leftwards until it reaches its place */
+            Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
+            PyUnicode_WRITE(kind, data, o+1,
+                            PyUnicode_READ(kind, data, o));
+            PyUnicode_WRITE(kind, data, o, tmp);
+            o--;
+            if (o < 0)
+                break;
+            prev = _PyUnicode_GetRecord(PyUnicode_READ(kind, data, o))->combining;
+            if (prev == 0 || prev <= cur)
+                break;
+        }
+        prev = _PyUnicode_GetRecord(PyUnicode_READ(kind, data, i))->combining;  /* position i now holds a different character; refresh prev from it */
+    }
+    return result;
+}