Make numerous improvements

- Python static hello world now 1.8mb - Python static fully loaded now 10mb - Python HTTPS client now uses MbedTLS - Python REPL now completes import stmts - Increase stack size for Python for now - Begin synthesizing posixpath and ntpath - Restore Python \N{UNICODE NAME} support - Restore Python NFKD symbol normalization - Add optimized code path for Intel SHA-NI - Get more Python unit tests passing faster - Get Python help() pagination working on NT - Python hashlib now supports MbedTLS PBKDF2 - Make memcpy/memmove/memcmp/bcmp/etc. faster - Add Mersenne Twister and Vigna to LIBC_RAND - Provide privileged __printf() for error code - Fix zipos opendir() so that it reports ENOTDIR - Add basic chmod() implementation for Windows NT - Add Cosmo's best functions to Python cosmo module - Pin function trace indent depth to that of caller - Show memory diagram on invalid access in MODE=dbg - Differentiate stack overflow on crash in MODE=dbg - Add stb_truetype and tools for analyzing font files - Upgrade to UNICODE 13 and reduce its binary footprint - COMPILE.COM now logs resource usage of build commands - Start implementing basic poll() support on bare metal - Set getauxval(AT_EXECFN) to GetModuleFileName() on NT - Add descriptions to strerror() in non-TINY build modes - Add COUNTBRANCH() macro to help with micro-optimizations - Make error / backtrace / asan / memory code more unbreakable - Add fast perfect C implementation of μ-Law and a-Law audio codecs - Make strtol() functions consistent with other libc implementations - Improve Linenoise implementation (see also github.com/jart/bestline) - COMPILE.COM now suppresses stdout/stderr of successful build commands
2025-10-07 15:07:20 +00:00 · 2021-09-27 22:58:51 -07:00 · 2021-09-27 22:58:51 -07:00 · 39bf41f4eb
commit 39bf41f4eb
parent fa7b4f5bd1
806 changed files with 77494 additions and 63859 deletions
--- a/third_party/python/Modules/unicodedata_nfcnfkc.c
+++ b/third_party/python/Modules/unicodedata_nfcnfkc.c
@@ -0,0 +1,147 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
+│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8                                :vi│
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Python 3                                                                     │
+│ https://docs.python.org/3/license.html                                       │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/bits/likely.h"
+#include "third_party/python/Include/pyerrors.h"
+#include "third_party/python/Include/pymem.h"
+#include "third_party/python/Modules/unicodedata.h"
+#include "third_party/python/Modules/unicodedata_unidata.h"
+/* clang-format off */
+
+/*
+ * Composes a decomposed string.
+ *
+ * The input is first run through _PyUnicode_NfdNfkd() (self, input and
+ * k are forwarded unchanged), then adjacent code points are merged:
+ * Hangul jamo arithmetically, everything else through the
+ * _PyUnicode_Comp* lookup tables.
+ *
+ * Returns NULL when decomposition fails or the scratch buffer can't be
+ * allocated (PyErr_NoMemory() is raised in the latter case). When no
+ * code point was merged the decomposed string is returned as is;
+ * otherwise the result of PyUnicode_FromKindAndData() on the composed
+ * buffer is returned.
+ */
+PyObject *
+_PyUnicode_NfcNfkc(PyObject *self, PyObject *input, int k)
+{
+    PyObject *decomposed;
+    PyObject *composed;
+    Py_UCS4 *out;
+    Py_UCS4 ch;
+    void *data;
+    int kind;
+    Py_ssize_t n, rd, wr, scan;
+    /* Positions that were already folded into an earlier base
+       character and therefore must not be emitted again. */
+    Py_ssize_t pending[20];
+    int npending = 0;
+    int slot, first, last, cell, page, prev_class;
+
+    decomposed = _PyUnicode_NfdNfkd(self, input, k);
+    if (decomposed == NULL)
+        return NULL;
+
+    /* The decomposed string is expected to be "ready" here. */
+    kind = PyUnicode_KIND(decomposed);
+    data = PyUnicode_DATA(decomposed);
+    n = PyUnicode_GET_LENGTH(decomposed);
+
+    /* Composition never emits more code points than it reads, so a
+       scratch buffer of the decomposed length is enough. */
+    out = PyMem_NEW(Py_UCS4, n);
+    if (out == NULL) {
+        PyErr_NoMemory();
+        Py_DECREF(decomposed);
+        return NULL;
+    }
+
+    rd = 0;
+    wr = 0;
+    while (rd < n) {
+        /* Was this position consumed by an earlier merge? */
+        for (slot = 0; slot < npending; slot++) {
+            if (pending[slot] == rd)
+                break;
+        }
+        if (slot < npending) {
+            /* Yes: forget it (swap-remove) and step over it. */
+            pending[slot] = pending[npending - 1];
+            npending--;
+            rd++;
+            continue;
+        }
+
+        ch = PyUnicode_READ(kind, data, rd);
+
+        /* Hangul: a leading consonant followed by a vowel becomes an
+           LV syllable, optionally extended by a trailing consonant.
+           <LV,T> pairs need no handling since the data is decomposed. */
+        if (UNLIKELY(_Hanghoul_LBase <= ch && ch < _Hanghoul_LBase + _Hanghoul_LCount) &&
+            rd + 1 < n) {
+            Py_UCS4 vowel = PyUnicode_READ(kind, data, rd + 1);
+            if (_Hanghoul_VBase <= vowel &&
+                vowel < _Hanghoul_VBase + _Hanghoul_VCount) {
+                int lead_idx, vowel_idx;
+                lead_idx = ch - _Hanghoul_LBase;
+                vowel_idx = vowel - _Hanghoul_VBase;
+                ch = _Hanghoul_SBase +
+                     (lead_idx * _Hanghoul_VCount + vowel_idx) * _Hanghoul_TCount;
+                rd += 2;
+                if (rd < n) {
+                    Py_UCS4 tail = PyUnicode_READ(kind, data, rd);
+                    /* Both bounds are exclusive: TBase itself is not
+                       a trailing consonant. */
+                    if (_Hanghoul_TBase < tail &&
+                        tail < (_Hanghoul_TBase + _Hanghoul_TCount)) {
+                        ch += tail - _Hanghoul_TBase;
+                        rd++;
+                    }
+                }
+                out[wr++] = ch;
+                continue;
+            }
+        }
+
+        /* ch is still the code point at rd from here on. */
+        first = _PyUnicode_FindNfcIndex(_PyUnicode_NfcFirst, ch);
+        if (first == -1) {
+            /* Never the first half of a composition: copy through. */
+            out[wr++] = ch;
+            rd++;
+            continue;
+        }
+
+        /* Tentatively emit the base; it is overwritten in place each
+           time a following character merges into it. */
+        out[wr] = ch;
+        prev_class = 0;
+        scan = rd + 1;
+        while (scan < n) {
+            Py_UCS4 mark = PyUnicode_READ(kind, data, scan);
+            int mark_class = _PyUnicode_GetRecord(mark)->combining;
+            Py_UCS4 merged = 0;
+
+            if (prev_class) {
+                if (mark_class == 0)
+                    break;
+                if (prev_class >= mark_class) {
+                    /* Blocked by a preceding mark; look further. */
+                    scan++;
+                    continue;
+                }
+            }
+
+            last = _PyUnicode_FindNfcIndex(_PyUnicode_NfcLast, mark);
+            if (last != -1) {
+                /* Two-level table lookup of the (first, last) pair. */
+                cell = first * UNIDATA_TOTAL_LAST + last;
+                page = _PyUnicode_CompIndex[cell >> _PyUnicode_CompShift];
+                merged = _PyUnicode_Bextr(_PyUnicode_CompData,
+                                          (page << _PyUnicode_CompShift) +
+                                          (cell & ((1 << _PyUnicode_CompShift) - 1)),
+                                          _PyUnicode_CompDataBits);
+            }
+
+            if (merged == 0) {
+                /* The pair does not compose. A starter ends the
+                   search; otherwise remember the combining class so
+                   later marks can be tested for blocking. */
+                if (mark_class == 0)
+                    break;
+                prev_class = mark_class;
+                scan++;
+                continue;
+            }
+
+            /* Pair composes: replace the base and retire the mark. */
+            out[wr] = merged;
+            assert(npending < 20);
+            pending[npending++] = scan;
+            scan++;
+
+            /* The merged character may itself start a composition. */
+            first = _PyUnicode_FindNfcIndex(_PyUnicode_NfcFirst, out[wr]);
+            if (first == -1)
+                break;
+        }
+
+        /* out[wr] already holds the final character for this base. */
+        wr++;
+        rd++;
+    }
+
+    if (wr == n) {
+        /* Nothing was merged, so hand back the decomposed string. */
+        PyMem_Free(out);
+        return decomposed;
+    }
+
+    Py_DECREF(decomposed);
+    composed = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, out, wr);
+    PyMem_Free(out);
+    return composed;
+}