Apply fixes and speedups

2025-10-25 10:40:57 +00:00 · 2021-10-04 03:23:31 -07:00 · 2021-10-04 03:23:31 -07:00 · 725f4d79f6
commit 725f4d79f6
parent 7521bf9e73
36 changed files with 682 additions and 334 deletions
--- a/third_party/python/Modules/unicodedata.c
+++ b/third_party/python/Modules/unicodedata.c
@@ -5,6 +5,7 @@
 │ https://docs.python.org/3/license.html                                       │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #define PY_SSIZE_T_CLEAN
+#include "libc/bits/bits.h"
 #include "libc/fmt/fmt.h"
 #include "libc/nexgen32e/kompressor.h"
 #include "third_party/python/Include/floatobject.h"
@@ -404,7 +405,7 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr)

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (from*/
-    count = _PyUnicode_Bextr(_PyUnicode_Decomp, index, _PyUnicode_DecompBits) >> 8;
+    count = bextra(_PyUnicode_Decomp, index, _PyUnicode_DecompBits) >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */
@@ -412,7 +413,7 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
    /* Based on how index is calculated above and _PyUnicode_Decomp is
       generated from Tools/unicode/makeunicodedata.py, it should not be
       possible to overflow _PyUnicode_DecompPrefix. */
-    prefix_index = _PyUnicode_Bextr(_PyUnicode_Decomp, index, _PyUnicode_DecompBits) & 255;
+    prefix_index = bextra(_PyUnicode_Decomp, index, _PyUnicode_DecompBits) & 255;
    assert(prefix_index < Py_ARRAY_LENGTH(_PyUnicode_DecompPrefix));

    /* copy prefix */
@@ -424,8 +425,8 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
            decomp[i++] = ' ';
        assert(i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
-                      _PyUnicode_Bextr(_PyUnicode_Decomp, ++index,
-                                       _PyUnicode_DecompBits));
+                      bextra(_PyUnicode_Decomp, ++index,
+                             _PyUnicode_DecompBits));
        i += strlen(decomp + i);
    }
    return PyUnicode_FromStringAndSize(decomp, i);
--- a/third_party/python/Modules/unicodedata.h
+++ b/third_party/python/Modules/unicodedata.h
@@ -96,25 +96,6 @@ void _PyUnicode_FindSyllable(const char *, int *, int *, int, int);
 int _PyUnicode_GetCode(PyObject *, const char *, int, Py_UCS4 *, int);
 void _PyUnicode_GetDecompRecord(PyObject *, Py_UCS4, int *, int *, int *);

-static inline unsigned _PyUnicode_Bextr(const unsigned *p, unsigned i, char b) {
-  size_t j;
-  unsigned k, r, w;
-  w = sizeof(unsigned) * CHAR_BIT;
-  assert(0 <= b && b < w);
-  j = i;
-  j *= b;
-  k = j & (w - 1);
-  j /= w;
-  if (k <= w - b) {
-    return (p[j] >> k) & ((1ul << b) - 1);
-  } else {
-    r = p[j] >> k;
-    r |= p[j + 1] << (w - k);
-    r &= (1ul << b) - 1;
-    return r;
-  }
-}
-
 COSMOPOLITAN_C_END_
 #endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
 #endif /* COSMOPOLITAN_THIRD_PARTY_PYTHON_MODULES_UNICODEDATA_H_ */
--- a/third_party/python/Modules/unicodedata_getcode.c
+++ b/third_party/python/Modules/unicodedata_getcode.c
@@ -4,6 +4,7 @@
 │ Python 3                                                                     │
 │ https://docs.python.org/3/license.html                                       │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/bits/bits.h"
 #include "libc/fmt/fmt.h"
 #include "third_party/python/Include/pyctype.h"
 #include "third_party/python/Include/pyerrors.h"
@@ -20,7 +21,7 @@
 #define IS_NAMED_SEQ(cp) ((cp >= _PyUnicode_NamedSequencesStart) && \
                          (cp <  _PyUnicode_NamedSequencesEnd))

-static const char * const kHangulSyllables[][3] = {
+static const char kHangulSyllables[][3][4] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
@@ -40,15 +41,15 @@ static const char * const kHangulSyllables[][3] = {
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
-    { 0,    "YI",  "S"  },
-    { 0,    "I",   "SS" },
-    { 0,    0,     "NG" },
-    { 0,    0,     "J"  },
-    { 0,    0,     "C"  },
-    { 0,    0,     "K"  },
-    { 0,    0,     "T"  },
-    { 0,    0,     "P"  },
-    { 0,    0,     "H"  }
+    { "",   "YI",  "S"  },
+    { "",   "I",   "SS" },
+    { "",   "",    "NG" },
+    { "",   "",    "J"  },
+    { "",   "",    "C"  },
+    { "",   "",    "K"  },
+    { "",   "",    "T"  },
+    { "",   "",    "P"  },
+    { "",   "",    "H"  }
 };

 void
@@ -173,7 +174,7 @@ _PyUnicode_GetCode(PyObject *self, const char *name, int namelen, Py_UCS4 *code,
       details */
    h = (unsigned int)_gethash(name, namelen, _PyUnicode_CodeMagic);
    i = ~h & mask;
-    v = _PyUnicode_Bextr(_PyUnicode_CodeHash, i, _PyUnicode_CodeHashBits);
+    v = bextra(_PyUnicode_CodeHash, i, _PyUnicode_CodeHashBits);
    if (!v)
        return 0;
    if (_cmpname(self, v, name, namelen))
@@ -183,7 +184,7 @@ _PyUnicode_GetCode(PyObject *self, const char *name, int namelen, Py_UCS4 *code,
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
-        v = _PyUnicode_Bextr(_PyUnicode_CodeHash, i, _PyUnicode_CodeHashBits);
+        v = bextra(_PyUnicode_CodeHash, i, _PyUnicode_CodeHashBits);
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen))
@@ -246,10 +247,10 @@ _PyUnicode_GetUcName(PyObject *self, Py_UCS4 code, char *buffer, int buflen,
    }
    /* get offset into phrasebook */
    offset = _PyUnicode_PhrasebookOffset1[(code>>_PyUnicode_PhrasebookShift)];
-    offset = _PyUnicode_Bextr(_PyUnicode_PhrasebookOffset2,
-                              (offset << _PyUnicode_PhrasebookShift) +
-                              (code & ((1 << _PyUnicode_PhrasebookShift) - 1)),
-                              _PyUnicode_PhrasebookOffset2Bits);
+    offset = bextra(_PyUnicode_PhrasebookOffset2,
+                    (offset << _PyUnicode_PhrasebookShift) +
+                    (code & ((1 << _PyUnicode_PhrasebookShift) - 1)),
+                    _PyUnicode_PhrasebookOffset2Bits);
    if (!offset)
        return 0;
    i = 0;
@@ -270,8 +271,8 @@ _PyUnicode_GetUcName(PyObject *self, Py_UCS4 code, char *buffer, int buflen,
           word has bit 7 set.  the last word in a string ends with
           0x80 */
        w = (_PyUnicode_Lexicon +
-             _PyUnicode_Bextr(_PyUnicode_LexiconOffset,
-                              word, _PyUnicode_LexiconOffsetBits));
+             bextra(_PyUnicode_LexiconOffset, word,
+                    _PyUnicode_LexiconOffsetBits));
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
--- a/third_party/python/Modules/unicodedata_getdecomprecord.c
+++ b/third_party/python/Modules/unicodedata_getdecomprecord.c
@@ -4,6 +4,7 @@
 │ Python 3                                                                     │
 │ https://docs.python.org/3/license.html                                       │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/bits/bits.h"
 #include "third_party/python/Modules/unicodedata.h"
 #include "third_party/python/Modules/unicodedata_unidata.h"
 /* clang-format off */
@@ -30,7 +31,7 @@ _PyUnicode_GetDecompRecord(PyObject *self,
    }
    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (from*/
-    decomp = _PyUnicode_Bextr(_PyUnicode_Decomp, *index, _PyUnicode_DecompBits);
+    decomp = bextra(_PyUnicode_Decomp, *index, _PyUnicode_DecompBits);
    *count = decomp >> 8;
    *prefix = decomp & 255;
    (*index)++;
--- a/third_party/python/Modules/unicodedata_nfcnfkc.c
+++ b/third_party/python/Modules/unicodedata_nfcnfkc.c
@@ -4,6 +4,7 @@
 │ Python 3                                                                     │
 │ https://docs.python.org/3/license.html                                       │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/bits/bits.h"
 #include "libc/bits/likely.h"
 #include "third_party/python/Include/pyerrors.h"
 #include "third_party/python/Include/pymem.h"
@@ -114,10 +115,10 @@ _PyUnicode_NfcNfkc(PyObject *self, PyObject *input, int k)
          }
          index = f * UNIDATA_TOTAL_LAST + l;
          index1 = _PyUnicode_CompIndex[index >> _PyUnicode_CompShift];
-          code = _PyUnicode_Bextr(_PyUnicode_CompData,
-                                  (index1 << _PyUnicode_CompShift)+
-                                  (index & ((1 << _PyUnicode_CompShift) - 1)),
-                                  _PyUnicode_CompDataBits);
+          code = bextra(_PyUnicode_CompData,
+                        (index1 << _PyUnicode_CompShift)+
+                        (index & ((1 << _PyUnicode_CompShift) - 1)),
+                        _PyUnicode_CompDataBits);
          if (code == 0)
              goto not_combinable;
          /* Replace the original character. */
--- a/third_party/python/Modules/unicodedata_nfdnfkd.c
+++ b/third_party/python/Modules/unicodedata_nfdnfkd.c
@@ -4,6 +4,7 @@
 │ Python 3                                                                     │
 │ https://docs.python.org/3/license.html                                       │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/bits/bits.h"
 #include "third_party/python/Include/pyerrors.h"
 #include "third_party/python/Include/pymem.h"
 #include "third_party/python/Modules/unicodedata.h"
@@ -96,9 +97,9 @@ _PyUnicode_NfdNfkd(PyObject *self, PyObject *input, int k)
            /* Copy decomposition onto the stack, in reverse
               order.  */
            while(count) {
-                code = _PyUnicode_Bextr(_PyUnicode_Decomp,
-                                        index + (--count),
-                                        _PyUnicode_DecompBits);
+                code = bextra(_PyUnicode_Decomp,
+                              index + (--count),
+                              _PyUnicode_DecompBits);
                stack[stackptr++] = code;
            }
        }