cosmopolitan/third_party/python/Modules/unicodedata_nfdnfkd.c

142 lines
5.5 KiB
C
Raw Normal View History

Make numerous improvements - Python static hello world now 1.8mb - Python static fully loaded now 10mb - Python HTTPS client now uses MbedTLS - Python REPL now completes import stmts - Increase stack size for Python for now - Begin synthesizing posixpath and ntpath - Restore Python \N{UNICODE NAME} support - Restore Python NFKD symbol normalization - Add optimized code path for Intel SHA-NI - Get more Python unit tests passing faster - Get Python help() pagination working on NT - Python hashlib now supports MbedTLS PBKDF2 - Make memcpy/memmove/memcmp/bcmp/etc. faster - Add Mersenne Twister and Vigna to LIBC_RAND - Provide privileged __printf() for error code - Fix zipos opendir() so that it reports ENOTDIR - Add basic chmod() implementation for Windows NT - Add Cosmo's best functions to Python cosmo module - Pin function trace indent depth to that of caller - Show memory diagram on invalid access in MODE=dbg - Differentiate stack overflow on crash in MODE=dbg - Add stb_truetype and tools for analyzing font files - Upgrade to UNICODE 13 and reduce its binary footprint - COMPILE.COM now logs resource usage of build commands - Start implementing basic poll() support on bare metal - Set getauxval(AT_EXECFN) to GetModuleFileName() on NT - Add descriptions to strerror() in non-TINY build modes - Add COUNTBRANCH() macro to help with micro-optimizations - Make error / backtrace / asan / memory code more unbreakable - Add fast perfect C implementation of μ-Law and a-Law audio codecs - Make strtol() functions consistent with other libc implementations - Improve Linenoise implementation (see also github.com/jart/bestline) - COMPILE.COM now suppresses stdout/stderr of successful build commands
2021-09-28 05:58:51 +00:00
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
Python 3
https://docs.python.org/3/license.html │
*/
2022-08-11 19:13:18 +00:00
#include "libc/intrin/bits.h"
Make numerous improvements - Python static hello world now 1.8mb - Python static fully loaded now 10mb - Python HTTPS client now uses MbedTLS - Python REPL now completes import stmts - Increase stack size for Python for now - Begin synthesizing posixpath and ntpath - Restore Python \N{UNICODE NAME} support - Restore Python NFKD symbol normalization - Add optimized code path for Intel SHA-NI - Get more Python unit tests passing faster - Get Python help() pagination working on NT - Python hashlib now supports MbedTLS PBKDF2 - Make memcpy/memmove/memcmp/bcmp/etc. faster - Add Mersenne Twister and Vigna to LIBC_RAND - Provide privileged __printf() for error code - Fix zipos opendir() so that it reports ENOTDIR - Add basic chmod() implementation for Windows NT - Add Cosmo's best functions to Python cosmo module - Pin function trace indent depth to that of caller - Show memory diagram on invalid access in MODE=dbg - Differentiate stack overflow on crash in MODE=dbg - Add stb_truetype and tools for analyzing font files - Upgrade to UNICODE 13 and reduce its binary footprint - COMPILE.COM now logs resource usage of build commands - Start implementing basic poll() support on bare metal - Set getauxval(AT_EXECFN) to GetModuleFileName() on NT - Add descriptions to strerror() in non-TINY build modes - Add COUNTBRANCH() macro to help with micro-optimizations - Make error / backtrace / asan / memory code more unbreakable - Add fast perfect C implementation of μ-Law and a-Law audio codecs - Make strtol() functions consistent with other libc implementations - Improve Linenoise implementation (see also github.com/jart/bestline) - COMPILE.COM now suppresses stdout/stderr of successful build commands
2021-09-28 05:58:51 +00:00
#include "third_party/python/Include/pyerrors.h"
#include "third_party/python/Include/pymem.h"
#include "third_party/python/Modules/unicodedata.h"
#include "third_party/python/Modules/unicodedata_unidata.h"
/* clang-format off */
PyObject *
_PyUnicode_NfdNfkd(PyObject *self, PyObject *input, int k)
{
PyObject *result;
Py_UCS4 *output;
Py_ssize_t i, o, osize;
int kind;
void *data;
/* Longest decomposition in Unicode 3.2: U+FDFA */
Py_UCS4 stack[20];
Py_ssize_t space, isize;
int index, prefix, count, stackptr;
unsigned char prev, cur;
stackptr = 0;
isize = PyUnicode_GET_LENGTH(input);
space = isize;
/* Overallocate at most 10 characters. */
if (space > 10) {
if (space <= PY_SSIZE_T_MAX - 10)
space += 10;
}
else {
space *= 2;
}
osize = space;
output = PyMem_NEW(Py_UCS4, space);
if (!output) {
PyErr_NoMemory();
return NULL;
}
i = o = 0;
kind = PyUnicode_KIND(input);
data = PyUnicode_DATA(input);
while (i < isize) {
stack[stackptr++] = PyUnicode_READ(kind, data, i++);
while(stackptr) {
Py_UCS4 code = stack[--stackptr];
/* Hangul Decomposition adds three characters in
a single step, so we need at least that much room. */
if (space < 3) {
Py_UCS4 *new_output;
osize += 10;
space += 10;
new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
if (new_output == NULL) {
PyMem_Free(output);
PyErr_NoMemory();
return NULL;
}
output = new_output;
}
/* Hangul Decomposition. */
if (_Hanghoul_SBase <= code && code < (_Hanghoul_SBase + _Hanghoul_SCount)) {
int SIndex = code - _Hanghoul_SBase;
int L = _Hanghoul_LBase + SIndex / _Hanghoul_NCount;
int V = _Hanghoul_VBase + (SIndex % _Hanghoul_NCount) / _Hanghoul_TCount;
int T = _Hanghoul_TBase + SIndex % _Hanghoul_TCount;
output[o++] = L;
output[o++] = V;
space -= 2;
if (T != _Hanghoul_TBase) {
output[o++] = T;
space --;
}
continue;
}
/* normalization changes */
if (self && UCD_Check(self)) {
Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
if (value != 0) {
stack[stackptr++] = value;
continue;
}
}
/* Other decompositions. */
_PyUnicode_GetDecompRecord(self, code, &index, &prefix, &count);
/* Copy character if it is not decomposable, or has a
compatibility decomposition, but we do NFD. */
if (!count || (prefix && !k)) {
output[o++] = code;
space--;
continue;
}
/* Copy decomposition onto the stack, in reverse
order. */
while(count) {
code = _bextra(_PyUnicode_Decomp,
index + (--count),
_PyUnicode_DecompBits);
Make numerous improvements - Python static hello world now 1.8mb - Python static fully loaded now 10mb - Python HTTPS client now uses MbedTLS - Python REPL now completes import stmts - Increase stack size for Python for now - Begin synthesizing posixpath and ntpath - Restore Python \N{UNICODE NAME} support - Restore Python NFKD symbol normalization - Add optimized code path for Intel SHA-NI - Get more Python unit tests passing faster - Get Python help() pagination working on NT - Python hashlib now supports MbedTLS PBKDF2 - Make memcpy/memmove/memcmp/bcmp/etc. faster - Add Mersenne Twister and Vigna to LIBC_RAND - Provide privileged __printf() for error code - Fix zipos opendir() so that it reports ENOTDIR - Add basic chmod() implementation for Windows NT - Add Cosmo's best functions to Python cosmo module - Pin function trace indent depth to that of caller - Show memory diagram on invalid access in MODE=dbg - Differentiate stack overflow on crash in MODE=dbg - Add stb_truetype and tools for analyzing font files - Upgrade to UNICODE 13 and reduce its binary footprint - COMPILE.COM now logs resource usage of build commands - Start implementing basic poll() support on bare metal - Set getauxval(AT_EXECFN) to GetModuleFileName() on NT - Add descriptions to strerror() in non-TINY build modes - Add COUNTBRANCH() macro to help with micro-optimizations - Make error / backtrace / asan / memory code more unbreakable - Add fast perfect C implementation of μ-Law and a-Law audio codecs - Make strtol() functions consistent with other libc implementations - Improve Linenoise implementation (see also github.com/jart/bestline) - COMPILE.COM now suppresses stdout/stderr of successful build commands
2021-09-28 05:58:51 +00:00
stack[stackptr++] = code;
}
}
}
result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
output, o);
PyMem_Free(output);
if (!result)
return NULL;
/* result is guaranteed to be ready, as it is compact. */
kind = PyUnicode_KIND(result);
data = PyUnicode_DATA(result);
/* Sort canonically. */
i = 0;
prev = _PyUnicode_GetRecord(PyUnicode_READ(kind, data, i))->combining;
for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
cur = _PyUnicode_GetRecord(PyUnicode_READ(kind, data, i))->combining;
if (prev == 0 || cur == 0 || prev <= cur) {
prev = cur;
continue;
}
/* Non-canonical order. Need to switch *i with previous. */
o = i - 1;
while (1) {
Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
PyUnicode_WRITE(kind, data, o+1,
PyUnicode_READ(kind, data, o));
PyUnicode_WRITE(kind, data, o, tmp);
o--;
if (o < 0)
break;
prev = _PyUnicode_GetRecord(PyUnicode_READ(kind, data, o))->combining;
if (prev == 0 || prev <= cur)
break;
}
prev = _PyUnicode_GetRecord(PyUnicode_READ(kind, data, i))->combining;
}
return result;
}