cosmopolitan/third_party/python/Modules/unicodedata_nfdnfkd.c
Justine Tunney 6f7d0cb1c3
Pay off more technical debt
This makes breaking changes to add underscores to many non-standard
function names provided by the c library. MODE=tiny is now tinier and
we now use smaller locks that are better for tiny apps in this mode.
Some headers have been renamed to be in the same folder as the build
package, so it'll be easier to know which build dependency is needed.
Certain old misguided interfaces have been removed. Intel intrinsics
headers are now listed in libc/isystem (but not in the amalgamation)
to help further improve open source compatibility. Header complexity
has also been reduced. Lastly, more shell scripts are now available.
2022-09-12 23:36:56 -07:00

141 lines
5.5 KiB
C

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Python 3 │
│ https://docs.python.org/3/license.html │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/intrin/bits.h"
#include "third_party/python/Include/pyerrors.h"
#include "third_party/python/Include/pymem.h"
#include "third_party/python/Modules/unicodedata.h"
#include "third_party/python/Modules/unicodedata_unidata.h"
/* clang-format off */
/*
 * Decomposes `input` and puts the combining marks of the result into
 * canonical order.
 *
 * self   May be NULL. When it is non-NULL and passes UCD_Check(), its
 *        `normalization` callback is asked for a replacement code point
 *        before the decomposition table is consulted. It is also handed
 *        to _PyUnicode_GetDecompRecord() as-is.
 * input  String to decompose. It is read through PyUnicode_GET_LENGTH,
 *        PyUnicode_KIND and PyUnicode_DATA without any prior check, so
 *        the caller presumably guarantees a ready str object (TODO
 *        confirm against callers).
 * k      Zero: characters whose decomposition record carries a prefix
 *        (a compatibility decomposition) are copied unchanged, i.e. NFD.
 *        Nonzero: those decompositions are expanded too, i.e. NFKD.
 *
 * Returns the object produced by PyUnicode_FromKindAndData(), or NULL if
 * that call fails or if the work buffer cannot be allocated or grown (in
 * which case PyErr_NoMemory() has been called).
 */
PyObject *
_PyUnicode_NfdNfkd(PyObject *self, PyObject *input, int k)
{
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UCS4 pending[20];
    int depth = 0;
    int decomp_index, decomp_prefix, decomp_count;
    int kind;
    void *data;
    Py_UCS4 *buf;
    Py_ssize_t in_len, in_pos, out_pos;
    Py_ssize_t capacity, room;
    Py_ssize_t length, pos, j;
    unsigned char prev_cc, cur_cc;
    PyObject *result;

    in_len = PyUnicode_GET_LENGTH(input);

    /* Overallocate at most 10 characters: short inputs get twice their
       length, longer ones get ten extra slots unless that would
       overflow Py_ssize_t. */
    if (in_len <= 10) {
        room = in_len * 2;
    }
    else if (in_len <= PY_SSIZE_T_MAX - 10) {
        room = in_len + 10;
    }
    else {
        room = in_len;
    }
    capacity = room;

    buf = PyMem_NEW(Py_UCS4, room);
    if (buf == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);
    out_pos = 0;

    for (in_pos = 0; in_pos < in_len; in_pos++) {
        /* Seed the work stack with the next input character, then keep
           expanding until the stack is empty again. */
        pending[depth++] = PyUnicode_READ(kind, data, in_pos);

        while (depth != 0) {
            Py_UCS4 ch = pending[--depth];

            /* A Hangul syllable emits up to three characters in a
               single step, so make sure that much room is left. */
            if (room < 3) {
                Py_UCS4 *grown;
                capacity += 10;
                room += 10;
                grown = PyMem_Realloc(buf, capacity * sizeof(Py_UCS4));
                if (grown == NULL) {
                    PyMem_Free(buf);
                    PyErr_NoMemory();
                    return NULL;
                }
                buf = grown;
            }

            /* Hangul syllables are decomposed arithmetically. */
            if (ch >= _Hanghoul_SBase &&
                ch < (_Hanghoul_SBase + _Hanghoul_SCount)) {
                int syllable = ch - _Hanghoul_SBase;
                int lead = _Hanghoul_LBase + syllable / _Hanghoul_NCount;
                int vowel = _Hanghoul_VBase +
                            (syllable % _Hanghoul_NCount) / _Hanghoul_TCount;
                int tail = _Hanghoul_TBase + syllable % _Hanghoul_TCount;

                buf[out_pos++] = lead;
                buf[out_pos++] = vowel;
                room -= 2;
                /* The trailing consonant is only written when the
                   syllable actually has one. */
                if (tail != _Hanghoul_TBase) {
                    buf[out_pos++] = tail;
                    room--;
                }
                continue;
            }

            /* Normalization changes recorded for an older database
               version: a nonzero replacement is pushed back so that it
               goes through the whole procedure itself. */
            if (self && UCD_Check(self)) {
                Py_UCS4 replacement =
                    ((PreviousDBVersion*)self)->normalization(ch);
                if (replacement != 0) {
                    pending[depth++] = replacement;
                    continue;
                }
            }

            /* Table-driven decompositions. */
            _PyUnicode_GetDecompRecord(self, ch, &decomp_index,
                                       &decomp_prefix, &decomp_count);

            /* Emit the character unchanged when it has no decomposition,
               or when it only has a compatibility decomposition and NFD
               was requested. */
            if (decomp_count == 0 || (decomp_prefix != 0 && k == 0)) {
                buf[out_pos++] = ch;
                room--;
                continue;
            }

            /* Push the decomposition last-element-first so that its
               first code point is popped first. */
            while (decomp_count != 0) {
                decomp_count--;
                pending[depth++] = _bextra(_PyUnicode_Decomp,
                                           decomp_index + decomp_count,
                                           _PyUnicode_DecompBits);
            }
        }
    }

    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf, out_pos);
    PyMem_Free(buf);
    if (result == NULL) {
        return NULL;
    }

    /* result is guaranteed to be ready, as it is compact. */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);
    length = PyUnicode_GET_LENGTH(result);

    /* Canonical ordering: an in-place insertion sort keyed on the
       combining class, where class 0 acts as a barrier that nothing is
       moved across. */
    prev_cc = _PyUnicode_GetRecord(PyUnicode_READ(kind, data, 0))->combining;
    for (pos = 1; pos < length; pos++) {
        cur_cc = _PyUnicode_GetRecord(
                     PyUnicode_READ(kind, data, pos))->combining;

        if (prev_cc != 0 && cur_cc != 0 && prev_cc > cur_cc) {
            /* Out of order: swap the character at `pos` backwards until
               it sits behind a character whose class is 0 or not
               greater than its own, or until it reaches the front. */
            j = pos - 1;
            do {
                Py_UCS4 moved = PyUnicode_READ(kind, data, j + 1);
                PyUnicode_WRITE(kind, data, j + 1,
                                PyUnicode_READ(kind, data, j));
                PyUnicode_WRITE(kind, data, j, moved);
                j--;
                if (j < 0) {
                    break;
                }
                prev_cc = _PyUnicode_GetRecord(
                              PyUnicode_READ(kind, data, j))->combining;
            } while (prev_cc != 0 && prev_cc > cur_cc);

            /* Position `pos` now holds a different character; reload
               its class for the next comparison. */
            prev_cc = _PyUnicode_GetRecord(
                          PyUnicode_READ(kind, data, pos))->combining;
        }
        else {
            prev_cc = cur_cc;
        }
    }
    return result;
}