mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-02-07 15:03:34 +00:00
6f7d0cb1c3
This makes breaking changes to add underscores to many non-standard function names provided by the c library. MODE=tiny is now tinier and we now use smaller locks that are better for tiny apps in this mode. Some headers have been renamed to be in the same folder as the build package, so it'll be easier to know which build dependency is needed. Certain old misguided interfaces have been removed. Intel intrinsics headers are now listed in libc/isystem (but not in the amalgamation) to help further improve open source compatibility. Header complexity has also been reduced. Lastly, more shell scripts are now available.
141 lines
5.5 KiB
C
141 lines
5.5 KiB
C
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8                                :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Python 3                                                                     │
│ https://docs.python.org/3/license.html                                       │
╚─────────────────────────────────────────────────────────────────────────────*/
|
|
#include "libc/intrin/bits.h"
|
|
#include "third_party/python/Include/pyerrors.h"
|
|
#include "third_party/python/Include/pymem.h"
|
|
#include "third_party/python/Modules/unicodedata.h"
|
|
#include "third_party/python/Modules/unicodedata_unidata.h"
|
|
/* clang-format off */
|
|
|
|
/*
 * Decompose every code point of `input` and return the result as a new
 * string object, with combining marks put into canonical order.
 *
 *   self   When non-NULL and UCD_Check(self) succeeds, it is treated as a
 *          PreviousDBVersion and its normalization() hook may substitute a
 *          code point before it is decomposed.  It is also forwarded to
 *          _PyUnicode_GetDecompRecord().
 *   input  String whose characters are read with PyUnicode_READ().
 *   k      Zero selects NFD: a code point whose decomposition record has a
 *          prefix (a compatibility decomposition) is copied unchanged.
 *          Nonzero selects NFKD: such code points are expanded as well.
 *
 * Returns the object built by PyUnicode_FromKindAndData(), or NULL.  When a
 * buffer allocation fails, PyErr_NoMemory() has been called first.
 */
PyObject *
_PyUnicode_NfdNfkd(PyObject *self, PyObject *input, int k)
{
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UCS4 pending[20];
    int npending = 0;
    Py_UCS4 *buf;
    Py_ssize_t in_len, in_pos, out_len, capacity, room;
    Py_ssize_t pos, back, result_len;
    int in_kind, out_kind;
    void *in_data, *out_data;
    int decomp_index, decomp_prefix, decomp_count;
    unsigned char prev_cc, cur_cc;
    PyObject *result;

    in_len = PyUnicode_GET_LENGTH(input);

    /* Overallocate at most 10 characters: short inputs get twice their
       length, longer ones get ten extra slots unless that would exceed
       PY_SSIZE_T_MAX. */
    if (in_len <= 10) {
        room = in_len * 2;
    }
    else if (in_len <= PY_SSIZE_T_MAX - 10) {
        room = in_len + 10;
    }
    else {
        room = in_len;
    }
    capacity = room;

    buf = PyMem_NEW(Py_UCS4, room);
    if (buf == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    out_len = 0;
    in_kind = PyUnicode_KIND(input);
    in_data = PyUnicode_DATA(input);

    for (in_pos = 0; in_pos < in_len; in_pos++) {
        /* Seed the work stack with the next input character, then drain
           it; decompositions are pushed back for further expansion. */
        pending[npending++] = PyUnicode_READ(in_kind, in_data, in_pos);

        while (npending != 0) {
            Py_UCS4 code = pending[--npending];

            /* Hangul Decomposition adds three characters in a single
               step, so we need at least that much room. */
            if (room < 3) {
                Py_UCS4 *grown;

                capacity += 10;
                room += 10;
                grown = PyMem_Realloc(buf, capacity * sizeof(Py_UCS4));
                if (grown == NULL) {
                    /* The old buffer is still owned here; release it. */
                    PyMem_Free(buf);
                    PyErr_NoMemory();
                    return NULL;
                }
                buf = grown;
            }

            /* Hangul syllables are decomposed arithmetically. */
            if (code >= _Hanghoul_SBase &&
                code < (_Hanghoul_SBase + _Hanghoul_SCount)) {
                int s_index = code - _Hanghoul_SBase;
                int lead = _Hanghoul_LBase + s_index / _Hanghoul_NCount;
                int vowel = _Hanghoul_VBase +
                            (s_index % _Hanghoul_NCount) / _Hanghoul_TCount;
                int trail = _Hanghoul_TBase + s_index % _Hanghoul_TCount;

                buf[out_len++] = lead;
                buf[out_len++] = vowel;
                room -= 2;
                /* A trailing value equal to TBase means there is no
                   third character to emit. */
                if (trail != _Hanghoul_TBase) {
                    buf[out_len++] = trail;
                    room--;
                }
                continue;
            }

            /* Normalization changes: a nonzero replacement is pushed and
               processed in place of the original code point. */
            if (self && UCD_Check(self)) {
                Py_UCS4 replacement =
                    ((PreviousDBVersion*)self)->normalization(code);
                if (replacement != 0) {
                    pending[npending++] = replacement;
                    continue;
                }
            }

            /* Other decompositions. */
            _PyUnicode_GetDecompRecord(self, code, &decomp_index,
                                       &decomp_prefix, &decomp_count);

            /* Copy the character if it is not decomposable, or if it has
               a compatibility decomposition but we are doing NFD. */
            if (!decomp_count || (decomp_prefix && !k)) {
                buf[out_len++] = code;
                room--;
                continue;
            }

            /* Push the decomposition last element first, so that popping
               yields its elements in forward order. */
            while (decomp_count != 0) {
                decomp_count--;
                pending[npending++] = _bextra(_PyUnicode_Decomp,
                                              decomp_index + decomp_count,
                                              _PyUnicode_DecompBits);
            }
        }
    }

    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf, out_len);
    PyMem_Free(buf);
    if (result == NULL) {
        return NULL;
    }

    /* result is guaranteed to be ready, as it is compact. */
    out_kind = PyUnicode_KIND(result);
    out_data = PyUnicode_DATA(result);
    result_len = PyUnicode_GET_LENGTH(result);

    /* Sort canonically: walk the result and, wherever two neighbouring
       characters with nonzero combining values are out of ascending
       order, move the later one backwards by repeated swaps. */
    prev_cc = _PyUnicode_GetRecord(
        PyUnicode_READ(out_kind, out_data, 0))->combining;
    for (pos = 1; pos < result_len; pos++) {
        cur_cc = _PyUnicode_GetRecord(
            PyUnicode_READ(out_kind, out_data, pos))->combining;

        if (prev_cc != 0 && cur_cc != 0 && prev_cc > cur_cc) {
            /* Non-canonical order. Need to switch *pos with previous. */
            back = pos - 1;
            do {
                Py_UCS4 left = PyUnicode_READ(out_kind, out_data, back);
                Py_UCS4 right = PyUnicode_READ(out_kind, out_data, back + 1);

                PyUnicode_WRITE(out_kind, out_data, back + 1, left);
                PyUnicode_WRITE(out_kind, out_data, back, right);
                back--;
                if (back < 0) {
                    break;
                }
                prev_cc = _PyUnicode_GetRecord(
                    PyUnicode_READ(out_kind, out_data, back))->combining;
            } while (prev_cc != 0 && prev_cc > cur_cc);

            /* The character now at pos is the one that was shifted right;
               it becomes the predecessor for the next comparison. */
            prev_cc = _PyUnicode_GetRecord(
                PyUnicode_READ(out_kind, out_data, pos))->combining;
        }
        else {
            prev_cc = cur_cc;
        }
    }
    return result;
}
|