mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-01-31 11:37:35 +00:00
0a24b4fc3c
The *NSYNC linked list API is good enough that it deserves to be part of the C library, so this change writes an improved version of it which uses that offsetof() trick from the Linux Kernel. We vendor all of the *NSYNC tests in third_party which helped confirm the needed refactoring is safe. This change also deletes more old code that didn't pan out. My goal here is to work towards a vision where the Cosmopolitan core libraries become less experimental and more focused on curation. This better reflects the current level of quality we've managed to achieve.
148 lines
5.8 KiB
C
148 lines
5.8 KiB
C
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
|
|
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
|
|
╞══════════════════════════════════════════════════════════════════════════════╡
|
|
│ Python 3 │
|
|
│ https://docs.python.org/3/license.html │
|
|
╚─────────────────────────────────────────────────────────────────────────────*/
|
|
#include "libc/intrin/likely.h"
|
|
#include "third_party/python/Include/pyerrors.h"
|
|
#include "third_party/python/Include/pymem.h"
|
|
#include "third_party/python/Modules/bextra.h"
|
|
#include "third_party/python/Modules/unicodedata.h"
|
|
#include "third_party/python/Modules/unicodedata_unidata.h"
|
|
/* clang-format off */
|
|
|
|
/*
 * Builds the composed form of `input`.
 *
 * The string is first decomposed by _PyUnicode_NfdNfkd(self, input, k);
 * the decomposed code points are then scanned once, composing Hangul
 * L+V(+T) sequences arithmetically and every other pair through the
 * _PyUnicode_NfcFirst / _PyUnicode_NfcLast / _PyUnicode_CompData tables.
 *
 * self, input, k  are forwarded unchanged to _PyUnicode_NfdNfkd().
 *                 NOTE(review): k presumably selects the compatibility
 *                 (NFKC) variant -- confirm against _PyUnicode_NfdNfkd().
 *
 * Returns a new reference, or NULL on failure. When no pair was composed
 * the decomposed string itself is returned; otherwise a fresh string is
 * built from the UCS4 output buffer.
 */
PyObject *
_PyUnicode_NfcNfkc(PyObject *self, PyObject *input, int k)
{
    /* Capacity of the skip list; shared by the declaration and the guard. */
    enum { kMaxSkipped = 20 };
    int kind;
    void *data;
    Py_UCS4 code;
    Py_UCS4 *output;
    PyObject *result;
    int cskipped = 0;
    /* Indices of characters that were already merged into an earlier
       starter and must therefore not be emitted again. */
    Py_ssize_t skipped[kMaxSkipped];
    Py_ssize_t i, i1, o, len;
    int f, l, index, index1, comb;
    result = _PyUnicode_NfdNfkd(self, input, k);
    if (!result)
        return NULL;
    /* result will be "ready". */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);
    len = PyUnicode_GET_LENGTH(result);
    /* We allocate a buffer for the output.
       If we find that we made no changes, we still return
       the NFD result. Composition never adds characters, so
       len elements are always enough. */
    output = PyMem_NEW(Py_UCS4, len);
    if (!output) {
        PyErr_NoMemory();
        Py_DECREF(result);
        return NULL;
    }
    i = o = 0;
again:
    while (i < len) {
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list (swap in the last entry). */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        code = PyUnicode_READ(kind, data, i);
        if ((UNLIKELY(_Hanghoul_LBase <= code && code < _Hanghoul_LBase + _Hanghoul_LCount) &&
             i + 1 < len && _Hanghoul_VBase <= PyUnicode_READ(kind, data, i+1) &&
             PyUnicode_READ(kind, data, i+1) < _Hanghoul_VBase + _Hanghoul_VCount)) {
            /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
               and V character is a modern vowel (0x1161 ~ 0x1175). */
            int LIndex, VIndex;
            LIndex = code - _Hanghoul_LBase;
            VIndex = PyUnicode_READ(kind, data, i+1) - _Hanghoul_VBase;
            code = _Hanghoul_SBase + (LIndex * _Hanghoul_VCount + VIndex) * _Hanghoul_TCount;
            i+=2;
            /* The lower bound is strict: a character equal to TBase is
               not folded into the syllable. */
            if ((i < len &&
                 _Hanghoul_TBase < PyUnicode_READ(kind, data, i) &&
                 PyUnicode_READ(kind, data, i) < (_Hanghoul_TBase + _Hanghoul_TCount))) {
                /* check T character is a modern trailing consonant
                   (0x11A8 ~ 0x11C2). */
                code += PyUnicode_READ(kind, data, i) - _Hanghoul_TBase;
                i++;
            }
            output[o++] = code;
            continue;
        }
        /* code is still input[i] here */
        f = _PyUnicode_FindNfcIndex(_PyUnicode_NfcFirst, code);
        if (f == -1) {
            /* Never the first half of a composition: copy through. */
            output[o++] = code;
            i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        comb = 0;
        /* output base character for now; might be updated later. */
        output[o] = PyUnicode_READ(kind, data, i);
        while (i1 < len) {
            Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
            int comb1 = _PyUnicode_GetRecord(code1)->combining;
            if (comb) {
                if (comb1 == 0)
                    break;
                if (comb >= comb1) {
                    /* Character is blocked. */
                    i1++;
                    continue;
                }
            }
            l = _PyUnicode_FindNfcIndex(_PyUnicode_NfcLast, code1);
            /* i1 cannot be combined with i. If i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
            not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            /* Two-level lookup of the (first, last) pair; a zero
               entry means the pair has no composition. */
            index = f * UNIDATA_TOTAL_LAST + l;
            index1 = _PyUnicode_CompIndex[index >> _PyUnicode_CompShift];
            code = BitFieldExtract(_PyUnicode_CompData,
                                   (index1 << _PyUnicode_CompShift)+
                                   (index & ((1 << _PyUnicode_CompShift) - 1)),
                                   _PyUnicode_CompDataBits);
            if (code == 0)
                goto not_combinable;
            /* The skip list is a fixed-size stack array. The assert
               documents the expected invariant for debug builds; the
               runtime test keeps builds that compile assert() out from
               writing past the array, by leaving the pair uncomposed. */
            assert(cskipped < kMaxSkipped);
            if (cskipped >= kMaxSkipped)
                goto not_combinable;
            /* Replace the original character. */
            output[o] = code;
            /* Mark the second character unused. */
            skipped[cskipped++] = i1;
            i1++;
            /* The composite may itself start another composition. */
            f = _PyUnicode_FindNfcIndex(_PyUnicode_NfcFirst, output[o]);
            if (f == -1)
                break;
        }
        /* Output character was already written.
           Just advance the indices. */
        o++; i++;
    }
    if (o == len) {
        /* No changes. Return original string. */
        PyMem_Free(output);
        return result;
    }
    Py_DECREF(result);
    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    return result;
}
|