cosmopolitan/third_party/python/Modules/unicodedata_nfdnfkd.c

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8                                :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Python 3                                                                     │
│ https://docs.python.org/3/license.html                                       │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/intrin/bits.h"
#include "third_party/python/Include/pyerrors.h"
#include "third_party/python/Include/pymem.h"
#include "third_party/python/Modules/unicodedata.h"
#include "third_party/python/Modules/unicodedata_unidata.h"
/* clang-format off */

PyObject *
_PyUnicode_NfdNfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UCS4 *output;
    Py_ssize_t i, o, osize;
    int kind;
    void *data;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UCS4 stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;
    stackptr = 0;
    isize = PyUnicode_GET_LENGTH(input);
    space = isize;
    /* Overallocate at most 10 characters. */
    if (space > 10) {
        if (space <= PY_SSIZE_T_MAX - 10)
            space += 10;
    }
    else {
        space *= 2;
    }
    osize = space;
    output = PyMem_NEW(Py_UCS4, space);
    if (!output) {
        PyErr_NoMemory();
        return NULL;
    }
    i = o = 0;
    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);
    while (i < isize) {
        stack[stackptr++] = PyUnicode_READ(kind, data, i++);
        while(stackptr) {
            Py_UCS4 code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_UCS4 *new_output;
                osize += 10;
                space += 10;
                new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
                if (new_output == NULL) {
                    PyMem_Free(output);
                    PyErr_NoMemory();
                    return NULL;
                }
                output = new_output;
            }
            /* Hangul Decomposition. */
            if (_Hanghoul_SBase <= code && code < (_Hanghoul_SBase + _Hanghoul_SCount)) {
                int SIndex = code - _Hanghoul_SBase;
                int L = _Hanghoul_LBase + SIndex / _Hanghoul_NCount;
                int V = _Hanghoul_VBase + (SIndex % _Hanghoul_NCount) / _Hanghoul_TCount;
                int T = _Hanghoul_TBase + SIndex % _Hanghoul_TCount;
                output[o++] = L;
                output[o++] = V;
                space -= 2;
                if (T != _Hanghoul_TBase) {
                    output[o++] = T;
                    space --;
                }
                continue;
            }
            /* normalization changes */
            if (self && UCD_Check(self)) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }
            /* Other decompositions. */
            _PyUnicode_GetDecompRecord(self, code, &index, &prefix, &count);
            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                output[o++] = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order.  */
            while(count) {
                code = _bextra(_PyUnicode_Decomp,
                               index + (--count),
                               _PyUnicode_DecompBits);
                stack[stackptr++] = code;
            }
        }
    }
    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    if (!result)
        return NULL;
    /* result is guaranteed to be ready, as it is compact. */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);
    /* Sort canonically. */
    i = 0;
    prev = _PyUnicode_GetRecord(PyUnicode_READ(kind, data, i))->combining;
    for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
        cur = _PyUnicode_GetRecord(PyUnicode_READ(kind, data, i))->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
            PyUnicode_WRITE(kind, data, o+1,
                            PyUnicode_READ(kind, data, o));
            PyUnicode_WRITE(kind, data, o, tmp);
            o--;
            if (o < 0)
                break;
            prev = _PyUnicode_GetRecord(PyUnicode_READ(kind, data, o))->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _PyUnicode_GetRecord(PyUnicode_READ(kind, data, i))->combining;
    }
    return result;
}