2021-08-13 10:20:45 +00:00
|
|
|
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
|
|
|
|
│ vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi │
|
|
|
|
╞══════════════════════════════════════════════════════════════════════════════╡
|
|
|
|
│ Python 3 │
|
|
|
|
│ https://docs.python.org/3/license.html │
|
|
|
|
╚─────────────────────────────────────────────────────────────────────────────*/
|
2021-08-12 07:42:14 +00:00
|
|
|
#define PY_SSIZE_T_CLEAN
|
2023-06-17 03:05:24 +00:00
|
|
|
#include "third_party/python/Modules/unicodedata.h"
|
2021-09-28 05:58:51 +00:00
|
|
|
#include "libc/nexgen32e/kompressor.h"
|
2021-08-12 07:42:14 +00:00
|
|
|
#include "third_party/python/Include/floatobject.h"
|
2021-09-07 18:40:11 +00:00
|
|
|
#include "third_party/python/Include/import.h"
|
2021-08-12 07:42:14 +00:00
|
|
|
#include "third_party/python/Include/longobject.h"
|
|
|
|
#include "third_party/python/Include/modsupport.h"
|
|
|
|
#include "third_party/python/Include/objimpl.h"
|
|
|
|
#include "third_party/python/Include/pycapsule.h"
|
|
|
|
#include "third_party/python/Include/pyctype.h"
|
|
|
|
#include "third_party/python/Include/pyerrors.h"
|
|
|
|
#include "third_party/python/Include/pymacro.h"
|
|
|
|
#include "third_party/python/Include/structmember.h"
|
|
|
|
#include "third_party/python/Include/ucnhash.h"
|
2021-09-05 08:20:03 +00:00
|
|
|
#include "third_party/python/Include/yoink.h"
|
2023-07-06 13:57:28 +00:00
|
|
|
#include "third_party/python/Modules/bextra.h"
|
2021-09-28 05:58:51 +00:00
|
|
|
#include "third_party/python/Modules/unicodedata_unidata.h"
|
2021-08-12 07:42:14 +00:00
|
|
|
|
2021-09-05 08:20:03 +00:00
|
|
|
PYTHON_PROVIDE("unicodedata");
|
2021-09-07 02:24:10 +00:00
|
|
|
PYTHON_PROVIDE("unicodedata.UCD");
|
|
|
|
PYTHON_PROVIDE("unicodedata.bidirectional");
|
|
|
|
PYTHON_PROVIDE("unicodedata.category");
|
|
|
|
PYTHON_PROVIDE("unicodedata.combining");
|
|
|
|
PYTHON_PROVIDE("unicodedata.decimal");
|
|
|
|
PYTHON_PROVIDE("unicodedata.decomposition");
|
|
|
|
PYTHON_PROVIDE("unicodedata.digit");
|
|
|
|
PYTHON_PROVIDE("unicodedata.east_asian_width");
|
|
|
|
PYTHON_PROVIDE("unicodedata.lookup");
|
|
|
|
PYTHON_PROVIDE("unicodedata.mirrored");
|
|
|
|
PYTHON_PROVIDE("unicodedata.name");
|
|
|
|
PYTHON_PROVIDE("unicodedata.normalize");
|
|
|
|
PYTHON_PROVIDE("unicodedata.numeric");
|
|
|
|
PYTHON_PROVIDE("unicodedata.ucd_3_2_0");
|
|
|
|
PYTHON_PROVIDE("unicodedata.ucnhash_CAPI");
|
|
|
|
PYTHON_PROVIDE("unicodedata.unidata_version");
|
2021-09-05 08:20:03 +00:00
|
|
|
|
2021-08-08 04:08:33 +00:00
|
|
|
/* ------------------------------------------------------------------------
|
|
|
|
|
|
|
|
unicodedata -- Provides access to the Unicode database.
|
|
|
|
|
|
|
|
Data was extracted from the UnicodeData.txt file.
|
|
|
|
The current version number is reported in the unidata_version constant.
|
|
|
|
|
|
|
|
Written by Marc-Andre Lemburg (mal@lemburg.com).
|
|
|
|
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
|
|
|
|
Modified by Martin v. Löwis (martin@v.loewis.de)
|
|
|
|
|
|
|
|
Copyright (c) Corporation for National Research Initiatives.
|
|
|
|
|
|
|
|
------------------------------------------------------------------------ */
|
|
|
|
|
|
|
|
/*[clinic input]
|
|
|
|
module unicodedata
|
|
|
|
class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
|
|
|
|
[clinic start generated code]*/
|
|
|
|
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
|
|
|
|
|
2021-08-10 17:26:13 +00:00
|
|
|
#include "third_party/python/Modules/clinic/unicodedata.inc"
|
2021-08-08 04:08:33 +00:00
|
|
|
|
2021-09-28 05:58:51 +00:00
|
|
|
/* ------------- Previous-version API ------------------------------------- */
|
2021-08-08 04:08:33 +00:00
|
|
|
|
|
|
|
static PyMemberDef DB_members[] = {
|
2021-09-28 05:58:51 +00:00
|
|
|
{"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
|
|
|
|
{0}
|
2021-08-08 04:08:33 +00:00
|
|
|
};
|
|
|
|
|
2021-09-28 05:58:51 +00:00
|
|
|
static PyObject *
|
2022-07-22 04:46:07 +00:00
|
|
|
new_previous_version(const char *name,
|
2021-09-28 05:58:51 +00:00
|
|
|
const _PyUnicode_ChangeRecord* (*getrecord)(Py_UCS4),
|
2021-08-08 04:08:33 +00:00
|
|
|
Py_UCS4 (*normalization)(Py_UCS4))
|
|
|
|
{
|
2021-09-28 05:58:51 +00:00
|
|
|
PreviousDBVersion *self;
|
|
|
|
self = PyObject_New(PreviousDBVersion, &UCD_Type);
|
|
|
|
if (self == NULL)
|
|
|
|
return NULL;
|
|
|
|
self->name = name;
|
|
|
|
self->getrecord = getrecord;
|
|
|
|
self->normalization = normalization;
|
|
|
|
return (PyObject*)self;
|
2021-08-08 04:08:33 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* --- Module API --------------------------------------------------------- */
|
|
|
|
|
|
|
|
/*[clinic input]
|
|
|
|
unicodedata.UCD.decimal
|
|
|
|
|
|
|
|
self: self
|
|
|
|
chr: int(accept={str})
|
|
|
|
default: object=NULL
|
|
|
|
/
|
|
|
|
|
|
|
|
Converts a Unicode character into its equivalent decimal value.
|
|
|
|
|
|
|
|
Returns the decimal value assigned to the character chr as integer.
|
|
|
|
If no such value is defined, default is returned, or, if not given,
|
|
|
|
ValueError is raised.
|
|
|
|
[clinic start generated code]*/
|
|
|
|
|
|
|
|
static PyObject *
unicodedata_UCD_decimal_impl(PyObject *self, int chr,
                             PyObject *default_value)
/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
{
    /* Resolve the decimal value of `chr`.  When called on a previous-version
       object (UCD_Check(self)), its change record takes precedence over the
       current database.  A negative value means "no decimal value". */
    Py_UCS4 c = (Py_UCS4)chr;
    const _PyUnicode_ChangeRecord *old = NULL;
    long value;

    if (self && UCD_Check(self)) {
        old = get_old_record(self, c);
    }

    if (old != NULL && old->category_changed == 0) {
        /* unassigned */
        value = -1;
    }
    else if (old != NULL && old->decimal_changed != 0xFF) {
        /* 0xFF is treated as "no override recorded" */
        value = old->decimal_changed;
    }
    else {
        value = Py_UNICODE_TODECIMAL(c);
    }

    if (value >= 0) {
        return PyLong_FromLong(value);
    }
    if (default_value != NULL) {
        Py_INCREF(default_value);
        return default_value;
    }
    PyErr_SetString(PyExc_ValueError,
                    "not a decimal");
    return NULL;
}
|
|
|
|
|
|
|
|
/*[clinic input]
|
|
|
|
unicodedata.UCD.digit
|
|
|
|
|
|
|
|
self: self
|
|
|
|
chr: int(accept={str})
|
|
|
|
default: object=NULL
|
|
|
|
/
|
|
|
|
|
|
|
|
Converts a Unicode character into its equivalent digit value.
|
|
|
|
|
|
|
|
Returns the digit value assigned to the character chr as integer.
|
|
|
|
If no such value is defined, default is returned, or, if not given,
|
|
|
|
ValueError is raised.
|
|
|
|
[clinic start generated code]*/
|
|
|
|
|
|
|
|
static PyObject *
unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
/*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
{
    /* Look up the digit value of `chr` with Py_UNICODE_TODIGIT.  `self` is
       not consulted here.  A negative result means "no digit value": return
       a new reference to `default_value` if given, else raise ValueError. */
    long digit = Py_UNICODE_TODIGIT((Py_UCS4)chr);

    if (digit >= 0) {
        return PyLong_FromLong(digit);
    }
    if (default_value != NULL) {
        Py_INCREF(default_value);
        return default_value;
    }
    PyErr_SetString(PyExc_ValueError, "not a digit");
    return NULL;
}
|
|
|
|
|
|
|
|
/*[clinic input]
|
|
|
|
unicodedata.UCD.numeric
|
|
|
|
|
|
|
|
self: self
|
|
|
|
chr: int(accept={str})
|
|
|
|
default: object=NULL
|
|
|
|
/
|
|
|
|
|
|
|
|
Converts a Unicode character into its equivalent numeric value.
|
|
|
|
|
|
|
|
Returns the numeric value assigned to the character chr as float.
|
|
|
|
If no such value is defined, default is returned, or, if not given,
|
|
|
|
ValueError is raised.
|
|
|
|
[clinic start generated code]*/
|
|
|
|
|
|
|
|
static PyObject *
unicodedata_UCD_numeric_impl(PyObject *self, int chr,
                             PyObject *default_value)
/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
{
    /* Resolve the numeric value of `chr` as a double.  When called on a
       previous-version object, its change record takes precedence over the
       current database.  The value -1.0 is the "no numeric value" marker. */
    Py_UCS4 c = (Py_UCS4)chr;
    const _PyUnicode_ChangeRecord *old = NULL;
    double value;

    if (self && UCD_Check(self)) {
        old = get_old_record(self, c);
    }

    if (old != NULL && old->category_changed == 0) {
        /* unassigned */
        value = -1.0;
    }
    else if (old != NULL && old->decimal_changed != 0xFF) {
        /* 0xFF is treated as "no override recorded" */
        value = old->decimal_changed;
    }
    else {
        value = Py_UNICODE_TONUMERIC(c);
    }

    if (value != -1.0) {
        return PyFloat_FromDouble(value);
    }
    if (default_value != NULL) {
        Py_INCREF(default_value);
        return default_value;
    }
    PyErr_SetString(PyExc_ValueError, "not a numeric character");
    return NULL;
}
|
|
|
|
|
|
|
|
/*[clinic input]
|
|
|
|
unicodedata.UCD.category
|
|
|
|
|
|
|
|
self: self
|
|
|
|
chr: int(accept={str})
|
|
|
|
/
|
|
|
|
|
|
|
|
Returns the general category assigned to the character chr as string.
|
|
|
|
[clinic start generated code]*/
|
|
|
|
|
|
|
|
static PyObject *
unicodedata_UCD_category_impl(PyObject *self, int chr)
/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
{
    /* Start from the current database's category index; a previous-version
       object overrides it unless its record holds 0xFF for this field. */
    Py_UCS4 c = (Py_UCS4)chr;
    int category = (int) _PyUnicode_GetRecord(c)->category;

    if (self && UCD_Check(self)) {
        const _PyUnicode_ChangeRecord *old = get_old_record(self, c);
        if (old->category_changed != 0xFF) {
            category = old->category_changed;
        }
    }
    /* The index selects the category's name string. */
    return PyUnicode_FromString(_PyUnicode_CategoryNames[category]);
}
|
|
|
|
|
|
|
|
/*[clinic input]
|
|
|
|
unicodedata.UCD.bidirectional
|
|
|
|
|
|
|
|
self: self
|
|
|
|
chr: int(accept={str})
|
|
|
|
/
|
|
|
|
|
|
|
|
Returns the bidirectional class assigned to the character chr as string.
|
|
|
|
|
|
|
|
If no such value is defined, an empty string is returned.
|
|
|
|
[clinic start generated code]*/
|
|
|
|
|
|
|
|
static PyObject *
unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
{
    /* Start from the current database's bidirectional index; a
       previous-version object may force index 0 (unassigned) or supply its
       own value. */
    Py_UCS4 c = (Py_UCS4)chr;
    int bidi = (int) _PyUnicode_GetRecord(c)->bidirectional;

    if (self && UCD_Check(self)) {
        const _PyUnicode_ChangeRecord *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            bidi = 0; /* unassigned */
        }
        else if (old->bidir_changed != 0xFF) {
            bidi = old->bidir_changed;
        }
    }
    return PyUnicode_FromString(_PyUnicode_BidirectionalNames[bidi]);
}
|
|
|
|
|
|
|
|
/*[clinic input]
|
|
|
|
unicodedata.UCD.combining -> int
|
|
|
|
|
|
|
|
self: self
|
|
|
|
chr: int(accept={str})
|
|
|
|
/
|
|
|
|
|
|
|
|
Returns the canonical combining class assigned to the character chr as integer.
|
|
|
|
|
|
|
|
Returns 0 if no combining class is defined.
|
|
|
|
[clinic start generated code]*/
|
|
|
|
|
|
|
|
static int
|
|
|
|
unicodedata_UCD_combining_impl(PyObject *self, int chr)
|
|
|
|
/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
|
|
|
|
{
|
|
|
|
int index;
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
2021-09-28 05:58:51 +00:00
|
|
|
index = (int) _PyUnicode_GetRecord(c)->combining;
|
2021-08-08 04:08:33 +00:00
|
|
|
if (self && UCD_Check(self)) {
|
2021-09-28 05:58:51 +00:00
|
|
|
const _PyUnicode_ChangeRecord *old = get_old_record(self, c);
|
2021-08-08 04:08:33 +00:00
|
|
|
if (old->category_changed == 0)
|
|
|
|
index = 0; /* unassigned */
|
|
|
|
}
|
|
|
|
return index;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*[clinic input]
|
|
|
|
unicodedata.UCD.mirrored -> int
|
|
|
|
|
|
|
|
self: self
|
|
|
|
chr: int(accept={str})
|
|
|
|
/
|
|
|
|
|
|
|
|
Returns the mirrored property assigned to the character chr as integer.
|
|
|
|
|
|
|
|
Returns 1 if the character has been identified as a "mirrored"
|
|
|
|
character in bidirectional text, 0 otherwise.
|
|
|
|
[clinic start generated code]*/
|
|
|
|
|
|
|
|
static int
|
|
|
|
unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
|
|
|
|
/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
|
|
|
|
{
|
|
|
|
int index;
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
2021-09-28 05:58:51 +00:00
|
|
|
index = (int) _PyUnicode_GetRecord(c)->mirrored;
|
2021-08-08 04:08:33 +00:00
|
|
|
if (self && UCD_Check(self)) {
|
2021-09-28 05:58:51 +00:00
|
|
|
const _PyUnicode_ChangeRecord *old = get_old_record(self, c);
|
2021-08-08 04:08:33 +00:00
|
|
|
if (old->category_changed == 0)
|
|
|
|
index = 0; /* unassigned */
|
|
|
|
else if (old->mirrored_changed != 0xFF)
|
|
|
|
index = old->mirrored_changed;
|
|
|
|
}
|
|
|
|
return index;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*[clinic input]
|
|
|
|
unicodedata.UCD.east_asian_width
|
|
|
|
|
|
|
|
self: self
|
|
|
|
chr: int(accept={str})
|
|
|
|
/
|
|
|
|
|
|
|
|
Returns the east asian width assigned to the character chr as string.
|
|
|
|
[clinic start generated code]*/
|
|
|
|
|
|
|
|
static PyObject *
unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
{
    /* Start from the current database's east-asian-width index; a
       previous-version object may force index 0 (unassigned) or supply its
       own value. */
    Py_UCS4 c = (Py_UCS4)chr;
    int width = (int) _PyUnicode_GetRecord(c)->east_asian_width;

    if (self && UCD_Check(self)) {
        const _PyUnicode_ChangeRecord *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            width = 0; /* unassigned */
        }
        else if (old->east_asian_width_changed != 0xFF) {
            width = old->east_asian_width_changed;
        }
    }
    return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[width]);
}
|
|
|
|
|
|
|
|
/*[clinic input]
|
|
|
|
unicodedata.UCD.decomposition
|
|
|
|
|
|
|
|
self: self
|
|
|
|
chr: int(accept={str})
|
|
|
|
/
|
|
|
|
|
|
|
|
Returns the character decomposition mapping assigned to the character chr as string.
|
|
|
|
|
|
|
|
An empty string is returned in case no such mapping is defined.
|
|
|
|
[clinic start generated code]*/
|
|
|
|
|
|
|
|
static PyObject *
unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
{
    char decomp[256];
    Py_UCS4 c = (Py_UCS4)chr;
    int code = (int)c;
    int index, count;
    unsigned int prefix_index;
    size_t len;

    /* A previous-version object that reports the character as unassigned
       has no decomposition at all. */
    if (self && UCD_Check(self)) {
        const _PyUnicode_ChangeRecord *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            return PyUnicode_FromString(""); /* unassigned */
        }
    }

    /* Two-level table lookup; out-of-range code points use entry 0. */
    if (code >= 0 && code < 0x110000) {
        index = _PyUnicode_DecompIndex1[(code>>_PyUnicode_DecompShift)];
        index = _PyUnicode_DecompIndex2[(index<<_PyUnicode_DecompShift)+
                                        (code&((1<<_PyUnicode_DecompShift)-1))];
    }
    else {
        index = 0;
    }

    /* The entry at `index` is a header word: the bits above the low byte
       give the number of code-point entries that follow it, and the low
       byte is an index into the prefix table. */
    count = BitFieldExtract(_PyUnicode_Decomp, index, _PyUnicode_DecompBits) >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and _PyUnicode_Decomp is
       generated from Tools/unicode/makeunicodedata.py, it should not be
       possible to overflow _PyUnicode_DecompPrefix. */
    prefix_index = BitFieldExtract(_PyUnicode_Decomp, index, _PyUnicode_DecompBits) & 255;
    assert(prefix_index < Py_ARRAY_LENGTH(_PyUnicode_DecompPrefix));

    /* copy prefix (without its NUL terminator) */
    len = strlen(_PyUnicode_DecompPrefix[prefix_index]);
    memcpy(decomp, _PyUnicode_DecompPrefix[prefix_index], len);

    /* Append each following entry as upper-case hex, at least four digits,
       separated by single spaces. */
    for (; count > 0; count--) {
        if (len) {
            decomp[len++] = ' ';
        }
        assert(len < sizeof(decomp));
        PyOS_snprintf(decomp + len, sizeof(decomp) - len, "%04X",
                      BitFieldExtract(_PyUnicode_Decomp, ++index,
                                      _PyUnicode_DecompBits));
        len += strlen(decomp + len);
    }
    return PyUnicode_FromStringAndSize(decomp, len);
}
|
|
|
|
|
|
|
|
/*[clinic input]
|
|
|
|
unicodedata.UCD.normalize
|
|
|
|
|
|
|
|
self: self
|
|
|
|
form: str
|
|
|
|
unistr as input: object(subclass_of='&PyUnicode_Type')
|
|
|
|
/
|
|
|
|
|
|
|
|
Return the normal form 'form' for the Unicode string unistr.
|
|
|
|
|
|
|
|
Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
|
|
|
|
[clinic start generated code]*/
|
|
|
|
|
|
|
|
static PyObject *
unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
                               PyObject *input)
/*[clinic end generated code: output=62d1f8870027efdc input=cd092e631cf11883]*/
{
    /* One row per accepted form name.  `compose` selects
       _PyUnicode_NfcNfkc (1) or _PyUnicode_NfdNfkd (0) and is also passed as
       the third argument of _PyUnicode_IsNormalized; `compat` is passed as
       the last argument of both calls. */
    static const struct {
        const char *name;
        int compose;
        int compat;
    } forms[] = {
        {"NFC",  1, 0},
        {"NFKC", 1, 1},
        {"NFD",  0, 0},
        {"NFKD", 0, 1},
    };
    size_t i;

    if (PyUnicode_READY(input) == -1) {
        return NULL;
    }
    if (PyUnicode_GET_LENGTH(input) == 0) {
        /* Special case empty input strings, since resizing
           them later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    for (i = 0; i < sizeof(forms) / sizeof(forms[0]); i++) {
        if (strcmp(form, forms[i].name) != 0) {
            continue;
        }
        /* Quick check: hand back the input itself when it is reported as
           already normalized. */
        if (_PyUnicode_IsNormalized(self, input,
                                    forms[i].compose, forms[i].compat)) {
            Py_INCREF(input);
            return input;
        }
        if (forms[i].compose) {
            return _PyUnicode_NfcNfkc(self, input, forms[i].compat);
        }
        return _PyUnicode_NfdNfkd(self, input, forms[i].compat);
    }

    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}
|
|
|
|
|
|
|
|
/* -------------------------------------------------------------------- */
|
|
|
|
/* database code (cut and pasted from the unidb package) */
|
|
|
|
|
|
|
|
/* Macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 *
 * Each range is half-open: [Start, End).  The argument is parenthesized so
 * that expression arguments (e.g. `a & b`) expand with the intended
 * precedence.  Note that `cp` is evaluated twice, so it must not have side
 * effects. */
#define IS_ALIAS(cp) (((cp) >= _PyUnicode_AliasesStart) && \
                      ((cp) < _PyUnicode_AliasesEnd))
#define IS_NAMED_SEQ(cp) (((cp) >= _PyUnicode_NamedSequencesStart) && \
                          ((cp) < _PyUnicode_NamedSequencesEnd))
|
2021-08-08 04:08:33 +00:00
|
|
|
|
|
|
|
static const _PyUnicode_Name_CAPI hashAPI =
|
|
|
|
{
|
|
|
|
sizeof(_PyUnicode_Name_CAPI),
|
2021-09-28 05:58:51 +00:00
|
|
|
_PyUnicode_GetUcName,
|
|
|
|
_PyUnicode_GetCode,
|
2021-08-08 04:08:33 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/* -------------------------------------------------------------------- */
|
|
|
|
/* Python bindings */
|
|
|
|
|
|
|
|
/*[clinic input]
|
|
|
|
unicodedata.UCD.name
|
|
|
|
|
|
|
|
self: self
|
|
|
|
chr: int(accept={str})
|
|
|
|
default: object=NULL
|
|
|
|
/
|
|
|
|
|
|
|
|
Returns the name assigned to the character chr as a string.
|
|
|
|
|
|
|
|
If no name is defined, default is returned, or, if not given,
|
|
|
|
ValueError is raised.
|
|
|
|
[clinic start generated code]*/
|
|
|
|
|
|
|
|
static PyObject *
unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
{
    /* The buffer holds UNIDATA_NAME_MAXLEN characters plus one extra byte;
       the lookup itself is told the length without that extra byte. */
    char name[UNIDATA_NAME_MAXLEN+1];

    if (_PyUnicode_GetUcName(self, (Py_UCS4)chr, name, UNIDATA_NAME_MAXLEN, 0)) {
        return PyUnicode_FromString(name);
    }
    /* No name found: fall back to the caller's default, if any. */
    if (default_value != NULL) {
        Py_INCREF(default_value);
        return default_value;
    }
    PyErr_SetString(PyExc_ValueError, "no such name");
    return NULL;
}
|
|
|
|
|
|
|
|
/*[clinic input]
|
|
|
|
unicodedata.UCD.lookup
|
|
|
|
|
|
|
|
self: self
|
|
|
|
name: str(accept={str, robuffer}, zeroes=True)
|
|
|
|
/
|
|
|
|
|
|
|
|
Look up character by name.
|
|
|
|
|
|
|
|
If a character with the given name is found, return the
|
|
|
|
corresponding character. If not found, KeyError is raised.
|
|
|
|
[clinic start generated code]*/
|
|
|
|
|
|
|
|
static PyObject *
|
|
|
|
unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
|
|
|
|
Py_ssize_clean_t name_length)
|
|
|
|
/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
|
|
|
|
{
|
|
|
|
Py_UCS4 code;
|
|
|
|
unsigned int index;
|
2021-09-28 05:58:51 +00:00
|
|
|
if (name_length > UNIDATA_NAME_MAXLEN) {
|
2021-08-08 04:08:33 +00:00
|
|
|
PyErr_SetString(PyExc_KeyError, "name too long");
|
|
|
|
return NULL;
|
|
|
|
}
|
2021-09-28 05:58:51 +00:00
|
|
|
if (!_PyUnicode_GetCode(self, name, (int)name_length, &code, 1)) {
|
2021-08-08 04:08:33 +00:00
|
|
|
PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
/* check if code is in the PUA range that we use for named sequences
|
|
|
|
and convert it */
|
|
|
|
if (IS_NAMED_SEQ(code)) {
|
2021-09-28 05:58:51 +00:00
|
|
|
index = code - _PyUnicode_NamedSequencesStart;
|
2021-08-08 04:08:33 +00:00
|
|
|
return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
|
2021-09-28 05:58:51 +00:00
|
|
|
_PyUnicode_NamedSequences[index].seq,
|
|
|
|
_PyUnicode_NamedSequences[index].seqlen);
|
2021-08-08 04:08:33 +00:00
|
|
|
}
|
|
|
|
return PyUnicode_FromOrdinal(code);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* XXX Add doc strings. */
|
|
|
|
|
|
|
|
static PyMethodDef unicodedata_functions[] = {
|
|
|
|
UNICODEDATA_UCD_DECIMAL_METHODDEF
|
|
|
|
UNICODEDATA_UCD_DIGIT_METHODDEF
|
|
|
|
UNICODEDATA_UCD_NUMERIC_METHODDEF
|
|
|
|
UNICODEDATA_UCD_CATEGORY_METHODDEF
|
|
|
|
UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
|
|
|
|
UNICODEDATA_UCD_COMBINING_METHODDEF
|
|
|
|
UNICODEDATA_UCD_MIRRORED_METHODDEF
|
|
|
|
UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
|
|
|
|
UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
|
|
|
|
UNICODEDATA_UCD_NAME_METHODDEF
|
|
|
|
UNICODEDATA_UCD_LOOKUP_METHODDEF
|
|
|
|
UNICODEDATA_UCD_NORMALIZE_METHODDEF
|
|
|
|
{NULL, NULL} /* sentinel */
|
|
|
|
};
|
|
|
|
|
|
|
|
PyDoc_STRVAR(unicodedata_docstring,
|
|
|
|
"This module provides access to the Unicode Character Database which\n\
|
|
|
|
defines character properties for all Unicode characters. The data in\n\
|
|
|
|
this database is based on the UnicodeData.txt file version\n\
|
|
|
|
" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
|
|
|
|
\n\
|
|
|
|
The module uses the same names and symbols as defined by the\n\
|
|
|
|
UnicodeData File Format " UNIDATA_VERSION ".");
|
|
|
|
|
|
|
|
static struct PyModuleDef unicodedatamodule = {
|
2021-09-28 05:58:51 +00:00
|
|
|
PyModuleDef_HEAD_INIT,
|
|
|
|
"unicodedata",
|
|
|
|
unicodedata_docstring,
|
|
|
|
-1,
|
|
|
|
unicodedata_functions
|
2021-08-08 04:08:33 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
PyMODINIT_FUNC
|
|
|
|
PyInit_unicodedata(void)
|
|
|
|
{
|
|
|
|
PyObject *m, *v;
|
2021-09-28 05:58:51 +00:00
|
|
|
UCD_Type.tp_dealloc = (destructor)PyObject_Del;
|
|
|
|
UCD_Type.tp_getattro = PyObject_GenericGetAttr;
|
|
|
|
UCD_Type.tp_flags = Py_TPFLAGS_DEFAULT;
|
|
|
|
UCD_Type.tp_methods = unicodedata_functions;
|
|
|
|
UCD_Type.tp_members = DB_members;
|
2021-08-08 04:08:33 +00:00
|
|
|
Py_TYPE(&UCD_Type) = &PyType_Type;
|
|
|
|
m = PyModule_Create(&unicodedatamodule);
|
|
|
|
if (!m)
|
|
|
|
return NULL;
|
|
|
|
PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
|
|
|
|
Py_INCREF(&UCD_Type);
|
|
|
|
PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
|
|
|
|
/* Previous versions */
|
2021-09-28 05:58:51 +00:00
|
|
|
v = new_previous_version("3.2.0",
|
|
|
|
_PyUnicode_GetChange_3_2_0,
|
|
|
|
_PyUnicode_Normalization_3_2_0);
|
2021-08-08 04:08:33 +00:00
|
|
|
if (v != NULL)
|
|
|
|
PyModule_AddObject(m, "ucd_3_2_0", v);
|
|
|
|
/* Export C API */
|
|
|
|
v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
|
|
|
|
if (v != NULL)
|
|
|
|
PyModule_AddObject(m, "ucnhash_CAPI", v);
|
|
|
|
return m;
|
|
|
|
}
|
|
|
|
|
2023-06-17 03:05:24 +00:00
|
|
|
#ifdef __aarch64__
|
|
|
|
_Section(".rodata.pytab.1 //")
|
|
|
|
#else
|
|
|
|
_Section(".rodata.pytab.1")
|
|
|
|
#endif
|
|
|
|
const struct _inittab _PyImport_Inittab_unicodedata = {
|
2021-09-07 18:40:11 +00:00
|
|
|
"unicodedata",
|
|
|
|
PyInit_unicodedata,
|
|
|
|
};
|