mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-02-07 23:13:34 +00:00
Status lines for Emacs and Vim have been added to the Python sources so they are easier to edit using Python's preferred coding style. Some DNS helper functions have been split across multiple files. It is nice to have one function per file whenever possible, since that way we don't need -ffunction-sections. Another reason to keep source files small is that the build will soon enforce resource limits on compilation and testing.
830 lines
28 KiB
C++
830 lines
28 KiB
C++
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
|
|
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
|
|
╞══════════════════════════════════════════════════════════════════════════════╡
|
|
│ Python 3 │
|
|
│ https://docs.python.org/3/license.html │
|
|
╚─────────────────────────────────────────────────────────────────────────────*/
|
|
/* clang-format off */
|
|
|
|
/* stringlib: codec implementations */
|
|
|
|
/* Refuse to compile unless the including template has declared itself a
   Unicode instantiation via STRINGLIB_IS_UNICODE. */
#if !STRINGLIB_IS_UNICODE
# error "codecs.h is specific to Unicode"
#endif

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char: the top bit of every byte is set, so
   (word & ASCII_CHAR_MASK) is non-zero iff at least one byte is >= 0x80. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif

/* True for a UTF-8 continuation byte, i.e. one of the form 10xxxxxx.
   The argument is evaluated twice, so it must be free of side effects. */
#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
|
|
|
|
Py_LOCAL_INLINE(Py_UCS4)
|
|
STRINGLIB(utf8_decode)(const char **inptr, const char *end,
|
|
STRINGLIB_CHAR *dest,
|
|
Py_ssize_t *outpos)
|
|
{
|
|
Py_UCS4 ch;
|
|
const char *s = *inptr;
|
|
const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
|
|
STRINGLIB_CHAR *p = dest + *outpos;
|
|
|
|
while (s < end) {
|
|
ch = (unsigned char)*s;
|
|
|
|
if (ch < 0x80) {
|
|
/* Fast path for runs of ASCII characters. Given that common UTF-8
|
|
input will consist of an overwhelming majority of ASCII
|
|
characters, we try to optimize for this case by checking
|
|
as many characters as a C 'long' can contain.
|
|
First, check if we can do an aligned read, as most CPUs have
|
|
a penalty for unaligned reads.
|
|
*/
|
|
if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
|
|
/* Help register allocation */
|
|
const char *_s = s;
|
|
STRINGLIB_CHAR *_p = p;
|
|
while (_s < aligned_end) {
|
|
/* Read a whole long at a time (either 4 or 8 bytes),
|
|
and do a fast unrolled copy if it only contains ASCII
|
|
characters. */
|
|
unsigned long value = *(unsigned long *) _s;
|
|
if (value & ASCII_CHAR_MASK)
|
|
break;
|
|
#if PY_LITTLE_ENDIAN
|
|
_p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
|
|
_p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
|
|
_p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
|
|
_p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
|
|
# if SIZEOF_LONG == 8
|
|
_p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
|
|
_p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
|
|
_p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
|
|
_p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
|
|
# endif
|
|
#else
|
|
# if SIZEOF_LONG == 8
|
|
_p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
|
|
_p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
|
|
_p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
|
|
_p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
|
|
_p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
|
|
_p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
|
|
_p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
|
|
_p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
|
|
# else
|
|
_p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
|
|
_p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
|
|
_p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
|
|
_p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
|
|
# endif
|
|
#endif
|
|
_s += SIZEOF_LONG;
|
|
_p += SIZEOF_LONG;
|
|
}
|
|
s = _s;
|
|
p = _p;
|
|
if (s == end)
|
|
break;
|
|
ch = (unsigned char)*s;
|
|
}
|
|
if (ch < 0x80) {
|
|
s++;
|
|
*p++ = ch;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if (ch < 0xE0) {
|
|
/* \xC2\x80-\xDF\xBF -- 0080-07FF */
|
|
Py_UCS4 ch2;
|
|
if (ch < 0xC2) {
|
|
/* invalid sequence
|
|
\x80-\xBF -- continuation byte
|
|
\xC0-\xC1 -- fake 0000-007F */
|
|
goto InvalidStart;
|
|
}
|
|
if (end - s < 2) {
|
|
/* unexpected end of data: the caller will decide whether
|
|
it's an error or not */
|
|
break;
|
|
}
|
|
ch2 = (unsigned char)s[1];
|
|
if (!IS_CONTINUATION_BYTE(ch2))
|
|
/* invalid continuation byte */
|
|
goto InvalidContinuation1;
|
|
ch = (ch << 6) + ch2 -
|
|
((0xC0 << 6) + 0x80);
|
|
assert ((ch > 0x007F) && (ch <= 0x07FF));
|
|
s += 2;
|
|
if (STRINGLIB_MAX_CHAR <= 0x007F ||
|
|
(STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
|
|
/* Out-of-range */
|
|
goto Return;
|
|
*p++ = ch;
|
|
continue;
|
|
}
|
|
|
|
if (ch < 0xF0) {
|
|
/* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
|
|
Py_UCS4 ch2, ch3;
|
|
if (end - s < 3) {
|
|
/* unexpected end of data: the caller will decide whether
|
|
it's an error or not */
|
|
if (end - s < 2)
|
|
break;
|
|
ch2 = (unsigned char)s[1];
|
|
if (!IS_CONTINUATION_BYTE(ch2) ||
|
|
(ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
|
|
/* for clarification see comments below */
|
|
goto InvalidContinuation1;
|
|
break;
|
|
}
|
|
ch2 = (unsigned char)s[1];
|
|
ch3 = (unsigned char)s[2];
|
|
if (!IS_CONTINUATION_BYTE(ch2)) {
|
|
/* invalid continuation byte */
|
|
goto InvalidContinuation1;
|
|
}
|
|
if (ch == 0xE0) {
|
|
if (ch2 < 0xA0)
|
|
/* invalid sequence
|
|
\xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
|
|
goto InvalidContinuation1;
|
|
} else if (ch == 0xED && ch2 >= 0xA0) {
|
|
/* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
|
|
will result in surrogates in range D800-DFFF. Surrogates are
|
|
not valid UTF-8 so they are rejected.
|
|
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
|
|
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
|
|
goto InvalidContinuation1;
|
|
}
|
|
if (!IS_CONTINUATION_BYTE(ch3)) {
|
|
/* invalid continuation byte */
|
|
goto InvalidContinuation2;
|
|
}
|
|
ch = (ch << 12) + (ch2 << 6) + ch3 -
|
|
((0xE0 << 12) + (0x80 << 6) + 0x80);
|
|
assert ((ch > 0x07FF) && (ch <= 0xFFFF));
|
|
s += 3;
|
|
if (STRINGLIB_MAX_CHAR <= 0x07FF ||
|
|
(STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
|
|
/* Out-of-range */
|
|
goto Return;
|
|
*p++ = ch;
|
|
continue;
|
|
}
|
|
|
|
if (ch < 0xF5) {
|
|
/* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
|
|
Py_UCS4 ch2, ch3, ch4;
|
|
if (end - s < 4) {
|
|
/* unexpected end of data: the caller will decide whether
|
|
it's an error or not */
|
|
if (end - s < 2)
|
|
break;
|
|
ch2 = (unsigned char)s[1];
|
|
if (!IS_CONTINUATION_BYTE(ch2) ||
|
|
(ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
|
|
/* for clarification see comments below */
|
|
goto InvalidContinuation1;
|
|
if (end - s < 3)
|
|
break;
|
|
ch3 = (unsigned char)s[2];
|
|
if (!IS_CONTINUATION_BYTE(ch3))
|
|
goto InvalidContinuation2;
|
|
break;
|
|
}
|
|
ch2 = (unsigned char)s[1];
|
|
ch3 = (unsigned char)s[2];
|
|
ch4 = (unsigned char)s[3];
|
|
if (!IS_CONTINUATION_BYTE(ch2)) {
|
|
/* invalid continuation byte */
|
|
goto InvalidContinuation1;
|
|
}
|
|
if (ch == 0xF0) {
|
|
if (ch2 < 0x90)
|
|
/* invalid sequence
|
|
\xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
|
|
goto InvalidContinuation1;
|
|
} else if (ch == 0xF4 && ch2 >= 0x90) {
|
|
/* invalid sequence
|
|
\xF4\x90\x80\80- -- 110000- overflow */
|
|
goto InvalidContinuation1;
|
|
}
|
|
if (!IS_CONTINUATION_BYTE(ch3)) {
|
|
/* invalid continuation byte */
|
|
goto InvalidContinuation2;
|
|
}
|
|
if (!IS_CONTINUATION_BYTE(ch4)) {
|
|
/* invalid continuation byte */
|
|
goto InvalidContinuation3;
|
|
}
|
|
ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
|
|
((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
|
|
assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
|
|
s += 4;
|
|
if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
|
|
(STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
|
|
/* Out-of-range */
|
|
goto Return;
|
|
*p++ = ch;
|
|
continue;
|
|
}
|
|
goto InvalidStart;
|
|
}
|
|
ch = 0;
|
|
Return:
|
|
*inptr = s;
|
|
*outpos = p - dest;
|
|
return ch;
|
|
InvalidStart:
|
|
ch = 1;
|
|
goto Return;
|
|
InvalidContinuation1:
|
|
ch = 2;
|
|
goto Return;
|
|
InvalidContinuation2:
|
|
ch = 3;
|
|
goto Return;
|
|
InvalidContinuation3:
|
|
ch = 4;
|
|
goto Return;
|
|
}
|
|
|
|
#undef ASCII_CHAR_MASK
|
|
|
|
|
|
/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
|
|
PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
|
|
UCS-1 strings don't need to handle surrogates for example. */
|
|
Py_LOCAL_INLINE(PyObject *)
|
|
STRINGLIB(utf8_encoder)(PyObject *unicode,
|
|
STRINGLIB_CHAR *data,
|
|
Py_ssize_t size,
|
|
const char *errors)
|
|
{
|
|
Py_ssize_t i; /* index into data of next input character */
|
|
char *p; /* next free byte in output buffer */
|
|
#if STRINGLIB_SIZEOF_CHAR > 1
|
|
PyObject *error_handler_obj = NULL;
|
|
PyObject *exc = NULL;
|
|
PyObject *rep = NULL;
|
|
_Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
|
|
#endif
|
|
#if STRINGLIB_SIZEOF_CHAR == 1
|
|
const Py_ssize_t max_char_size = 2;
|
|
#elif STRINGLIB_SIZEOF_CHAR == 2
|
|
const Py_ssize_t max_char_size = 3;
|
|
#else /* STRINGLIB_SIZEOF_CHAR == 4 */
|
|
const Py_ssize_t max_char_size = 4;
|
|
#endif
|
|
_PyBytesWriter writer;
|
|
|
|
assert(size >= 0);
|
|
_PyBytesWriter_Init(&writer);
|
|
|
|
if (size > PY_SSIZE_T_MAX / max_char_size) {
|
|
/* integer overflow */
|
|
return PyErr_NoMemory();
|
|
}
|
|
|
|
p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
|
|
if (p == NULL)
|
|
return NULL;
|
|
|
|
for (i = 0; i < size;) {
|
|
Py_UCS4 ch = data[i++];
|
|
|
|
if (ch < 0x80) {
|
|
/* Encode ASCII */
|
|
*p++ = (char) ch;
|
|
|
|
}
|
|
else
|
|
#if STRINGLIB_SIZEOF_CHAR > 1
|
|
if (ch < 0x0800)
|
|
#endif
|
|
{
|
|
/* Encode Latin-1 */
|
|
*p++ = (char)(0xc0 | (ch >> 6));
|
|
*p++ = (char)(0x80 | (ch & 0x3f));
|
|
}
|
|
#if STRINGLIB_SIZEOF_CHAR > 1
|
|
else if (Py_UNICODE_IS_SURROGATE(ch)) {
|
|
Py_ssize_t startpos, endpos, newpos;
|
|
Py_ssize_t k;
|
|
if (error_handler == _Py_ERROR_UNKNOWN) {
|
|
error_handler = get_error_handler(errors);
|
|
}
|
|
|
|
startpos = i-1;
|
|
endpos = startpos+1;
|
|
|
|
while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
|
|
endpos++;
|
|
|
|
/* Only overallocate the buffer if it's not the last write */
|
|
writer.overallocate = (endpos < size);
|
|
|
|
switch (error_handler)
|
|
{
|
|
case _Py_ERROR_REPLACE:
|
|
memset(p, '?', endpos - startpos);
|
|
p += (endpos - startpos);
|
|
/* fall through */
|
|
case _Py_ERROR_IGNORE:
|
|
i += (endpos - startpos - 1);
|
|
break;
|
|
|
|
case _Py_ERROR_SURROGATEPASS:
|
|
for (k=startpos; k<endpos; k++) {
|
|
ch = data[k];
|
|
*p++ = (char)(0xe0 | (ch >> 12));
|
|
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
|
*p++ = (char)(0x80 | (ch & 0x3f));
|
|
}
|
|
i += (endpos - startpos - 1);
|
|
break;
|
|
|
|
case _Py_ERROR_BACKSLASHREPLACE:
|
|
/* subtract preallocated bytes */
|
|
writer.min_size -= max_char_size * (endpos - startpos);
|
|
p = backslashreplace(&writer, p,
|
|
unicode, startpos, endpos);
|
|
if (p == NULL)
|
|
goto error;
|
|
i += (endpos - startpos - 1);
|
|
break;
|
|
|
|
case _Py_ERROR_XMLCHARREFREPLACE:
|
|
/* subtract preallocated bytes */
|
|
writer.min_size -= max_char_size * (endpos - startpos);
|
|
p = xmlcharrefreplace(&writer, p,
|
|
unicode, startpos, endpos);
|
|
if (p == NULL)
|
|
goto error;
|
|
i += (endpos - startpos - 1);
|
|
break;
|
|
|
|
case _Py_ERROR_SURROGATEESCAPE:
|
|
for (k=startpos; k<endpos; k++) {
|
|
ch = data[k];
|
|
if (!(0xDC80 <= ch && ch <= 0xDCFF))
|
|
break;
|
|
*p++ = (char)(ch & 0xff);
|
|
}
|
|
if (k >= endpos) {
|
|
i += (endpos - startpos - 1);
|
|
break;
|
|
}
|
|
startpos = k;
|
|
assert(startpos < endpos);
|
|
/* fall through */
|
|
default:
|
|
rep = unicode_encode_call_errorhandler(
|
|
errors, &error_handler_obj, "utf-8", "surrogates not allowed",
|
|
unicode, &exc, startpos, endpos, &newpos);
|
|
if (!rep)
|
|
goto error;
|
|
|
|
/* subtract preallocated bytes */
|
|
writer.min_size -= max_char_size * (newpos - startpos);
|
|
|
|
if (PyBytes_Check(rep)) {
|
|
p = _PyBytesWriter_WriteBytes(&writer, p,
|
|
PyBytes_AS_STRING(rep),
|
|
PyBytes_GET_SIZE(rep));
|
|
}
|
|
else {
|
|
/* rep is unicode */
|
|
if (PyUnicode_READY(rep) < 0)
|
|
goto error;
|
|
|
|
if (!PyUnicode_IS_ASCII(rep)) {
|
|
raise_encode_exception(&exc, "utf-8", unicode,
|
|
startpos, endpos,
|
|
"surrogates not allowed");
|
|
goto error;
|
|
}
|
|
|
|
p = _PyBytesWriter_WriteBytes(&writer, p,
|
|
PyUnicode_DATA(rep),
|
|
PyUnicode_GET_LENGTH(rep));
|
|
}
|
|
|
|
if (p == NULL)
|
|
goto error;
|
|
Py_CLEAR(rep);
|
|
|
|
i = newpos;
|
|
}
|
|
|
|
/* If overallocation was disabled, ensure that it was the last
|
|
write. Otherwise, we missed an optimization */
|
|
assert(writer.overallocate || i == size);
|
|
}
|
|
else
|
|
#if STRINGLIB_SIZEOF_CHAR > 2
|
|
if (ch < 0x10000)
|
|
#endif
|
|
{
|
|
*p++ = (char)(0xe0 | (ch >> 12));
|
|
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
|
*p++ = (char)(0x80 | (ch & 0x3f));
|
|
}
|
|
#if STRINGLIB_SIZEOF_CHAR > 2
|
|
else /* ch >= 0x10000 */
|
|
{
|
|
assert(ch <= MAX_UNICODE);
|
|
/* Encode UCS4 Unicode ordinals */
|
|
*p++ = (char)(0xf0 | (ch >> 18));
|
|
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
|
|
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
|
*p++ = (char)(0x80 | (ch & 0x3f));
|
|
}
|
|
#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
|
|
#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
|
|
}
|
|
|
|
#if STRINGLIB_SIZEOF_CHAR > 1
|
|
Py_XDECREF(error_handler_obj);
|
|
Py_XDECREF(exc);
|
|
#endif
|
|
return _PyBytesWriter_Finish(&writer, p);
|
|
|
|
#if STRINGLIB_SIZEOF_CHAR > 1
|
|
error:
|
|
Py_XDECREF(rep);
|
|
Py_XDECREF(error_handler_obj);
|
|
Py_XDECREF(exc);
|
|
_PyBytesWriter_Dealloc(&writer);
|
|
return NULL;
|
|
#endif
|
|
}
|
|
|
|
/* The pattern for constructing UCS2-repeated masks: a 1 in the low bit of
   every 16-bit lane of a C 'long', so multiplying it by a 16-bit value
   replicates that value into every lane. */
#if SIZEOF_LONG == 8
# define UCS2_REPEAT_MASK 0x0001000100010001ul
#elif SIZEOF_LONG == 4
# define UCS2_REPEAT_MASK 0x00010001ul
#else
# error C 'long' size should be either 4 or 8!
#endif

/* The mask for fast checking. */
#if STRINGLIB_SIZEOF_CHAR == 1
/* The mask for fast checking of whether a C 'long' contains a
   non-ASCII or non-Latin1 UTF16-encoded characters. */
# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
#else
/* The mask for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
*/
# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * 0x8000u)
#endif
/* The mask for fast byte-swapping: selects the low byte of every 16-bit
   lane. */
#define STRIPPED_MASK           (UCS2_REPEAT_MASK * 0x00FFu)
/* Swap the two bytes inside every 16-bit lane of `value`.  The argument is
   evaluated twice. */
#define SWAB(value)             ((((value) >> 8) & STRIPPED_MASK) | \
                                 (((value) & STRIPPED_MASK) << 8))
|
|
|
|
Py_LOCAL_INLINE(Py_UCS4)
|
|
STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
|
|
STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
|
|
int native_ordering)
|
|
{
|
|
Py_UCS4 ch;
|
|
const unsigned char *aligned_end =
|
|
(const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
|
|
const unsigned char *q = *inptr;
|
|
STRINGLIB_CHAR *p = dest + *outpos;
|
|
/* Offsets from q for retrieving byte pairs in the right order. */
|
|
#if PY_LITTLE_ENDIAN
|
|
int ihi = !!native_ordering, ilo = !native_ordering;
|
|
#else
|
|
int ihi = !native_ordering, ilo = !!native_ordering;
|
|
#endif
|
|
--e;
|
|
|
|
while (q < e) {
|
|
Py_UCS4 ch2;
|
|
/* First check for possible aligned read of a C 'long'. Unaligned
|
|
reads are more expensive, better to defer to another iteration. */
|
|
if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
|
|
/* Fast path for runs of in-range non-surrogate chars. */
|
|
const unsigned char *_q = q;
|
|
while (_q < aligned_end) {
|
|
unsigned long block = * (unsigned long *) _q;
|
|
if (native_ordering) {
|
|
/* Can use buffer directly */
|
|
if (block & FAST_CHAR_MASK)
|
|
break;
|
|
}
|
|
else {
|
|
/* Need to byte-swap */
|
|
if (block & SWAB(FAST_CHAR_MASK))
|
|
break;
|
|
#if STRINGLIB_SIZEOF_CHAR == 1
|
|
block >>= 8;
|
|
#else
|
|
block = SWAB(block);
|
|
#endif
|
|
}
|
|
#if PY_LITTLE_ENDIAN
|
|
# if SIZEOF_LONG == 4
|
|
p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
|
|
p[1] = (STRINGLIB_CHAR)(block >> 16);
|
|
# elif SIZEOF_LONG == 8
|
|
p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
|
|
p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
|
|
p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
|
|
p[3] = (STRINGLIB_CHAR)(block >> 48);
|
|
# endif
|
|
#else
|
|
# if SIZEOF_LONG == 4
|
|
p[0] = (STRINGLIB_CHAR)(block >> 16);
|
|
p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
|
|
# elif SIZEOF_LONG == 8
|
|
p[0] = (STRINGLIB_CHAR)(block >> 48);
|
|
p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
|
|
p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
|
|
p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
|
|
# endif
|
|
#endif
|
|
_q += SIZEOF_LONG;
|
|
p += SIZEOF_LONG / 2;
|
|
}
|
|
q = _q;
|
|
if (q >= e)
|
|
break;
|
|
}
|
|
|
|
ch = (q[ihi] << 8) | q[ilo];
|
|
q += 2;
|
|
if (!Py_UNICODE_IS_SURROGATE(ch)) {
|
|
#if STRINGLIB_SIZEOF_CHAR < 2
|
|
if (ch > STRINGLIB_MAX_CHAR)
|
|
/* Out-of-range */
|
|
goto Return;
|
|
#endif
|
|
*p++ = (STRINGLIB_CHAR)ch;
|
|
continue;
|
|
}
|
|
|
|
/* UTF-16 code pair: */
|
|
if (q >= e)
|
|
goto UnexpectedEnd;
|
|
if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
|
|
goto IllegalEncoding;
|
|
ch2 = (q[ihi] << 8) | q[ilo];
|
|
q += 2;
|
|
if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
|
|
goto IllegalSurrogate;
|
|
ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
|
|
#if STRINGLIB_SIZEOF_CHAR < 4
|
|
/* Out-of-range */
|
|
goto Return;
|
|
#else
|
|
*p++ = (STRINGLIB_CHAR)ch;
|
|
#endif
|
|
}
|
|
ch = 0;
|
|
Return:
|
|
*inptr = q;
|
|
*outpos = p - dest;
|
|
return ch;
|
|
UnexpectedEnd:
|
|
ch = 1;
|
|
goto Return;
|
|
IllegalEncoding:
|
|
ch = 2;
|
|
goto Return;
|
|
IllegalSurrogate:
|
|
ch = 3;
|
|
goto Return;
|
|
}
|
|
#undef UCS2_REPEAT_MASK
|
|
#undef FAST_CHAR_MASK
|
|
#undef STRIPPED_MASK
|
|
#undef SWAB
|
|
|
|
|
|
#if STRINGLIB_MAX_CHAR >= 0x80
|
|
Py_LOCAL_INLINE(Py_ssize_t)
|
|
STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
|
|
Py_ssize_t len,
|
|
unsigned short **outptr,
|
|
int native_ordering)
|
|
{
|
|
unsigned short *out = *outptr;
|
|
const STRINGLIB_CHAR *end = in + len;
|
|
#if STRINGLIB_SIZEOF_CHAR == 1
|
|
if (native_ordering) {
|
|
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
|
|
while (in < unrolled_end) {
|
|
out[0] = in[0];
|
|
out[1] = in[1];
|
|
out[2] = in[2];
|
|
out[3] = in[3];
|
|
in += 4; out += 4;
|
|
}
|
|
while (in < end) {
|
|
*out++ = *in++;
|
|
}
|
|
} else {
|
|
# define SWAB2(CH) ((CH) << 8) /* high byte is zero */
|
|
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
|
|
while (in < unrolled_end) {
|
|
out[0] = SWAB2(in[0]);
|
|
out[1] = SWAB2(in[1]);
|
|
out[2] = SWAB2(in[2]);
|
|
out[3] = SWAB2(in[3]);
|
|
in += 4; out += 4;
|
|
}
|
|
while (in < end) {
|
|
Py_UCS4 ch = *in++;
|
|
*out++ = SWAB2((Py_UCS2)ch);
|
|
}
|
|
#undef SWAB2
|
|
}
|
|
*outptr = out;
|
|
return len;
|
|
#else
|
|
if (native_ordering) {
|
|
#if STRINGLIB_MAX_CHAR < 0x10000
|
|
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
|
|
while (in < unrolled_end) {
|
|
/* check if any character is a surrogate character */
|
|
if (((in[0] ^ 0xd800) &
|
|
(in[1] ^ 0xd800) &
|
|
(in[2] ^ 0xd800) &
|
|
(in[3] ^ 0xd800) & 0xf800) == 0)
|
|
break;
|
|
out[0] = in[0];
|
|
out[1] = in[1];
|
|
out[2] = in[2];
|
|
out[3] = in[3];
|
|
in += 4; out += 4;
|
|
}
|
|
#endif
|
|
while (in < end) {
|
|
Py_UCS4 ch;
|
|
ch = *in++;
|
|
if (ch < 0xd800)
|
|
*out++ = ch;
|
|
else if (ch < 0xe000)
|
|
/* reject surrogate characters (U+D800-U+DFFF) */
|
|
goto fail;
|
|
#if STRINGLIB_MAX_CHAR >= 0x10000
|
|
else if (ch >= 0x10000) {
|
|
out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
|
|
out[1] = Py_UNICODE_LOW_SURROGATE(ch);
|
|
out += 2;
|
|
}
|
|
#endif
|
|
else
|
|
*out++ = ch;
|
|
}
|
|
} else {
|
|
#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
|
|
#if STRINGLIB_MAX_CHAR < 0x10000
|
|
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
|
|
while (in < unrolled_end) {
|
|
/* check if any character is a surrogate character */
|
|
if (((in[0] ^ 0xd800) &
|
|
(in[1] ^ 0xd800) &
|
|
(in[2] ^ 0xd800) &
|
|
(in[3] ^ 0xd800) & 0xf800) == 0)
|
|
break;
|
|
out[0] = SWAB2(in[0]);
|
|
out[1] = SWAB2(in[1]);
|
|
out[2] = SWAB2(in[2]);
|
|
out[3] = SWAB2(in[3]);
|
|
in += 4; out += 4;
|
|
}
|
|
#endif
|
|
while (in < end) {
|
|
Py_UCS4 ch = *in++;
|
|
if (ch < 0xd800)
|
|
*out++ = SWAB2((Py_UCS2)ch);
|
|
else if (ch < 0xe000)
|
|
/* reject surrogate characters (U+D800-U+DFFF) */
|
|
goto fail;
|
|
#if STRINGLIB_MAX_CHAR >= 0x10000
|
|
else if (ch >= 0x10000) {
|
|
Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
|
|
Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
|
|
out[0] = SWAB2(ch1);
|
|
out[1] = SWAB2(ch2);
|
|
out += 2;
|
|
}
|
|
#endif
|
|
else
|
|
*out++ = SWAB2((Py_UCS2)ch);
|
|
}
|
|
#undef SWAB2
|
|
}
|
|
*outptr = out;
|
|
return len;
|
|
fail:
|
|
*outptr = out;
|
|
return len - (end - in + 1);
|
|
#endif
|
|
}
|
|
|
|
#if STRINGLIB_SIZEOF_CHAR == 1
|
|
# define SWAB4(CH, tmp) ((CH) << 24) /* high bytes are zero */
|
|
#elif STRINGLIB_SIZEOF_CHAR == 2
|
|
# define SWAB4(CH, tmp) (tmp = (CH), \
|
|
((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
|
|
/* high bytes are zero */
|
|
#else
|
|
# define SWAB4(CH, tmp) (tmp = (CH), \
|
|
tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
|
|
((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
|
|
#endif
|
|
Py_LOCAL_INLINE(Py_ssize_t)
|
|
STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
|
|
Py_ssize_t len,
|
|
PY_UINT32_T **outptr,
|
|
int native_ordering)
|
|
{
|
|
PY_UINT32_T *out = *outptr;
|
|
const STRINGLIB_CHAR *end = in + len;
|
|
if (native_ordering) {
|
|
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
|
|
while (in < unrolled_end) {
|
|
#if STRINGLIB_SIZEOF_CHAR > 1
|
|
/* check if any character is a surrogate character */
|
|
if (((in[0] ^ 0xd800) &
|
|
(in[1] ^ 0xd800) &
|
|
(in[2] ^ 0xd800) &
|
|
(in[3] ^ 0xd800) & 0xf800) == 0)
|
|
break;
|
|
#endif
|
|
out[0] = in[0];
|
|
out[1] = in[1];
|
|
out[2] = in[2];
|
|
out[3] = in[3];
|
|
in += 4; out += 4;
|
|
}
|
|
while (in < end) {
|
|
Py_UCS4 ch;
|
|
ch = *in++;
|
|
#if STRINGLIB_SIZEOF_CHAR > 1
|
|
if (Py_UNICODE_IS_SURROGATE(ch)) {
|
|
/* reject surrogate characters (U+D800-U+DFFF) */
|
|
goto fail;
|
|
}
|
|
#endif
|
|
*out++ = ch;
|
|
}
|
|
} else {
|
|
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
|
|
while (in < unrolled_end) {
|
|
#if STRINGLIB_SIZEOF_CHAR > 1
|
|
Py_UCS4 ch1, ch2, ch3, ch4;
|
|
/* check if any character is a surrogate character */
|
|
if (((in[0] ^ 0xd800) &
|
|
(in[1] ^ 0xd800) &
|
|
(in[2] ^ 0xd800) &
|
|
(in[3] ^ 0xd800) & 0xf800) == 0)
|
|
break;
|
|
#endif
|
|
out[0] = SWAB4(in[0], ch1);
|
|
out[1] = SWAB4(in[1], ch2);
|
|
out[2] = SWAB4(in[2], ch3);
|
|
out[3] = SWAB4(in[3], ch4);
|
|
in += 4; out += 4;
|
|
}
|
|
while (in < end) {
|
|
Py_UCS4 ch = *in++;
|
|
#if STRINGLIB_SIZEOF_CHAR > 1
|
|
if (Py_UNICODE_IS_SURROGATE(ch)) {
|
|
/* reject surrogate characters (U+D800-U+DFFF) */
|
|
goto fail;
|
|
}
|
|
#endif
|
|
*out++ = SWAB4(ch, ch);
|
|
}
|
|
}
|
|
*outptr = out;
|
|
return len;
|
|
#if STRINGLIB_SIZEOF_CHAR > 1
|
|
fail:
|
|
*outptr = out;
|
|
return len - (end - in + 1);
|
|
#endif
|
|
}
|
|
#undef SWAB4
|
|
|
|
#endif
|