cosmopolitan/third_party/python/Lib/test/multibytecodec_support.py
Justine Tunney 47a53e143b Productionize new APE loader and more
The APE_NO_MODIFY_SELF loader payload has been moved out of the examples
folder and improved so that it works on BSD systems, and permits general
elf program headers. This brings its quality up enough that it should be
acceptable to use by default for many programs, e.g. Python, Lua, SQLite
and Python. It's the responsibility of the user to define an appropriate
TMPDIR if /tmp is considered an adversarial environment. Mac OS shall be
supported by APE_NO_MODIFY_SELF soon.

Fixes and improvements have been made to program_executable_name as it's
now the one true way to get the absolute path of the executing image.

This change fixes a memory leak in linenoise history loading, introduced
by performance optimizations in 51904e2687
This change fixes a longstanding regression with Mach system calls, that
23ae9dfceb back in February which impacted
our sched_yield() implementation, which is why no one noticed until now.

The Blinkenlights PC emulator has been improved. We now fix rendering on
XNU and BSD by not making the assumption that the kernel terminal driver
understands UTF8 since that seems to break its internal modeling of \r\n
which is now being addressed by using \e[𝑦H instead. The paneling is now
more compact in real mode so you won't need to make your font as tiny if
you're only emulating an 8086 program. The CLMUL ISA is now emulated too

This change also makes improvement to time. CLOCK_MONOTONIC now does the
right thing on Windows NT. The nanosecond time module functions added in
Python 3.7 have been backported.

This change doubles the performance of Argon2 password stretching simply
by not using its copy_block and xor_block helper functions, as they were
trivial to inline thus resulting in us needing to iterate over each 1024
byte block four fewer times.

This change makes code size improvements. _PyUnicode_ToNumeric() was 64k
in size and now it's 10k. The CJK codec lookup tables now use lazy delta
zigzag deflate (δzd) encoding which reduces their size from 600k to 200k
plus the code bloat caused by macro abuse in _decimal.c is now addressed
so our fully-loaded statically-linked hermetically-sealed Python virtual
interpreter container is now 9.4 megs in the default build mode and 5.5m
in MODE=tiny which leaves plenty of room for chibicc.

The pydoc web server now accommodates the use case of people who work by
SSH'ing into a different machine w/ python.com -m pydoc -p8080 -h0.0.0.0

Finally Python Capsulae delenda est and won't be supported in the future
2021-10-02 08:27:03 -07:00

369 lines
14 KiB
Python

#
# multibytecodec_support.py
# Common Unittest Routines for CJK codecs
#
import codecs
import os
import re
import sys
import unittest
from test import support
from io import BytesIO
class TestBase:
encoding = '' # codec name
codec = None # codec tuple (with 4 elements)
tstring = None # must set. 2 strings to test StreamReader
codectests = None # must set. codec test tuple
roundtriptest = 1 # set if roundtrip is possible with unicode
has_iso10646 = 0 # set if this encoding contains whole iso10646 map
xmlcharnametest = None # string to test xmlcharrefreplace
unmappedunicode = '\udeee' # a unicode code point that is not mapped.
def setUp(self):
if self.codec is None:
self.codec = codecs.lookup(self.encoding)
self.encode = self.codec.encode
self.decode = self.codec.decode
self.reader = self.codec.streamreader
self.writer = self.codec.streamwriter
self.incrementalencoder = self.codec.incrementalencoder
self.incrementaldecoder = self.codec.incrementaldecoder
def test_chunkcoding(self):
tstring_lines = []
for b in self.tstring:
lines = b.split(b"\n")
last = lines.pop()
assert last == b""
lines = [line + b"\n" for line in lines]
tstring_lines.append(lines)
for native, utf8 in zip(*tstring_lines):
u = self.decode(native)[0]
self.assertEqual(u, utf8.decode('utf-8'))
if self.roundtriptest:
self.assertEqual(native, self.encode(u)[0])
def test_errorhandle(self):
for source, scheme, expected in self.codectests:
if isinstance(source, bytes):
func = self.decode
else:
func = self.encode
if expected:
result = func(source, scheme)[0]
if func is self.decode:
self.assertTrue(type(result) is str, type(result))
self.assertEqual(result, expected,
'%a.decode(%r, %r)=%a != %a'
% (source, self.encoding, scheme, result,
expected))
else:
self.assertTrue(type(result) is bytes, type(result))
self.assertEqual(result, expected,
'%a.encode(%r, %r)=%a != %a'
% (source, self.encoding, scheme, result,
expected))
else:
self.assertRaises(UnicodeError, func, source, scheme)
def test_xmlcharrefreplace(self):
if self.has_iso10646:
self.skipTest('encoding contains full ISO 10646 map')
s = "\u0b13\u0b23\u0b60 nd eggs"
self.assertEqual(
self.encode(s, "xmlcharrefreplace")[0],
b"ଓଣୠ nd eggs"
)
def test_customreplace_encode(self):
if self.has_iso10646:
self.skipTest('encoding contains full ISO 10646 map')
from html.entities import codepoint2name
def xmlcharnamereplace(exc):
if not isinstance(exc, UnicodeEncodeError):
raise TypeError("don't know how to handle %r" % exc)
l = []
for c in exc.object[exc.start:exc.end]:
if ord(c) in codepoint2name:
l.append("&%s;" % codepoint2name[ord(c)])
else:
l.append("&#%d;" % ord(c))
return ("".join(l), exc.end)
codecs.register_error("test.xmlcharnamereplace", xmlcharnamereplace)
if self.xmlcharnametest:
sin, sout = self.xmlcharnametest
else:
sin = "\xab\u211c\xbb = \u2329\u1234\u232a"
sout = b"«ℜ» = ⟨ሴ⟩"
self.assertEqual(self.encode(sin,
"test.xmlcharnamereplace")[0], sout)
def test_callback_returns_bytes(self):
def myreplace(exc):
return (b"1234", exc.end)
codecs.register_error("test.cjktest", myreplace)
enc = self.encode("abc" + self.unmappedunicode + "def", "test.cjktest")[0]
self.assertEqual(enc, b"abc1234def")
def test_callback_wrong_objects(self):
def myreplace(exc):
return (ret, exc.end)
codecs.register_error("test.cjktest", myreplace)
for ret in ([1, 2, 3], [], None, object()):
self.assertRaises(TypeError, self.encode, self.unmappedunicode,
'test.cjktest')
def test_callback_long_index(self):
def myreplace(exc):
return ('x', int(exc.end))
codecs.register_error("test.cjktest", myreplace)
self.assertEqual(self.encode('abcd' + self.unmappedunicode + 'efgh',
'test.cjktest'), (b'abcdxefgh', 9))
def myreplace(exc):
return ('x', sys.maxsize + 1)
codecs.register_error("test.cjktest", myreplace)
self.assertRaises(IndexError, self.encode, self.unmappedunicode,
'test.cjktest')
def test_callback_None_index(self):
def myreplace(exc):
return ('x', None)
codecs.register_error("test.cjktest", myreplace)
self.assertRaises(TypeError, self.encode, self.unmappedunicode,
'test.cjktest')
def test_callback_backward_index(self):
def myreplace(exc):
if myreplace.limit > 0:
myreplace.limit -= 1
return ('REPLACED', 0)
else:
return ('TERMINAL', exc.end)
myreplace.limit = 3
codecs.register_error("test.cjktest", myreplace)
self.assertEqual(self.encode('abcd' + self.unmappedunicode + 'efgh',
'test.cjktest'),
(b'abcdREPLACEDabcdREPLACEDabcdREPLACEDabcdTERMINALefgh', 9))
def test_callback_forward_index(self):
def myreplace(exc):
return ('REPLACED', exc.end + 2)
codecs.register_error("test.cjktest", myreplace)
self.assertEqual(self.encode('abcd' + self.unmappedunicode + 'efgh',
'test.cjktest'), (b'abcdREPLACEDgh', 9))
def test_callback_index_outofbound(self):
def myreplace(exc):
return ('TERM', 100)
codecs.register_error("test.cjktest", myreplace)
self.assertRaises(IndexError, self.encode, self.unmappedunicode,
'test.cjktest')
def test_incrementalencoder(self):
UTF8Reader = codecs.getreader('utf-8')
for sizehint in [None] + list(range(1, 33)) + \
[64, 128, 256, 512, 1024]:
istream = UTF8Reader(BytesIO(self.tstring[1]))
ostream = BytesIO()
encoder = self.incrementalencoder()
while 1:
if sizehint is not None:
data = istream.read(sizehint)
else:
data = istream.read()
if not data:
break
e = encoder.encode(data)
ostream.write(e)
self.assertEqual(ostream.getvalue(), self.tstring[0])
def test_incrementaldecoder(self):
UTF8Writer = codecs.getwriter('utf-8')
for sizehint in [None, -1] + list(range(1, 33)) + \
[64, 128, 256, 512, 1024]:
istream = BytesIO(self.tstring[0])
ostream = UTF8Writer(BytesIO())
decoder = self.incrementaldecoder()
while 1:
data = istream.read(sizehint)
if not data:
break
else:
u = decoder.decode(data)
ostream.write(u)
self.assertEqual(ostream.getvalue(), self.tstring[1])
def test_incrementalencoder_error_callback(self):
inv = self.unmappedunicode
e = self.incrementalencoder()
self.assertRaises(UnicodeEncodeError, e.encode, inv, True)
e.errors = 'ignore'
self.assertEqual(e.encode(inv, True), b'')
e.reset()
def tempreplace(exc):
return ('called', exc.end)
codecs.register_error('test.incremental_error_callback', tempreplace)
e.errors = 'test.incremental_error_callback'
self.assertEqual(e.encode(inv, True), b'called')
# again
e.errors = 'ignore'
self.assertEqual(e.encode(inv, True), b'')
def test_streamreader(self):
UTF8Writer = codecs.getwriter('utf-8')
for name in ["read", "readline", "readlines"]:
for sizehint in [None, -1] + list(range(1, 33)) + \
[64, 128, 256, 512, 1024]:
istream = self.reader(BytesIO(self.tstring[0]))
ostream = UTF8Writer(BytesIO())
func = getattr(istream, name)
while 1:
data = func(sizehint)
if not data:
break
if name == "readlines":
ostream.writelines(data)
else:
ostream.write(data)
self.assertEqual(ostream.getvalue(), self.tstring[1])
def test_streamwriter(self):
readfuncs = ('read', 'readline', 'readlines')
UTF8Reader = codecs.getreader('utf-8')
for name in readfuncs:
for sizehint in [None] + list(range(1, 33)) + \
[64, 128, 256, 512, 1024]:
istream = UTF8Reader(BytesIO(self.tstring[1]))
ostream = self.writer(BytesIO())
func = getattr(istream, name)
while 1:
if sizehint is not None:
data = func(sizehint)
else:
data = func()
if not data:
break
if name == "readlines":
ostream.writelines(data)
else:
ostream.write(data)
self.assertEqual(ostream.getvalue(), self.tstring[0])
def test_streamwriter_reset_no_pending(self):
# Issue #23247: Calling reset() on a fresh StreamWriter instance
# (without pending data) must not crash
stream = BytesIO()
writer = self.writer(stream)
writer.reset()
class TestBase_Mapping(unittest.TestCase):
pass_enctest = []
pass_dectest = []
supmaps = []
codectests = []
def setUp(self):
pass
def open_mapping_file(self):
return open(self.mapfileurl)
def test_mapping_file(self):
if self.mapfileurl.endswith('.ucm'):
self._test_mapping_file_ucm()
else:
self._test_mapping_file_plain()
def _test_mapping_file_plain(self):
def unichrs(s):
return ''.join(chr(int(x, 16)) for x in s.split('+'))
urt_wa = {}
with self.open_mapping_file() as f:
for line in f:
line = line.split('#')[0].strip()
if not line:
break
data = line.split()
if len(data) != 2:
continue
csetch = bytes.fromhex(data[0])
if len(csetch) == 1 and 0x80 <= csetch[0]:
continue
unich = unichrs(data[1])
if ord(unich) == 0xfffd or unich in urt_wa:
continue
urt_wa[unich] = csetch
self._testpoint(csetch, unich)
def _test_mapping_file_ucm(self):
with self.open_mapping_file() as f:
ucmdata = f.read()
for uni, coded in re.findall('^([A-F0-9]+)\t([0-9A-F ]+)$', ucmdata):
unich = chr(int(uni, 16))
codech = bytes(int(c, 16) for c in coded.split())
self._testpoint(codech, unich)
def test_mapping_supplemental(self):
for mapping in self.supmaps:
self._testpoint(*mapping)
def _testpoint(self, csetch, unich):
if (csetch, unich) not in self.pass_enctest:
self.assertEqual(unich.encode(self.encoding), csetch)
if (csetch, unich) not in self.pass_dectest:
self.assertEqual(str(csetch, self.encoding), unich)
def test_errorhandle(self):
for source, scheme, expected in self.codectests:
if isinstance(source, bytes):
func = source.decode
else:
func = source.encode
if expected:
if isinstance(source, bytes):
result = func(self.encoding, scheme)
self.assertTrue(type(result) is str, type(result))
self.assertEqual(result, expected,
'%a.decode(%r, %r)=%a != %a'
% (source, self.encoding, scheme, result,
expected))
else:
result = func(self.encoding, scheme)
self.assertTrue(type(result) is bytes, type(result))
self.assertEqual(result, expected,
'%a.encode(%r, %r)=%a != %a'
% (source, self.encoding, scheme, result,
expected))
else:
self.assertRaises(UnicodeError, func, self.encoding, scheme)
def load_teststring(name):
dir = os.path.join(os.path.dirname(__file__), 'cjkencodings')
with open(os.path.join(dir, name + '.txt'), 'rb') as f:
encoded = f.read()
with open(os.path.join(dir, name + '-utf8.txt'), 'rb') as f:
utf8 = f.read()
return encoded, utf8