Decentralize Python native module linkage

We can now link even smaller Python binaries. For example, the hello.com
program in the Python build directory is a compiled, linked executable of
hello.py, which just prints hello world. Using decentralized sections, we
can make that binary 1.9 MB in size (noting that python.com is 6.3 MB!)

This works for nontrivial programs too. For example, say we want an APE
binary that's equivalent to python.com -m http.server. Our makefile now
builds such a binary using the new launcher, and it's only 3.2 MB in size,
since Python sources get turned into ELF objects, which tell our linker
that we need things like the native hashing algorithm code.
This commit is contained in:
Justine Tunney 2021-09-07 11:40:11 -07:00
parent dfa0359b50
commit 559b024e1d
129 changed files with 2798 additions and 13514 deletions

View file

@ -3281,11 +3281,6 @@ PyUnicode_Decode(const char *s,
|| strcmp(lower, "us_ascii") == 0) {
return PyUnicode_DecodeASCII(s, size, errors);
}
#ifdef MS_WINDOWS
else if (strcmp(lower, "mbcs") == 0) {
return PyUnicode_DecodeMBCS(s, size, errors);
}
#endif
else if (strcmp(lower, "latin1") == 0
|| strcmp(lower, "latin_1") == 0
|| strcmp(lower, "iso_8859_1") == 0
@ -7170,730 +7165,6 @@ PyUnicode_AsASCIIString(PyObject *unicode)
return _PyUnicode_AsASCIIString(unicode, NULL);
}
#ifdef MS_WINDOWS
/* --- MBCS codecs for Windows -------------------------------------------- */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif
#ifndef WC_ERR_INVALID_CHARS
# define WC_ERR_INVALID_CHARS 0x0080
#endif
/* Return the encoding name to report for a Windows code page.
 *
 * CP_ACP, CP_UTF7 and CP_UTF8 map to the static strings "mbcs", "CP_UTF7"
 * and "CP_UTF8"; *obj is left NULL for them.  Any other code page gets a
 * bytes object "cp<number>" stored in *obj, and the returned pointer is that
 * object's character buffer, so the caller has to keep *obj alive while the
 * name is in use and release it afterwards.
 *
 * Returns NULL (with *obj == NULL) when the bytes object cannot be built.
 */
static const char*
code_page_name(UINT code_page, PyObject **obj)
{
    PyObject *name_bytes;

    *obj = NULL;
    switch (code_page) {
    case CP_ACP:
        return "mbcs";
    case CP_UTF7:
        return "CP_UTF7";
    case CP_UTF8:
        return "CP_UTF8";
    default:
        break;
    }

    name_bytes = PyBytes_FromFormat("cp%u", code_page);
    *obj = name_bytes;
    if (name_bytes == NULL)
        return NULL;
    return PyBytes_AS_STRING(name_bytes);
}
/* Pick the MultiByteToWideChar() flags for a code page: 0 for CP_UTF7
 * (the CP_UTF7 decoder only supports flags=0), MB_ERR_INVALID_CHARS for
 * every other code page.
 */
static DWORD
decode_code_page_flags(UINT code_page)
{
    return (code_page == CP_UTF7) ? 0 : MB_ERR_INVALID_CHARS;
}
/*
 * Decode a byte string from a Windows code page into unicode object in strict
 * mode.
 *
 * code_page: Windows code page identifier.
 * v:         in/out result; a new unicode object is created when *v is NULL,
 *            otherwise the existing object is extended with the new text.
 * in/insize: bytes to decode; insize must be > 0.
 *
 * Returns consumed size (always insize) if succeed, returns -2 on decode
 * error (ERROR_NO_UNICODE_TRANSLATION), or raise an OSError and returns -1 on
 * other Windows error.  Also returns -1 when the result object cannot be
 * created or resized.
 */
static int
decode_code_page_strict(UINT code_page,
                        PyObject **v,
                        const char *in,
                        int insize)
{
    const DWORD flags = decode_code_page_flags(code_page);
    wchar_t *out;
    /* MultiByteToWideChar() returns an int (0 on failure); keep it signed so
       that the "<= 0" checks below mean what they say. */
    int outsize;

    /* First get the size of the result */
    assert(insize > 0);
    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
    if (outsize <= 0)
        goto error;

    if (*v == NULL) {
        /* Create unicode object */
        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
        *v = (PyObject*)_PyUnicode_New(outsize);
        if (*v == NULL)
            return -1;
        out = PyUnicode_AS_UNICODE(*v);
    }
    else {
        /* Extend unicode object */
        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
        /* Guard the addition below, mirroring encode_code_page_strict() */
        if (outsize > PY_SSIZE_T_MAX - n) {
            PyErr_NoMemory();
            return -1;
        }
        if (unicode_resize(v, n + outsize) < 0)
            return -1;
        out = PyUnicode_AS_UNICODE(*v) + n;
    }

    /* Do the conversion */
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
    if (outsize <= 0)
        goto error;
    return insize;

error:
    /* An untranslatable input is reported as -2 so that the caller can retry
       with an error handler; anything else becomes an exception. */
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
        return -2;
    PyErr_SetFromWindowsErr(0);
    return -1;
}
/*
* Decode a byte string from a code page into unicode object with an error
* handler.
*
* The input is decoded one character at a time so that each undecodable
* position can be handed to the error handler separately.
*
* *v is created when it is NULL on entry and extended otherwise.  When
* final is false and a failed decode attempt reached the end of the input,
* decoding stops there and the remaining bytes are not consumed.
*
* Returns consumed size if succeed, or raise an OSError or
* UnicodeDecodeError exception and returns -1 on error.
*/
static int
decode_code_page_errors(UINT code_page,
PyObject **v,
const char *in, const int size,
const char *errors, int final)
{
const char *startin = in;
const char *endin = in + size;
const DWORD flags = decode_code_page_flags(code_page);
/* Ideally, we should get reason from FormatMessage. This is the Windows
2000 English version of the message. */
const char *reason = "No mapping for the Unicode character exists "
"in the target code page.";
/* each step cannot decode more than 1 character, but a character can be
represented as a surrogate pair */
wchar_t buffer[2], *out;
int insize;
Py_ssize_t outsize;
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
PyObject *encoding_obj = NULL;
const char *encoding;
DWORD err;
/* stays -1 unless the whole function completes */
int ret = -1;
assert(size > 0);
/* encoding may point into encoding_obj, which is released at "error:" */
encoding = code_page_name(code_page, &encoding_obj);
if (encoding == NULL)
return -1;
if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
/* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
UnicodeDecodeError. */
make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
if (exc != NULL) {
PyCodec_StrictErrors(exc);
Py_CLEAR(exc);
}
goto error;
}
/* Reserve room for the worst case: Py_ARRAY_LENGTH(buffer) wide characters
per input byte, checking the multiplication for overflow first. */
if (*v == NULL) {
/* Create unicode object */
if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
PyErr_NoMemory();
goto error;
}
/* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
*v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
if (*v == NULL)
goto error;
out = PyUnicode_AS_UNICODE(*v);
}
else {
/* Extend unicode object */
Py_ssize_t n = PyUnicode_GET_SIZE(*v);
if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
PyErr_NoMemory();
goto error;
}
if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
goto error;
out = PyUnicode_AS_UNICODE(*v) + n;
}
/* Decode the byte string character per character */
while (in < endin)
{
/* Decode a character */
/* Start with a one-byte window and widen it by one byte after each
failed attempt, up to the limits tested at the bottom of the loop. */
insize = 1;
do
{
outsize = MultiByteToWideChar(code_page, flags,
in, insize,
buffer, Py_ARRAY_LENGTH(buffer));
if (outsize > 0)
break;
err = GetLastError();
/* Only these two error codes lead to a retry with a wider window;
any other failure is raised as an exception. */
if (err != ERROR_NO_UNICODE_TRANSLATION
&& err != ERROR_INSUFFICIENT_BUFFER)
{
PyErr_SetFromWindowsErr(0);
goto error;
}
insize++;
}
/* 4=maximum length of a UTF-8 sequence */
while (insize <= 4 && (in + insize) <= endin);
if (outsize <= 0) {
Py_ssize_t startinpos, endinpos, outpos;
/* last character in partial decode? */
if (in + insize >= endin && !final)
break;
/* Report a single byte, the one at "in", as the undecodable range */
startinpos = in - startin;
endinpos = startinpos + 1;
outpos = out - PyUnicode_AS_UNICODE(*v);
if (unicode_decode_call_errorhandler_wchar(
errors, &errorHandler,
encoding, reason,
&startin, &endin, &startinpos, &endinpos, &exc, &in,
v, &outpos))
{
goto error;
}
/* v and outpos were passed by reference, so recompute the write
pointer from their current values. */
out = PyUnicode_AS_UNICODE(*v) + outpos;
}
else {
in += insize;
memcpy(out, buffer, outsize * sizeof(wchar_t));
out += outsize;
}
}
/* write a NUL character at the end */
*out = 0;
/* Resize the unicode object to the number of characters actually written */
outsize = out - PyUnicode_AS_UNICODE(*v);
assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
if (unicode_resize(v, outsize) < 0)
goto error;
/* (in - startin) <= size and size is an int */
ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
/* Shared exit: reached on success too, with ret holding the consumed size */
error:
Py_XDECREF(encoding_obj);
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return ret;
}
/* Decode size bytes at s from a Windows code page into a unicode object.
 *
 * The input is fed to the strict decoder first; when that reports a decode
 * error (-2) the same chunk is decoded again with the error handler.  When
 * NEED_RETRY is defined, inputs larger than INT_MAX are processed in
 * INT_MAX-sized chunks.
 *
 * consumed: when non-NULL, receives the number of bytes consumed and the last
 *           chunk is decoded as non-final; when NULL the last chunk is final.
 *
 * Returns a new reference, or NULL with an exception set for a negative
 * code page, a negative size, or a decoding failure.
 */
static PyObject *
decode_code_page_stateful(int code_page,
                          const char *s, Py_ssize_t size,
                          const char *errors, Py_ssize_t *consumed)
{
    PyObject *result = NULL;
    int is_last_chunk;

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }
    if (size < 0) {
        PyErr_BadInternalCall();
        return NULL;
    }

    if (consumed != NULL)
        *consumed = 0;

    do {
        int chunk_size, final, nbytes;

#ifdef NEED_RETRY
        is_last_chunk = !(size > INT_MAX);
#else
        is_last_chunk = 1;
#endif
        if (is_last_chunk) {
            chunk_size = (int)size;
            final = (consumed == NULL);
        }
        else {
            chunk_size = INT_MAX;
            final = 0;
        }

        if (chunk_size == 0 && is_last_chunk) {
            /* Nothing left to decode: keep what was built so far, or hand
               back the empty string when nothing was built at all. */
            if (result != NULL)
                break;
            _Py_RETURN_UNICODE_EMPTY();
        }

        nbytes = decode_code_page_strict(code_page, &result,
                                         s, chunk_size);
        if (nbytes == -2) {
            nbytes = decode_code_page_errors(code_page, &result,
                                             s, chunk_size,
                                             errors, final);
        }
        assert(nbytes != 0 || is_last_chunk);

        if (nbytes < 0) {
            Py_XDECREF(result);
            return NULL;
        }

        if (consumed != NULL)
            *consumed += nbytes;

        s += nbytes;
        size -= nbytes;
    } while (!is_last_chunk);

    return unicode_result(result);
}
/* Public entry point: decode size bytes at s using the given Windows code
 * page.  All arguments are forwarded unchanged to
 * decode_code_page_stateful(), whose result is returned as is.
 */
PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,
                                 const char *s,
                                 Py_ssize_t size,
                                 const char *errors,
                                 Py_ssize_t *consumed)
{
    PyObject *decoded;

    decoded = decode_code_page_stateful(code_page, s, size, errors, consumed);
    return decoded;
}
/* Decode size bytes at s with the "mbcs" codec, i.e. code page CP_ACP.
 * errors and consumed are forwarded to decode_code_page_stateful().
 */
PyObject *
PyUnicode_DecodeMBCSStateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    PyObject *decoded;

    decoded = decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
    return decoded;
}
/* Decode size bytes at s with the "mbcs" codec.  Equivalent to calling
 * PyUnicode_DecodeMBCSStateful() with consumed == NULL.
 */
PyObject *
PyUnicode_DecodeMBCS(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    Py_ssize_t *no_consumed = NULL;

    return PyUnicode_DecodeMBCSStateful(s, size, errors, no_consumed);
}
/* Pick the WideCharToMultiByte() flags for a code page and error handler:
 *   - CP_UTF8: WC_ERR_INVALID_CHARS
 *   - CP_UTF7: 0 (CP_UTF7 only supports flags=0)
 *   - any other code page: 0 when errors is "replace", otherwise
 *     WC_NO_BEST_FIT_CHARS (this includes errors == NULL).
 */
static DWORD
encode_code_page_flags(UINT code_page, const char *errors)
{
    int wants_replace;

    if (code_page == CP_UTF8)
        return WC_ERR_INVALID_CHARS;

    if (code_page == CP_UTF7) {
        /* CP_UTF7 only supports flags=0 */
        return 0;
    }

    wants_replace = (errors != NULL && strcmp(errors, "replace") == 0);
    return wants_replace ? 0 : WC_NO_BEST_FIT_CHARS;
}
/*
 * Encode a Unicode string to a Windows code page into a byte string in strict
 * mode.
 *
 * code_page:   Windows code page identifier.
 * outbytes:    in/out result; a new bytes object is created when *outbytes is
 *              NULL, otherwise the existing object is extended.
 * unicode:     source string; the slice [offset, offset+len) is encoded.
 * len:         number of characters to encode, must be > 0.
 * errors:      not consulted here; the conversion flags are always computed
 *              as for errors == NULL.
 *
 * Returns 0 on success, returns -2 on encode error (a default character was
 * used or Windows reported ERROR_NO_UNICODE_TRANSLATION), or raise an OSError
 * and returns -1 on other Windows error.  Also returns -1 when an
 * intermediate object cannot be created or resized.
 */
static int
encode_code_page_strict(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t offset, int len,
                        const char* errors)
{
    BOOL usedDefaultChar = FALSE;
    BOOL *pusedDefaultChar;
    int outsize;
    wchar_t *p;
    Py_ssize_t size;
    const DWORD flags = encode_code_page_flags(code_page, NULL);
    char *out;
    /* Last-error value of the failed conversion, saved before any cleanup */
    DWORD err = 0;
    /* Create a substring so that we can get the UTF-16 representation
       of just the slice under consideration. */
    PyObject *substring;

    assert(len > 0);

    /* The "used default char" output is only requested for code pages other
       than CP_UTF8 and CP_UTF7. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    substring = PyUnicode_Substring(unicode, offset, offset+len);
    if (substring == NULL)
        return -1;
    p = PyUnicode_AsUnicodeAndSize(substring, &size);
    if (p == NULL) {
        Py_DECREF(substring);
        return -1;
    }
    assert(size <= INT_MAX);

    /* First get the size of the result */
    outsize = WideCharToMultiByte(code_page, flags,
                                  p, (int)size,
                                  NULL, 0,
                                  NULL, pusedDefaultChar);
    if (outsize <= 0) {
        err = GetLastError();
        goto error;
    }
    /* If we used a default char, then we failed! */
    if (pusedDefaultChar && *pusedDefaultChar) {
        Py_DECREF(substring);
        return -2;
    }

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL) {
            Py_DECREF(substring);
            return -1;
        }
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        const Py_ssize_t n = PyBytes_Size(*outbytes);
        if (outsize > PY_SSIZE_T_MAX - n) {
            PyErr_NoMemory();
            Py_DECREF(substring);
            return -1;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
            Py_DECREF(substring);
            return -1;
        }
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Do the conversion */
    outsize = WideCharToMultiByte(code_page, flags,
                                  p, (int)size,
                                  out, outsize,
                                  NULL, pusedDefaultChar);
    if (outsize <= 0) {
        /* Read the error code before releasing the substring: deallocation
           may run code that changes the thread's last-error value. */
        err = GetLastError();
        goto error;
    }
    Py_CLEAR(substring);
    if (pusedDefaultChar && *pusedDefaultChar)
        return -2;
    return 0;

error:
    Py_XDECREF(substring);
    if (err == ERROR_NO_UNICODE_TRANSLATION)
        return -2;
    PyErr_SetFromWindowsErr((int)err);
    return -1;
}
/*
* Encode a Unicode string to a Windows code page into a byte string using an
* error handler.
*
* The characters unicode[unicode_offset : unicode_offset + insize] are
* encoded one at a time so that each unencodable character can be handed to
* the error handler separately.  *outbytes is created when it is NULL on
* entry and extended otherwise.
*
* Returns 0 on success, or raises an exception and returns -1 on error.
*/
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
PyObject *unicode, Py_ssize_t unicode_offset,
Py_ssize_t insize, const char* errors)
{
const DWORD flags = encode_code_page_flags(code_page, errors);
Py_ssize_t pos = unicode_offset;
Py_ssize_t endin = unicode_offset + insize;
/* Ideally, we should get reason from FormatMessage. This is the Windows
2000 English version of the message. */
const char *reason = "invalid character";
/* 4=maximum length of a UTF-8 sequence */
char buffer[4];
BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
Py_ssize_t outsize;
char *out;
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
PyObject *encoding_obj = NULL;
const char *encoding;
Py_ssize_t newpos, newoutsize;
PyObject *rep;
/* stays -1 unless the whole function completes */
int ret = -1;
assert(insize > 0);
/* encoding may point into encoding_obj, which is released on exit */
encoding = code_page_name(code_page, &encoding_obj);
if (encoding == NULL)
return -1;
if (errors == NULL || strcmp(errors, "strict") == 0) {
/* The last error was ERROR_NO_UNICODE_TRANSLATION,
then we raise a UnicodeEncodeError. */
make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
if (exc != NULL) {
PyCodec_StrictErrors(exc);
Py_DECREF(exc);
}
Py_XDECREF(encoding_obj);
return -1;
}
/* The "used default char" output is only requested for code pages other
than CP_UTF8 and CP_UTF7. */
if (code_page != CP_UTF8 && code_page != CP_UTF7)
pusedDefaultChar = &usedDefaultChar;
else
pusedDefaultChar = NULL;
/* Reserve Py_ARRAY_LENGTH(buffer) bytes per input character, checking the
multiplication for overflow first. */
if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
PyErr_NoMemory();
goto error;
}
outsize = insize * Py_ARRAY_LENGTH(buffer);
if (*outbytes == NULL) {
/* Create string object */
*outbytes = PyBytes_FromStringAndSize(NULL, outsize);
if (*outbytes == NULL)
goto error;
out = PyBytes_AS_STRING(*outbytes);
}
else {
/* Extend string object */
Py_ssize_t n = PyBytes_Size(*outbytes);
if (n > PY_SSIZE_T_MAX - outsize) {
PyErr_NoMemory();
goto error;
}
if (_PyBytes_Resize(outbytes, n + outsize) < 0)
goto error;
out = PyBytes_AS_STRING(*outbytes) + n;
}
/* Encode the string character per character */
while (pos < endin)
{
Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
wchar_t chars[2];
int charsize;
/* Characters from 0x10000 up are passed to Windows as a surrogate pair */
if (ch < 0x10000) {
chars[0] = (wchar_t)ch;
charsize = 1;
}
else {
chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
charsize = 2;
}
outsize = WideCharToMultiByte(code_page, flags,
chars, charsize,
buffer, Py_ARRAY_LENGTH(buffer),
NULL, pusedDefaultChar);
if (outsize > 0) {
/* Accept the result only when no default character was substituted;
otherwise fall through to the error handler below. */
if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
{
pos++;
memcpy(out, buffer, outsize);
out += outsize;
continue;
}
}
else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
PyErr_SetFromWindowsErr(0);
goto error;
}
/* Ask the error handler for a replacement for the character at pos */
rep = unicode_encode_call_errorhandler(
errors, &errorHandler, encoding, reason,
unicode, &exc,
pos, pos + 1, &newpos);
if (rep == NULL)
goto error;
pos = newpos;
/* NOTE(review): both resizes below change the buffer by (outsize - 1)
regardless of how far newpos moved; confirm this is sufficient if an error
handler returns a position before the failing character. */
if (PyBytes_Check(rep)) {
/* bytes replacement: copied to the output verbatim */
outsize = PyBytes_GET_SIZE(rep);
if (outsize != 1) {
/* remember the write offset, the resize may move the buffer */
Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
Py_DECREF(rep);
goto error;
}
out = PyBytes_AS_STRING(*outbytes) + offset;
}
memcpy(out, PyBytes_AS_STRING(rep), outsize);
out += outsize;
}
else {
/* any other replacement is read as a unicode string; only characters
up to 127 are accepted and each is written as a single byte */
Py_ssize_t i;
enum PyUnicode_Kind kind;
void *data;
if (PyUnicode_READY(rep) == -1) {
Py_DECREF(rep);
goto error;
}
outsize = PyUnicode_GET_LENGTH(rep);
if (outsize != 1) {
/* remember the write offset, the resize may move the buffer */
Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
Py_DECREF(rep);
goto error;
}
out = PyBytes_AS_STRING(*outbytes) + offset;
}
kind = PyUnicode_KIND(rep);
data = PyUnicode_DATA(rep);
for (i=0; i < outsize; i++) {
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
if (ch > 127) {
raise_encode_exception(&exc,
encoding, unicode,
pos, pos + 1,
"unable to encode error handler result to ASCII");
Py_DECREF(rep);
goto error;
}
*out = (unsigned char)ch;
out++;
}
}
Py_DECREF(rep);
}
/* write a NUL byte */
*out = 0;
/* Resize the bytes object to the number of bytes actually written */
outsize = out - PyBytes_AS_STRING(*outbytes);
assert(outsize <= PyBytes_GET_SIZE(*outbytes));
if (_PyBytes_Resize(outbytes, outsize) < 0)
goto error;
ret = 0;
/* Shared exit: reached on success too, with ret == 0 */
error:
Py_XDECREF(encoding_obj);
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return ret;
}
/* Encode a unicode object to bytes using a Windows code page.
 *
 * Each chunk is given to the strict encoder first; when that reports an
 * encode error (-2) the same chunk is encoded again with the error handler.
 * When NEED_RETRY is defined, strings longer than INT_MAX/2 characters are
 * processed in chunks of INT_MAX/2 characters.
 *
 * Returns a new bytes object, or NULL with an exception set when unicode is
 * not a unicode object, the code page is negative, or encoding fails.
 */
static PyObject *
encode_code_page(int code_page,
                 PyObject *unicode,
                 const char *errors)
{
    Py_ssize_t remaining;
    Py_ssize_t start = 0;
    PyObject *result = NULL;
    int is_last_chunk;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_READY(unicode) == -1)
        return NULL;
    remaining = PyUnicode_GET_LENGTH(unicode);

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    if (remaining == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    do {
        int chunk_len, status;

#ifdef NEED_RETRY
        /* UTF-16 encoding may double the size, so use only INT_MAX/2
           chunks. */
        is_last_chunk = !(remaining > INT_MAX/2);
#else
        is_last_chunk = 1;
#endif
        if (is_last_chunk)
            chunk_len = (int)remaining;
        else
            chunk_len = INT_MAX/2;

        status = encode_code_page_strict(code_page, &result,
                                         unicode, start, chunk_len,
                                         errors);
        if (status == -2) {
            status = encode_code_page_errors(code_page, &result,
                                             unicode, start,
                                             chunk_len, errors);
        }
        if (status < 0) {
            Py_XDECREF(result);
            return NULL;
        }

        start += chunk_len;
        remaining -= chunk_len;
    } while (!is_last_chunk);

    return result;
}
/* Encode a Py_UNICODE buffer of the given size with the "mbcs" codec
 * (code page CP_ACP).  A temporary unicode object is built from the buffer,
 * encoded with encode_code_page() and released again.
 *
 * Returns the encoded object, or NULL when the temporary object cannot be
 * created or the encoding fails.
 */
PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE *p,
                     Py_ssize_t size,
                     const char *errors)
{
    PyObject *encoded = NULL;
    PyObject *text = PyUnicode_FromUnicode(p, size);

    if (text != NULL) {
        encoded = encode_code_page(CP_ACP, text, errors);
        Py_DECREF(text);
    }
    return encoded;
}
/* Public entry point: encode a unicode object using the given Windows code
 * page.  All arguments are forwarded unchanged to encode_code_page(), whose
 * result is returned as is.
 */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    PyObject *encoded;

    encoded = encode_code_page(code_page, unicode, errors);
    return encoded;
}
/* Encode a unicode object with the "mbcs" codec: calls
 * PyUnicode_EncodeCodePage() with code page CP_ACP and errors == NULL.
 */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    const char *default_errors = NULL;

    return PyUnicode_EncodeCodePage(CP_ACP, unicode, default_errors);
}
#undef NEED_RETRY
#endif /* MS_WINDOWS */
/* --- Character Mapping Codec -------------------------------------------- */
static int