Decentralize Python native module linkage

We can now link even smaller Python binaries. For example, the hello.com program in the Python build directory is a compiled and linked executable of hello.py, which just prints hello world. Using decentralized sections, we can make that binary 1.9 MB in size (noting that python.com is 6.3 MB!). This works for nontrivial programs too. For example, say we want an APE binary that's equivalent to python.com -m http.server. Our makefile now builds such a binary using the new launcher, and it's only 3.2 MB in size, since Python sources get turned into ELF objects, which tell our linker that we need things like native hashing algorithm code.
2025-07-07 19:58:30 +00:00 · 2021-09-07 11:40:11 -07:00 · 2021-09-07 11:40:11 -07:00 · 559b024e1d
commit 559b024e1d
parent dfa0359b50
129 changed files with 2798 additions and 13514 deletions
--- a/third_party/python/Lib/_bootlocale.py
+++ b/third_party/python/Lib/_bootlocale.py
@ -7,28 +7,15 @@ Don't import directly from third-party code; use the `locale` module instead!
 import sys
 import _locale

-if sys.platform.startswith("win"):
-    def getpreferredencoding(do_setlocale=True):
-        return _locale._getdefaultlocale()[1]
-else:
-    try:
-        _locale.CODESET
-    except AttributeError:
-        def getpreferredencoding(do_setlocale=True):
-            # This path for legacy systems needs the more complex
-            # getdefaultlocale() function, import the full locale module.
-            import locale
-            return locale.getpreferredencoding(do_setlocale)
-    else:
-        def getpreferredencoding(do_setlocale=True):
-            assert not do_setlocale
-            result = _locale.nl_langinfo(_locale.CODESET)
-            if not result and sys.platform in ('darwin', 'cosmo'):
-                # nl_langinfo can return an empty string
-                # when the setting has an invalid value.
-                # Default to UTF-8 in that case because
-                # UTF-8 is the default charset on OSX and
-                # returning nothing will crash the
-                # interpreter.
-                result = 'UTF-8'
-            return result
+def getpreferredencoding(do_setlocale=True):
+    assert not do_setlocale
+    result = _locale.nl_langinfo(_locale.CODESET)
+    if not result and sys.platform in ('darwin', 'cosmo'):
+        # nl_langinfo can return an empty string
+        # when the setting has an invalid value.
+        # Default to UTF-8 in that case because
+        # UTF-8 is the default charset on OSX and
+        # returning nothing will crash the
+        # interpreter.
+        result = 'UTF-8'
+    return result
--- a/third_party/python/Lib/_strptime.py
+++ b/third_party/python/Lib/_strptime.py
@ -11,7 +11,7 @@ FUNCTIONS:

 """
 import time
-import locale
+# import locale
 import calendar
 from re import compile as re_compile
 from re import IGNORECASE
@ -28,7 +28,8 @@ __all__ = []

 def _getlang():
    # Figure out what the current language is set to.
-    return locale.getlocale(locale.LC_TIME)
+    # return locale.getlocale(locale.LC_TIME)
+    return (None, None)

 class LocaleTime(object):
    """Stores and handles locale-specific information related to time.
--- a/third_party/python/Lib/collections/init.py
+++ b/third_party/python/Lib/collections/init.py
@ -39,17 +39,10 @@ from _weakref import proxy as _proxy
 from itertools import repeat as _repeat, chain as _chain, starmap as _starmap
 from reprlib import recursive_repr as _recursive_repr

-try:
-    from _collections import deque
-except ImportError:
-    pass
-else:
-    MutableSequence.register(deque)
+from _collections import deque
+MutableSequence.register(deque)

-try:
-    from _collections import defaultdict
-except ImportError:
-    pass
+from _collections import defaultdict


 ################################################################################
--- a/third_party/python/Lib/hashlib.py
+++ b/third_party/python/Lib/hashlib.py
@ -11,8 +11,7 @@ new(name, data=b'', **kwargs) - returns a new hash object implementing the
 Named constructor functions are also available, these are faster
 than using new(name):

-md5(), sha1(), sha224(), sha256(), sha384(), sha512(),
-sha3_224, sha3_256, sha3_384, sha3_512, shake_128, and shake_256.
+md5(), sha1(), sha224(), sha256(), sha384(), sha512(), and blake2b256().

 More algorithms may be available on your platform but the above are guaranteed
 to exist.  See the algorithms_guaranteed and algorithms_available attributes
@ -56,9 +55,7 @@ More condensed:
 # This tuple and __get_builtin_constructor() must be modified if a new
 # always available algorithm is added.
 __always_supported = ('md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512',
-                      'sha3_224', 'sha3_256', 'sha3_384', 'sha3_512',
-                      'shake_128', 'shake_256')
-
+                      'blake2b256')

 algorithms_guaranteed = set(__always_supported)
 algorithms_available = set(__always_supported)
@ -73,23 +70,12 @@ def __get_builtin_constructor(name):
    constructor = cache.get(name)
    if constructor is not None:
        return constructor
-    try:
-        if name in ('SHA1', 'sha1'):
-            import _sha1
-            cache['SHA1'] = cache['sha1'] = _sha1.sha1
-        elif name in ('MD5', 'md5'):
-            import _md5
-            cache['MD5'] = cache['md5'] = _md5.md5
-        elif name in ('SHA256', 'sha256', 'SHA224', 'sha224'):
-            import _sha256
-            cache['SHA224'] = cache['sha224'] = _sha256.sha224
-            cache['SHA256'] = cache['sha256'] = _sha256.sha256
-        elif name in ('SHA512', 'sha512', 'SHA384', 'sha384'):
-            import _sha512
-            cache['SHA384'] = cache['sha384'] = _sha512.sha384
-            cache['SHA512'] = cache['sha512'] = _sha512.sha512
-        elif name in {'sha3_224', 'sha3_256', 'sha3_384', 'sha3_512',
-                      'shake_128', 'shake_256'}:
+    if name in ('MD5', 'md5'):
+        import _md5
+        cache['MD5'] = cache['md5'] = _md5.md5
+    elif name in {'sha3_224', 'sha3_256', 'sha3_384', 'sha3_512',
+                  'shake_128', 'shake_256'}:
+        try:
            import _sha3
            cache['sha3_224'] = _sha3.sha3_224
            cache['sha3_256'] = _sha3.sha3_256
@ -97,21 +83,19 @@ def __get_builtin_constructor(name):
            cache['sha3_512'] = _sha3.sha3_512
            cache['shake_128'] = _sha3.shake_128
            cache['shake_256'] = _sha3.shake_256
-    except ImportError:
-        pass  # no extension module, this hash is unsupported.
-
+        except ImportError:
+            raise ValueError('unsupported hash type ' + name)
    constructor = cache.get(name)
    if constructor is not None:
        return constructor
-
    raise ValueError('unsupported hash type ' + name)


-def __get_openssl_constructor(name):
+def __get_mbedtls_constructor(name):
    try:
-        f = getattr(_hashlib, 'openssl_' + name)
+        f = getattr(_hashlib, 'mbedtls_' + name)
        # Allow the C module to raise ValueError.  The function will be
-        # defined but the hash not actually available thanks to OpenSSL.
+        # defined but the hash not actually available thanks to Mbedtls.
        f()
        # Use the C function directly (very fast)
        return f
@ -134,25 +118,21 @@ def __hash_new(name, data=b'', **kwargs):
    try:
        return _hashlib.new(name, data)
    except ValueError:
-        # If the _hashlib module (OpenSSL) doesn't support the named
+        # If the _hashlib module (Mbedtls) doesn't support the named
        # hash, try using our builtin implementations.
        # This allows for SHA224/256 and SHA384/512 support even though
-        # the OpenSSL library prior to 0.9.8 doesn't provide them.
+        # the Mbedtls library prior to 0.9.8 doesn't provide them.
        return __get_builtin_constructor(name)(data)


-try:
-    import _hashlib
-    new = __hash_new
-    __get_hash = __get_openssl_constructor
-    algorithms_available = algorithms_available.union(
-            _hashlib.openssl_md_meth_names)
-except ImportError:
-    new = __py_new
-    __get_hash = __get_builtin_constructor
+import _hashlib
+new = __hash_new
+__get_hash = __get_mbedtls_constructor
+algorithms_available = algorithms_available.union(
+        _hashlib.mbedtls_md_meth_names)

 try:
-    # OpenSSL's PKCS5_PBKDF2_HMAC requires OpenSSL 1.0+ with HMAC and SHA
+    # Mbedtls's PKCS5_PBKDF2_HMAC requires Mbedtls 1.0+ with HMAC and SHA
    from _hashlib import pbkdf2_hmac
 except ImportError:
    _trans_5C = bytes((x ^ 0x5C) for x in range(256))
@ -162,7 +142,7 @@ except ImportError:
        """Password based key derivation function 2 (PKCS #5 v2.0)

        This Python implementations based on the hmac module about as fast
-        as OpenSSL's PKCS5_PBKDF2_HMAC for short passwords and much faster
+        as Mbedtls's PKCS5_PBKDF2_HMAC for short passwords and much faster
        for long passwords.
        """
        if not isinstance(hash_name, str):
@ -216,26 +196,19 @@ except ImportError:
        return dkey[:dklen]

 try:
-    # OpenSSL's scrypt requires OpenSSL 1.1+
+    # Mbedtls's scrypt requires Mbedtls 1.1+
    from _hashlib import scrypt
 except ImportError:
    pass

-
 md5 = __get_hash('md5')
 sha1 = __get_hash('sha1')
 sha224 = __get_hash('sha224')
 sha256 = __get_hash('sha256')
 sha384 = __get_hash('sha384')
 sha512 = __get_hash('sha512')
-sha3_224 = __get_hash('sha3_224')
-sha3_256 = __get_hash('sha3_256')
-sha3_384 = __get_hash('sha3_384')
-sha3_512 = __get_hash('sha3_512')
-shake_128 = __get_hash('shake_128')
-shake_256 = __get_hash('shake_256')
-
+blake2b256 = __get_hash('blake2b256')

 # Cleanup locals()
 del __always_supported, __get_hash
-del __py_new, __hash_new, __get_openssl_constructor
+del __py_new, __hash_new, __get_mbedtls_constructor
--- a/third_party/python/Lib/hello.py
+++ b/third_party/python/Lib/hello.py
@ -0,0 +1 @@
+print("hello world")
--- a/third_party/python/Lib/importlib/_bootstrap_external.py
+++ b/third_party/python/Lib/importlib/_bootstrap_external.py
@ -1364,7 +1364,7 @@ def _get_supported_file_loaders():
    extensions = ExtensionFileLoader, _imp.extension_suffixes()
    source = SourceFileLoader, SOURCE_SUFFIXES
    bytecode = SourcelessFileLoader, BYTECODE_SUFFIXES
-    return [extensions, bytecode, source]
+    return [bytecode, extensions, source]


 def _setup(_bootstrap_module):
--- a/third_party/python/Lib/launchpy.py
+++ b/third_party/python/Lib/launchpy.py
@ -0,0 +1,15 @@
+import sys
+from importlib import _bootstrap_external
+
+def run_module_as_main(mod_name):
+    path = "/zip/.python/%s.pyc" % (mod_name.replace(".", "/"))
+    loader = _bootstrap_external.SourcelessFileLoader(mod_name, path)
+    code = loader.get_code(mod_name)
+    globs = sys.modules["__main__"].__dict__
+    globs["__name__"] = "__main__"
+    globs["__file__"] = path
+    globs["__package__"] = None
+    globs["__loader__"] = loader
+    globs["__spec__"] = None
+    exec(code, globs)
+    return globs
--- a/third_party/python/Lib/os.py
+++ b/third_party/python/Lib/os.py
@ -1158,7 +1158,10 @@ def popen(cmd, mode="r", buffering=-1):
        raise ValueError("invalid mode %r" % mode)
    if buffering == 0 or buffering is None:
        raise ValueError("popen() does not support unbuffered streams")
-    import subprocess, io
+    try:
+        import subprocess, io
+    except ImportError:
+        raise ImportError('please use subprocess module')
    if mode == "r":
        proc = subprocess.Popen(cmd,
                                shell=True,
--- a/third_party/python/Lib/pydoc.py
+++ b/third_party/python/Lib/pydoc.py
@ -2193,10 +2193,13 @@ def _start_server(urlhandler, port):
        >>> print(serverthread.error)
        None
   """
-    import http.server
-    import email.message
-    import select
-    import threading
+    try:
+        import http.server
+        import email.message
+        import select
+        import threading
+    except ImportError:
+        sys.exit(1)

    class DocHandler(http.server.BaseHTTPRequestHandler):

--- a/third_party/python/Lib/shutil.py
+++ b/third_party/python/Lib/shutil.py
@ -626,7 +626,10 @@ def _make_tarball(base_name, base_dir, compress="gzip", verbose=0, dry_run=0,
        raise ValueError("bad value for 'compress', or compression format not "
                         "supported : {0}".format(compress))

-    import tarfile  # late import for breaking circular dependency
+    try:
+        import tarfile
+    except ImportError:
+        raise

    compress_ext = '.' + tar_compression if compress else ''
    archive_name = base_name + '.tar' + compress_ext
@ -669,7 +672,10 @@ def _make_zipfile(base_name, base_dir, verbose=0, dry_run=0, logger=None):
    The output zip file will be named 'base_name' + ".zip".  Returns the
    name of the output zip file.
    """
-    import zipfile  # late import for breaking circular dependency
+    try:
+        import zipfile
+    except ImportError:
+        raise

    zip_filename = base_name + ".zip"
    archive_dir = os.path.dirname(base_name)
@ -877,7 +883,10 @@ def _ensure_directory(path):
 def _unpack_zipfile(filename, extract_dir):
    """Unpack zip `filename` to `extract_dir`
    """
-    import zipfile  # late import for breaking circular dependency
+    try:
+        import zipfile
+    except ImportError:
+        raise

    if not zipfile.is_zipfile(filename):
        raise ReadError("%s is not a zip file" % filename)
@ -911,7 +920,10 @@ def _unpack_zipfile(filename, extract_dir):
 def _unpack_tarfile(filename, extract_dir):
    """Unpack tar/tar.gz/tar.bz2/tar.xz `filename` to `extract_dir`
    """
-    import tarfile  # late import for breaking circular dependency
+    try:
+        import tarfile
+    except ImportError:
+        raise
    try:
        tarobj = tarfile.open(filename)
    except tarfile.TarError:
@ -1003,22 +1015,6 @@ if hasattr(os, 'statvfs'):
        used = (st.f_blocks - st.f_bfree) * st.f_frsize
        return _ntuple_diskusage(total, used, free)

-elif os.name == 'nt':
-
-    import nt
-    __all__.append('disk_usage')
-    _ntuple_diskusage = collections.namedtuple('usage', 'total used free')
-
-    def disk_usage(path):
-        """Return disk usage statistics about the given path.
-
-        Returned values is a named tuple with attributes 'total', 'used' and
-        'free', which are the amount of total, used and free space, in bytes.
-        """
-        total, free = nt._getdiskusage(path)
-        used = total - free
-        return _ntuple_diskusage(total, used, free)
-

 def chown(path, user=None, group=None):
    """Change owner user and group of the given path.