Perform build and magnum tuning

Building o//third_party/python now takes 5 seconds on my PC

This change works toward having Python use runtime dispatch where
appropriate. For example, when loading the magnums in the socket
module, it's a good idea to check whether a magnum is zero, because
zero means the local system platform doesn't support it.
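As a sketch of that pattern (illustrative only: the constant and the
module-init call here are examples, not necessarily what this commit
touches):

    /* Only expose a socket magnum when the running platform defines it.
       In Cosmopolitan, magnums like SO_REUSEPORT are runtime constants
       that hold zero on platforms lacking the feature. */
    if (SO_REUSEPORT) {
        PyModule_AddIntConstant(m, "SO_REUSEPORT", SO_REUSEPORT);
    }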
Justine Tunney 2021-08-10 10:26:13 -07:00
parent ee7e296339
commit d26d7ae0e4
1028 changed files with 6576 additions and 172777 deletions

View file

@@ -1,11 +1,9 @@
/* clang-format off */
/* Abstract Object Interface (many thanks to Jim Fulton) */
#include "Python.h"
#include <ctype.h>
#include "structmember.h" /* we need the offsetof() macro from there */
#include "longintrepr.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
#include "third_party/python/Include/longintrepr.h"
/* Shorthands to return certain errors */

View file

@@ -1,7 +1,8 @@
/* clang-format off */
/* Accumulator struct implementation */
#include "Python.h"
#include "accu.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/accu.h"
static PyObject *
join_list_unicode(PyObject *lst)

View file

@@ -1,7 +1,8 @@
/* clang-format off */
/* Boolean type, a subtype of int */
#include "Python.h"
#include "longintrepr.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/longintrepr.h"
/* We define bool_repr to return "False" or "True" */

View file

@@ -1,11 +1,12 @@
/* clang-format off */
/* PyByteArray (bytearray) implementation */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
#include "structmember.h"
#include "bytes_methods.h"
#include "bytesobject.h"
#include "pystrhex.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
#include "third_party/python/Include/bytes_methods.h"
#include "third_party/python/Include/bytesobject.h"
#include "third_party/python/Include/pystrhex.h"
/*[clinic input]
class bytearray "PyByteArrayObject *" "&PyByteArray_Type"
@@ -90,7 +91,7 @@ _canresize(PyByteArrayObject *self)
return 1;
}
#include "clinic/bytearrayobject.c.h"
#include "third_party/python/Objects/clinic/bytearrayobject.inc"
/* Direct API functions */
@@ -1106,14 +1107,14 @@ bytearray_dealloc(PyByteArrayObject *self)
#define STRINGLIB_CHECK_EXACT PyByteArray_CheckExact
#define STRINGLIB_MUTABLE 1
#include "stringlib/fastsearch.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/join.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/ctype.h"
#include "stringlib/transmogrify.h"
#include "third_party/python/Objects/stringlib/fastsearch.inc"
#include "third_party/python/Objects/stringlib/count.inc"
#include "third_party/python/Objects/stringlib/find.inc"
#include "third_party/python/Objects/stringlib/join.inc"
#include "third_party/python/Objects/stringlib/partition.inc"
#include "third_party/python/Objects/stringlib/split.inc"
#include "third_party/python/Objects/stringlib/ctype.inc"
#include "third_party/python/Objects/stringlib/transmogrify.inc"
static PyObject *

View file

@@ -1,6 +1,7 @@
/* clang-format off */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
#include "bytes_methods.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/bytes_methods.h"
PyDoc_STRVAR_shared(_Py_isspace__doc__,
"B.isspace() -> bool\n\
@@ -394,9 +395,9 @@ _Py_bytes_maketrans(Py_buffer *frm, Py_buffer *to)
#define STRINGLIB_CHAR char
#define STRINGLIB_SIZEOF_CHAR 1
#include "stringlib/fastsearch.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "third_party/python/Objects/stringlib/fastsearch.inc"
#include "third_party/python/Objects/stringlib/count.inc"
#include "third_party/python/Objects/stringlib/find.inc"
/*
Wraps stringlib_parse_args_finds() and additionally checks whether the

View file

@@ -1,19 +1,18 @@
/* clang-format off */
/* bytes object implementation */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
#include "bytes_methods.h"
#include "pystrhex.h"
#include <stddef.h>
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/bytes_methods.h"
#include "third_party/python/Include/pystrhex.h"
/*[clinic input]
class bytes "PyBytesObject *" "&PyBytes_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=7a238f965d64892b]*/
#include "clinic/bytesobject.c.h"
#include "third_party/python/Objects/clinic/bytesobject.inc"
#ifdef COUNT_ALLOCS
Py_ssize_t null_strings, one_strings;
@@ -1318,17 +1317,17 @@ PyBytes_AsStringAndSize(PyObject *obj,
/* -------------------------------------------------------------------- */
/* Methods */
#include "stringlib/stringdefs.h"
#include "third_party/python/Objects/stringlib/stringdefs.inc"
#include "stringlib/fastsearch.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/join.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/ctype.h"
#include "third_party/python/Objects/stringlib/fastsearch.inc"
#include "third_party/python/Objects/stringlib/count.inc"
#include "third_party/python/Objects/stringlib/find.inc"
#include "third_party/python/Objects/stringlib/join.inc"
#include "third_party/python/Objects/stringlib/partition.inc"
#include "third_party/python/Objects/stringlib/split.inc"
#include "third_party/python/Objects/stringlib/ctype.inc"
#include "stringlib/transmogrify.h"
#include "third_party/python/Objects/stringlib/transmogrify.inc"
PyObject *
PyBytes_Repr(PyObject *obj, int smartquotes)

View file

@@ -1,6 +1,7 @@
/* clang-format off */
/* Wrap void * pointers to be passed between C modules */
#include "Python.h"
#include "third_party/python/Include/Python.h"
/* Internal structure of PyCapsule */
typedef struct {

View file

@@ -1,6 +1,7 @@
/* clang-format off */
/* Cell object implementation */
#include "Python.h"
#include "third_party/python/Include/Python.h"
PyObject *
PyCell_New(PyObject *obj)

View file

@@ -1,7 +1,8 @@
/* clang-format off */
/* Class object implementation (dead now except for methods) */
#include "Python.h"
#include "structmember.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
#define TP_DESCR_GET(t) ((t)->tp_descr_get)

View file

@@ -1,8 +1,7 @@
#include <stdbool.h>
#include "Python.h"
#include "code.h"
#include "structmember.h"
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/code.h"
#include "third_party/python/Include/structmember.h"
#define NAME_CHARS \
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"

View file

@@ -1,13 +1,11 @@
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
/* Complex object implementation */
/* Borrows heavily from floatobject.c */
/* Submitted by Jim Hugunin */
#include "Python.h"
#include "structmember.h"
/* elementary operations on complex numbers */
static Py_complex c_1 = {1., 0.};

View file

@@ -1,7 +1,8 @@
/* Descriptors -- a new, flexible way to describe attributes */
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
#include "Python.h"
#include "structmember.h" /* Why is this not included in Python.h? */
/* Descriptors -- a new, flexible way to describe attributes */
static void
descr_dealloc(PyDescrObject *descr)

View file

@@ -1,5 +1,6 @@
#ifndef Py_DICT_COMMON_H
#define Py_DICT_COMMON_H
/* clang-format off */
typedef struct {
/* Cached hash code of me_key. */

View file

@@ -1,149 +0,0 @@
NOTES ON OPTIMIZING DICTIONARIES
================================
Principal Use Cases for Dictionaries
------------------------------------
Passing keyword arguments
Typically, one read and one write for 1 to 3 elements.
Occurs frequently in normal python code.
Class method lookup
Dictionaries vary in size with 8 to 16 elements being common.
Usually written once with many lookups.
When base classes are used, there are many failed lookups
followed by a lookup in a base class.
Instance attribute lookup and Global variables
Dictionaries vary in size. 4 to 10 elements are common.
Both reads and writes are common.
Builtins
Frequent reads. Almost never written.
About 150 interned strings (as of Py3.3).
A few keys are accessed much more frequently than others.
Uniquification
Dictionaries of any size. Bulk of work is in creation.
Repeated writes to a smaller set of keys.
Single read of each key.
Some use cases have two consecutive accesses to the same key.
* Removing duplicates from a sequence.
      dict.fromkeys(seqn).keys()
* Counting elements in a sequence.
      for e in seqn:
          d[e] = d.get(e, 0) + 1
* Accumulating references in a dictionary of lists:
      for pagenumber, page in enumerate(pages):
          for word in page:
              d.setdefault(word, []).append(pagenumber)
Note, the second example is a use case characterized by a get and set
to the same key. There are similar use cases with a __contains__
followed by a get, set, or del to the same key. Part of the
justification for d.setdefault is combining the two lookups into one.
Membership Testing
Dictionaries of any size. Created once and then rarely changes.
Single write to each key.
Many calls to __contains__() or has_key().
Similar access patterns occur with replacement dictionaries
such as with the % formatting operator.
Dynamic Mappings
Characterized by deletions interspersed with adds and replacements.
Performance benefits greatly from the re-use of dummy entries.
Data Layout
-----------
Dictionaries are composed of 3 components:
The dictobject struct itself
A dict-keys object (keys & hashes)
A values array
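Roughly, in C (a sketch abridged from the layout this note describes,
circa CPython 3.4's dict-common.h; several fields are omitted):

    typedef struct {
        Py_hash_t me_hash;           /* cached hash code of me_key */
        PyObject *me_key;
        PyObject *me_value;          /* only meaningful in combined tables */
    } PyDictKeyEntry;

    typedef struct {
        Py_ssize_t dk_refcnt;        /* key-tables can be shared by dicts */
        Py_ssize_t dk_size;          /* size of the hash table */
        Py_ssize_t dk_usable;        /* usable slots before a resize */
        PyDictKeyEntry dk_entries[1];
    } PyDictKeysObject;

    typedef struct {
        PyObject_HEAD
        Py_ssize_t ma_used;          /* number of items */
        PyDictKeysObject *ma_keys;   /* keys & hashes */
        PyObject **ma_values;        /* values array; split tables only */
    } PyDictObject;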
Tunable Dictionary Parameters
-----------------------------
See comments for PyDict_MINSIZE_SPLIT, PyDict_MINSIZE_COMBINED,
USABLE_FRACTION and GROWTH_RATE in dictobject.c
Tune-ups should be measured across a broad range of applications and
use cases. A change to any parameter will help in some situations and
hurt in others. The key is to find settings that help the most common
cases and do the least damage to the less common cases. Results will
vary dramatically depending on the exact number of keys, whether the
keys are all strings, whether reads or writes dominate, the exact
hash values of the keys (some sets of values have fewer collisions than
others). Any one test or benchmark is likely to prove misleading.
While making a dictionary more sparse reduces collisions, it impairs
iteration and key listing. Those methods loop over every potential
entry. Doubling the size of a dictionary results in twice as many
non-overlapping memory accesses for keys(), items(), values(),
__iter__(), iterkeys(), iteritems(), itervalues(), and update().
Also, every dictionary iterates at least twice, once for the memset()
when it is created and once by dealloc().
Dictionary operations involving only a single key can be O(1) unless
resizing is possible. By checking for a resize only when the
dictionary can grow (and may *require* resizing), other operations
remain O(1), and the odds of resize thrashing or memory fragmentation
are reduced. In particular, an algorithm that empties a dictionary
by repeatedly invoking .pop will see no resizing, which might
not be necessary at all because the dictionary is eventually
discarded entirely.
The key differences between this implementation and earlier versions are:
1. The table can be split into two parts, the keys and the values.
2. There is an additional key-value combination: (key, NULL).
Unlike (<dummy>, NULL) which represents a deleted value, (key, NULL)
represents a yet-to-be-inserted value. This combination can only occur
when the table is split.
3. No small table embedded in the dict,
as this would make sharing of key-tables impossible.
These changes have the following consequences.
1. General dictionaries are slightly larger.
2. All object dictionaries of a single class can share a single key-table,
saving about 60% memory for such cases.
Results of Cache Locality Experiments
--------------------------------------
Experiments on an earlier design of dictionary, in which all tables were
combined, showed the following:
When an entry is retrieved from memory, several adjacent entries are also
retrieved into a cache line. Since accessing items in cache is *much*
cheaper than a cache miss, an enticing idea is to probe the adjacent
entries as a first step in collision resolution. Unfortunately, the
introduction of any regularity into collision searches results in more
collisions than the current random chaining approach.
Exploiting cache locality at the expense of additional collisions fails
to pay off when the entries are already loaded in cache (the expense
is paid with no compensating benefit). This occurs in small dictionaries
where the whole dictionary fits into a pair of cache lines. It also
occurs frequently in large dictionaries which have a common access pattern
where some keys are accessed much more frequently than others. The
more popular entries *and* their collision chains tend to remain in cache.
To exploit cache locality, change the collision resolution section
in lookdict() and lookdict_string(). Set i^=1 at the top of the
loop and move the i = (i << 2) + i + perturb + 1 to an unrolled
version of the loop.
For split tables, the above will apply to the keys, but the value will
always be in a different cache line from the key.
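For concreteness, the proposed probe loop would look something like this
sketch (the real lookdict also handles dummy entries, free slots, and
comparison errors):

    size_t perturb = (size_t)hash;
    size_t i = (size_t)hash & mask;
    for (;;) {
        /* ... check entry i for a hit or a free slot ... */
        i ^= 1;                      /* partner slot, same cache line */
        /* ... check entry i again ... */
        perturb >>= PERTURB_SHIFT;
        i = ((i << 2) + i + perturb + 1) & mask;  /* i = i*5 + perturb + 1 */
    }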

View file

@@ -1,3 +1,4 @@
/* clang-format off */
/* Dictionary object implementation using a hash table */
/* The distribution includes a separate file, Objects/dictnotes.txt,
@@ -109,9 +110,9 @@ converting the dict to the combined table.
*/
#define PyDict_MINSIZE 8
#include "Python.h"
#include "dict-common.h"
#include "stringlib/eq.h" /* to get unicode_eq() */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Objects/dict-common.h"
#include "third_party/python/Objects/stringlib/eq.inc" /* to get unicode_eq() */
/*[clinic input]
class dict "PyDictObject *" "&PyDict_Type"
@@ -255,7 +256,7 @@ static int numfree = 0;
static PyDictKeysObject *keys_free_list[PyDict_MAXFREELIST];
static int numfreekeys = 0;
#include "clinic/dictobject.c.h"
#include "third_party/python/Objects/clinic/dictobject.inc"
int
PyDict_ClearFreeList(void)

View file

@@ -1,6 +1,7 @@
/* clang-format off */
/* enumerate object */
#include "Python.h"
#include "third_party/python/Include/Python.h"
typedef struct {
PyObject_HEAD

View file

@@ -1,3 +1,4 @@
/* clang-format off */
/*
* New exceptions.c written in Iceland by Richard Jones and Georg Brandl.
*
@@ -5,9 +6,9 @@
*/
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include "structmember.h"
#include "osdefs.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
#include "third_party/python/Include/osdefs.h"
/* Compatibility aliases */

View file

@@ -1,7 +1,8 @@
/* clang-format off */
/* File object implementation (what's left of it -- see io.py) */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
#include "third_party/python/Include/Python.h"
#include "libc/stdio/unlocked.h"
#if defined(HAVE_GETC_UNLOCKED) && !defined(_Py_MEMORY_SANITIZER)
@@ -21,10 +22,6 @@
#define NEWLINE_LF 2 /* \n newline seen */
#define NEWLINE_CRLF 4 /* \r\n newline seen */
#ifdef __cplusplus
extern "C" {
#endif
/* External C interface */
PyObject *
@@ -525,8 +522,3 @@ PyTypeObject PyStdPrinter_Type = {
stdprinter_new, /* tp_new */
PyObject_Del, /* tp_free */
};
#ifdef __cplusplus
}
#endif

View file

@@ -1,15 +1,12 @@
/* clang-format off */
/* Float object implementation */
/* XXX There should be overflow checks here, but it's hard to check
for any kind of float exception without losing portability. */
#include "Python.h"
#include "third_party/python/Include/Python.h"
#include "libc/runtime/fenv.h"
#include <ctype.h>
#include <float.h>
/* Special free list
free_list is a singly-linked list of available PyFloatObjects, linked
via abuse of their ob_type members.

View file

@@ -1,11 +1,11 @@
/* clang-format off */
/* Frame object implementation */
#include "Python.h"
#include "code.h"
#include "frameobject.h"
#include "opcode.h"
#include "structmember.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/code.h"
#include "third_party/python/Include/frameobject.h"
#include "third_party/python/Include/opcode.h"
#include "third_party/python/Include/structmember.h"
#define OFF(x) offsetof(PyFrameObject, x)

View file

@@ -1,9 +1,10 @@
/* clang-format off */
/* Function object implementation */
#include "Python.h"
#include "code.h"
#include "structmember.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/code.h"
#include "third_party/python/Include/structmember.h"
PyObject *
PyFunction_NewWithQualName(PyObject *code, PyObject *globals, PyObject *qualname)

View file

@@ -1,9 +1,10 @@
/* clang-format off */
/* Generator object implementation */
#include "Python.h"
#include "frameobject.h"
#include "structmember.h"
#include "opcode.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/frameobject.h"
#include "third_party/python/Include/structmember.h"
#include "third_party/python/Include/opcode.h"
static PyObject *gen_close(PyGenObject *, PyObject *);
static PyObject *async_gen_asend_new(PyAsyncGenObject *, PyObject *);

View file

@@ -1,6 +1,7 @@
/* clang-format off */
/* Iterator objects */
#include "Python.h"
#include "third_party/python/Include/Python.h"
typedef struct {
PyObject_HEAD

View file

@@ -1,13 +1,8 @@
/* clang-format off */
/* List object implementation */
#include "Python.h"
#include "accu.h"
#ifdef STDC_HEADERS
#include <stddef.h>
#else
#include <sys/types.h> /* For size_t */
#endif
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/accu.h"
/* Ensure ob_item has room for at least newsize elements, and set
* ob_size to newsize. If newsize > ob_size on entry, the content

View file

@@ -1,755 +0,0 @@
Intro
-----
This describes an adaptive, stable, natural mergesort, modestly called
timsort (hey, I earned it <wink>). It has supernatural performance on many
kinds of partially ordered arrays (less than lg(N!) comparisons needed, and
as few as N-1), yet as fast as Python's previous highly tuned samplesort
hybrid on random arrays.
In a nutshell, the main routine marches over the array once, left to right,
alternately identifying the next run, then merging it into the previous
runs "intelligently". Everything else is complication for speed, and some
hard-won measure of memory efficiency.
Comparison with Python's Samplesort Hybrid
------------------------------------------
+ timsort can require a temp array containing as many as N//2 pointers,
which means as many as 2*N extra bytes on 32-bit boxes. It can be
expected to require a temp array this large when sorting random data; on
data with significant structure, it may get away without using any extra
heap memory. This appears to be the strongest argument against it, but
compared to the size of an object, 2 temp bytes worst-case (also expected-
case for random data) doesn't scare me much.
It turns out that Perl is moving to a stable mergesort, and the code for
that appears always to require a temp array with room for at least N
pointers. (Note that I wouldn't want to do that even if space weren't an
issue; I believe its efforts at memory frugality also save timsort
significant pointer-copying costs, and allow it to have a smaller working
set.)
+ Across about four hours of generating random arrays, and sorting them
under both methods, samplesort required about 1.5% more comparisons
(the program is at the end of this file).
+ In real life, this may be faster or slower on random arrays than
samplesort was, depending on platform quirks. Since it does fewer
comparisons on average, it can be expected to do better the more
expensive a comparison function is. OTOH, it does more data movement
(pointer copying) than samplesort, and that may negate its small
comparison advantage (depending on platform quirks) unless comparison
is very expensive.
+ On arrays with many kinds of pre-existing order, this blows samplesort out
of the water. It's significantly faster than samplesort even on some
cases samplesort was special-casing the snot out of. I believe that lists
very often do have exploitable partial order in real life, and this is the
strongest argument in favor of timsort (indeed, samplesort's special cases
for extreme partial order are appreciated by real users, and timsort goes
much deeper than those, in particular naturally covering every case where
someone has suggested "and it would be cool if list.sort() had a special
case for this too ... and for that ...").
+ Here are exact comparison counts across all the tests in sortperf.py,
when run with arguments "15 20 1".
Column Key:
*sort: random data
\sort: descending data
/sort: ascending data
3sort: ascending, then 3 random exchanges
+sort: ascending, then 10 random at the end
%sort: ascending, then randomly replace 1% of elements w/ random values
~sort: many duplicates
=sort: all equal
!sort: worst case scenario
First the trivial cases, trivial for samplesort because it special-cased
them, and trivial for timsort because it naturally works on runs. Within
an "n" block, the first line gives the # of compares done by samplesort,
the second line by timsort, and the third line is the percentage by
which the samplesort count exceeds the timsort count:
      n    \sort    /sort    =sort
-------   ------   ------   ------
  32768    32768    32767    32767   samplesort
           32767    32767    32767   timsort
           0.00%    0.00%    0.00%   (samplesort - timsort) / timsort

  65536    65536    65535    65535
           65535    65535    65535
           0.00%    0.00%    0.00%

 131072   131072   131071   131071
          131071   131071   131071
           0.00%    0.00%    0.00%

 262144   262144   262143   262143
          262143   262143   262143
           0.00%    0.00%    0.00%

 524288   524288   524287   524287
          524287   524287   524287
           0.00%    0.00%    0.00%

1048576  1048576  1048575  1048575
         1048575  1048575  1048575
           0.00%    0.00%    0.00%
The algorithms are effectively identical in these cases, except that
timsort does one less compare in \sort.
Now for the more interesting cases. Where lg(x) is the logarithm of x to
the base 2 (e.g., lg(8)=3), lg(n!) is the information-theoretic limit for
the best any comparison-based sorting algorithm can do on average (across
all permutations). When a method gets significantly below that, it's
either astronomically lucky, or is finding exploitable structure in the
data.
      n    lg(n!)      *sort      3sort      +sort      %sort      ~sort      !sort
-------  --------   --------   --------   --------   --------   --------   --------
  32768    444255     453096     453614      32908     452871     130491     469141   old
                      448885      33016      33007      50426     182083      65534   new
                       0.94%   1273.92%     -0.30%    798.09%    -28.33%    615.87%   %ch from new

  65536    954037     972699     981940      65686     973104     260029    1004607
                      962991      65821      65808     101667     364341     131070
                       1.01%   1391.83%     -0.19%    857.15%    -28.63%    666.47%

 131072   2039137    2101881    2091491     131232    2092894     554790    2161379
                     2057533     131410     131361     206193     728871     262142
                       2.16%   1491.58%     -0.10%    915.02%    -23.88%    724.51%

 262144   4340409    4464460    4403233     262314    4445884    1107842    4584560
                     4377402     262437     262459     416347    1457945     524286
                       1.99%   1577.82%     -0.06%    967.83%    -24.01%    774.44%

 524288   9205096    9453356    9408463     524468    9441930    2218577    9692015
                     9278734     524580     524633     837947    2916107    1048574
                       1.88%   1693.52%     -0.03%   1026.79%    -23.92%    824.30%

1048576  19458756   19950272   19838588    1048766   19912134    4430649   20434212
                    19606028    1048958    1048941    1694896    5832445    2097150
                       1.76%   1791.27%     -0.02%   1074.83%    -24.03%    874.38%
Discussion of cases:
*sort: There's no structure in random data to exploit, so the theoretical
limit is lg(n!). Both methods get close to that, and timsort is hugging
it (indeed, in a *marginal* sense, it's a spectacular improvement --
there's only about 1% left before hitting the wall, and timsort knows
darned well it's doing compares that won't pay on random data -- but so
does the samplesort hybrid). For contrast, Hoare's original random-pivot
quicksort does about 39% more compares than the limit, and the median-of-3
variant about 19% more.
3sort, %sort, and !sort: No contest; there's structure in this data, but
not of the specific kinds samplesort special-cases. Note that structure
in !sort wasn't put there on purpose -- it was crafted as a worst case for
a previous quicksort implementation. That timsort nails it came as a
surprise to me (although it's obvious in retrospect).
+sort: samplesort special-cases this data, and does a few less compares
than timsort. However, timsort runs this case significantly faster on all
boxes we have timings for, because timsort is in the business of merging
runs efficiently, while samplesort does much more data movement in this
(for it) special case.
~sort: samplesort's special cases for large masses of equal elements are
extremely effective on ~sort's specific data pattern, and timsort just
isn't going to get close to that, despite that it's clearly getting a
great deal of benefit out of the duplicates (the # of compares is much less
than lg(n!)). ~sort has a perfectly uniform distribution of just 4
distinct values, and as the distribution gets more skewed, samplesort's
equal-element gimmicks become less effective, while timsort's adaptive
strategies find more to exploit; in a database supplied by Kevin Altis, a
sort on its highly skewed "on which stock exchange does this company's
stock trade?" field ran over twice as fast under timsort.
However, despite that timsort does many more comparisons on ~sort, and
that on several platforms ~sort runs highly significantly slower under
timsort, on other platforms ~sort runs highly significantly faster under
timsort. No other kind of data has shown this wild x-platform behavior,
and we don't have an explanation for it. The only thing I can think of
that could transform what "should be" highly significant slowdowns into
highly significant speedups on some boxes are catastrophic cache effects
in samplesort.
But timsort "should be" slower than samplesort on ~sort, so it's hard
to count that it isn't on some boxes as a strike against it <wink>.
+ Here's the highwater mark for the number of heap-based temp slots (4
bytes each on this box) needed by each test, again with arguments
"15 20 1":
   2**i   *sort  \sort  /sort   3sort  +sort   %sort   ~sort  =sort   !sort
  32768   16384      0      0    6256      0   10821   12288      0   16383
  65536   32766      0      0   21652      0   31276   24576      0   32767
 131072   65534      0      0   17258      0   58112   49152      0   65535
 262144  131072      0      0   35660      0  123561   98304      0  131071
 524288  262142      0      0   31302      0  212057  196608      0  262143
1048576  524286      0      0  312438      0  484942  393216      0  524287
Discussion: The tests that end up doing (close to) perfectly balanced
merges (*sort, !sort) need all N//2 temp slots (or almost all). ~sort
also ends up doing balanced merges, but systematically benefits a lot from
the preliminary pre-merge searches described under "Merge Memory" later.
%sort approaches having a balanced merge at the end because the random
selection of elements to replace is expected to produce an out-of-order
element near the midpoint. \sort, /sort, =sort are the trivial one-run
cases, needing no merging at all. +sort ends up having one very long run
and one very short, and so gets all the temp space it needs from the small
temparray member of the MergeState struct (note that the same would be
true if the new random elements were prefixed to the sorted list instead,
but not if they appeared "in the middle"). 3sort approaches N//3 temp
slots twice, but the run lengths that remain after 3 random exchanges
clearly have very high variance.
A detailed description of timsort follows.
Runs
----
count_run() returns the # of elements in the next run. A run is either
"ascending", which means non-decreasing:
a0 <= a1 <= a2 <= ...
or "descending", which means strictly decreasing:
a0 > a1 > a2 > ...
Note that a run is always at least 2 long, unless we start at the array's
last element.
The definition of descending is strict, because the main routine reverses
a descending run in-place, transforming a descending run into an ascending
run. Reversal is done via the obvious fast "swap elements starting at each
end, and converge at the middle" method, and that can violate stability if
the slice contains any equal elements. Using a strict definition of
descending ensures that a descending run contains distinct elements.
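Sketched in C (modeled on count_run in listobject.c; ISLT stands for the
richcompare-based less-than, and error handling is omitted):

    /* Return the length of the run beginning at lo; set *descending if
       the run strictly decreases, so the caller can reverse it in place. */
    static Py_ssize_t
    count_run(PyObject **lo, PyObject **hi, int *descending)
    {
        Py_ssize_t n = 2;
        *descending = 0;
        if (lo + 1 == hi)
            return 1;
        if (ISLT(lo[1], lo[0])) {            /* strictly decreasing */
            *descending = 1;
            for (; lo + n < hi; ++n)
                if (!ISLT(lo[n], lo[n - 1]))
                    break;
        }
        else {                               /* non-decreasing */
            for (; lo + n < hi; ++n)
                if (ISLT(lo[n], lo[n - 1]))
                    break;
        }
        return n;
    }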
If an array is random, it's very unlikely we'll see long runs. If a natural
run contains less than minrun elements (see next section), the main loop
artificially boosts it to minrun elements, via a stable binary insertion sort
applied to the right number of array elements following the short natural
run. In a random array, *all* runs are likely to be minrun long as a
result. This has two primary good effects:
1. Random data strongly tends then toward perfectly balanced (both runs have
the same length) merges, which is the most efficient way to proceed when
data is random.
2. Because runs are never very short, the rest of the code doesn't make
heroic efforts to shave a few cycles off per-merge overheads. For
example, reasonable use of function calls is made, rather than trying to
inline everything. Since there are no more than N/minrun runs to begin
with, a few "extra" function calls per merge is barely measurable.
Computing minrun
----------------
If N < 64, minrun is N. IOW, binary insertion sort is used for the whole
array then; it's hard to beat that given the overheads of trying something
fancier (see note BINSORT).
When N is a power of 2, testing on random data showed that minrun values of
16, 32, 64 and 128 worked about equally well. At 256 the data-movement cost
in binary insertion sort clearly hurt, and at 8 the increase in the number
of function calls clearly hurt. Picking *some* power of 2 is important
here, so that the merges end up perfectly balanced (see next section). We
pick 32 as a good value in the sweet range; picking a value at the low end
allows the adaptive gimmicks more opportunity to exploit shorter natural
runs.
Because sortperf.py only tries powers of 2, it took a long time to notice
that 32 isn't a good choice for the general case! Consider N=2112:
>>> divmod(2112, 32)
(66, 0)
>>>
If the data is randomly ordered, we're very likely to end up with 66 runs
each of length 32. The first 64 of these trigger a sequence of perfectly
balanced merges (see next section), leaving runs of lengths 2048 and 64 to
merge at the end. The adaptive gimmicks can do that with fewer than 2048+64
compares, but it's still more compares than necessary, and-- mergesort's
bugaboo relative to samplesort --a lot more data movement (O(N) copies just
to get 64 elements into place).
If we take minrun=33 in this case, then we're very likely to end up with 64
runs each of length 33, and then all merges are perfectly balanced. Better!
What we want to avoid is picking minrun such that in
q, r = divmod(N, minrun)
q is a power of 2 and r>0 (then the last merge only gets r elements into
place, and r < minrun is small compared to N), or q a little larger than a
power of 2 regardless of r (then we've got a case similar to "2112", again
leaving too little work for the last merge to do).
Instead we pick a minrun in range(32, 65) such that N/minrun is exactly a
power of 2, or if that isn't possible, is close to, but strictly less than,
a power of 2. This is easier to do than it may sound: take the first 6
bits of N, and add 1 if any of the remaining bits are set. In fact, that
rule covers every case in this section, including small N and exact powers
of 2; merge_compute_minrun() is a deceptively simple function.
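The bit trick is only a few lines of C; merge_compute_minrun in CPython's
listobject.c is essentially:

    static Py_ssize_t
    merge_compute_minrun(Py_ssize_t n)
    {
        Py_ssize_t r = 0;    /* becomes 1 if any 1 bits are shifted off */
        while (n >= 64) {
            r |= n & 1;
            n >>= 1;
        }
        return n + r;
    }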
The Merge Pattern
-----------------
In order to exploit regularities in the data, we're merging on natural
run lengths, and they can become wildly unbalanced. That's a Good Thing
for this sort! It means we have to find a way to manage an assortment of
potentially very different run lengths, though.
Stability constrains permissible merging patterns. For example, if we have
3 consecutive runs of lengths
A:10000 B:20000 C:10000
we dare not merge A with C first, because if A, B and C happen to contain
a common element, it would get out of order wrt its occurrence(s) in B. The
merging must be done as (A+B)+C or A+(B+C) instead.
So merging is always done on two consecutive runs at a time, and in-place,
although this may require some temp memory (more on that later).
When a run is identified, its base address and length are pushed on a stack
in the MergeState struct. merge_collapse() is then called to see whether it
should merge it with preceding run(s). We would like to delay merging as
long as possible in order to exploit patterns that may come up later, but we
like even more to do merging as soon as possible to exploit that the run just
found is still high in the memory hierarchy. We also can't delay merging
"too long" because it consumes memory to remember the runs that are still
unmerged, and the stack has a fixed size.
What turned out to be a good compromise maintains two invariants on the
stack entries, where A, B and C are the lengths of the three rightmost not-yet
merged slices:
1. A > B+C
2. B > C
Note that, by induction, #2 implies the lengths of pending runs form a
decreasing sequence. #1 implies that, reading the lengths right to left,
the pending-run lengths grow at least as fast as the Fibonacci numbers.
Therefore the stack can never grow larger than about log_base_phi(N) entries,
where phi = (1+sqrt(5))/2 ~= 1.618. Thus a small # of stack slots suffice
for very large arrays.
If A <= B+C, the smaller of A and C is merged with B (ties favor C, for the
freshness-in-cache reason), and the new run replaces the A,B or B,C entries;
e.g., if the last 3 entries are
A:30 B:20 C:10
then B is merged with C, leaving
A:30 BC:30
on the stack. Or if they were
A:500 B:400 C:1000
then A is merged with B, leaving
AB:900 C:1000
on the stack.
In both examples, the stack configuration after the merge still violates
invariant #2, and merge_collapse() goes on to continue merging runs until
both invariants are satisfied. As an extreme case, suppose we didn't do the
minrun gimmick, and natural runs were of lengths 128, 64, 32, 16, 8, 4, 2,
and 2. Nothing would get merged until the final 2 was seen, and that would
trigger 7 perfectly balanced merges.
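In C, the invariant-restoring loop is roughly the following (simplified
from merge_collapse; p is the stack of pending runs and merge_at(ms, n)
merges runs p[n] and p[n+1]):

    while (ms->n > 1) {
        Py_ssize_t n = ms->n - 2;
        if (n > 0 && p[n-1].len <= p[n].len + p[n+1].len) {
            if (p[n-1].len < p[n+1].len)
                --n;                     /* merge A with B; ties favor C */
            if (merge_at(ms, n) < 0)
                return -1;
        }
        else if (p[n].len <= p[n+1].len) {
            if (merge_at(ms, n) < 0)     /* restore invariant B > C */
                return -1;
        }
        else
            break;                       /* both invariants hold */
    }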
The thrust of these rules when they trigger merging is to balance the run
lengths as closely as possible, while keeping a low bound on the number of
runs we have to remember. This is maximally effective for random data,
where all runs are likely to be of (artificially forced) length minrun, and
then we get a sequence of perfectly balanced merges (with, perhaps, some
oddballs at the end).
OTOH, one reason this sort is so good for partly ordered data has to do
with wildly unbalanced run lengths.
Merge Memory
------------
Merging adjacent runs of lengths A and B in-place, and in linear time, is
difficult. Theoretical constructions are known that can do it, but they're
too difficult and slow for practical use. But if we have temp memory equal
to min(A, B), it's easy.
If A is smaller (function merge_lo), copy A to a temp array, leave B alone,
and then we can do the obvious merge algorithm left to right, from the temp
area and B, starting the stores into where A used to live. There's always a
free area in the original area comprising a number of elements equal to the
number not yet merged from the temp array (trivially true at the start;
proceed by induction). The only tricky bit is that if a comparison raises an
exception, we have to remember to copy the remaining elements back in from
the temp area, lest the array end up with duplicate entries from B. But
that's exactly the same thing we need to do if we reach the end of B first,
so the exit code is pleasantly common to both the normal and error cases.
If B is smaller (function merge_hi, which is merge_lo's "mirror image"),
much the same, except that we need to merge right to left, copying B into a
temp array and starting the stores at the right end of where B used to live.
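Stripped of galloping and error recovery, the heart of merge_lo is the
familiar two-finger merge (a sketch, not the verbatim code):

    /* A (length na) has been copied to tmp; merge tmp and B into the
       space starting where A used to live. Ties go to A, which keeps
       the sort stable because A is the left run. */
    PyObject **pa = tmp, **pb = ssb, **dest = ssa;
    Py_ssize_t a = na, b = nb;
    while (a > 0 && b > 0) {
        if (ISLT(*pb, *pa)) {
            *dest++ = *pb++; --b;        /* B's element strictly smaller */
        }
        else {
            *dest++ = *pa++; --a;        /* A wins ties */
        }
    }
    while (a-- > 0)
        *dest++ = *pa++;                 /* leftovers from the temp area */
    /* leftover B elements are already in their final positions */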
A refinement: When we're about to merge adjacent runs A and B, we first do
a form of binary search (more on that later) to see where B[0] should end up
in A. Elements in A preceding that point are already in their final
positions, effectively shrinking the size of A. Likewise we also search to
see where A[-1] should end up in B, and elements of B after that point can
also be ignored. This cuts the amount of temp memory needed by the same
amount.
These preliminary searches may not pay off, and can be expected *not* to
repay their cost if the data is random. But they can win huge in all of
time, copying, and memory savings when they do pay, so this is one of the
"per-merge overheads" mentioned above that we're happy to endure because
there is at most one very short run. It's generally true in this algorithm
that we're willing to gamble a little to win a lot, even though the net
expectation is negative for random data.
Merge Algorithms
----------------
merge_lo() and merge_hi() are where the bulk of the time is spent. merge_lo
deals with runs where A <= B, and merge_hi where A > B. They don't know
whether the data is clustered or uniform, but a lovely thing about merging
is that many kinds of clustering "reveal themselves" by how many times in a
row the winning merge element comes from the same run. We'll only discuss
merge_lo here; merge_hi is exactly analogous.
Merging begins in the usual, obvious way, comparing the first element of A
to the first of B, and moving B[0] to the merge area if it's less than A[0],
else moving A[0] to the merge area. Call that the "one pair at a time"
mode. The only twist here is keeping track of how many times in a row "the
winner" comes from the same run.
If that count reaches MIN_GALLOP, we switch to "galloping mode". Here
we *search* B for where A[0] belongs, and move over all the B's before
that point in one chunk to the merge area, then move A[0] to the merge
area. Then we search A for where B[0] belongs, and similarly move a
slice of A in one chunk. Then back to searching B for where A[0] belongs,
etc. We stay in galloping mode until both searches find slices to copy
less than MIN_GALLOP elements long, at which point we go back to one-pair-
at-a-time mode.
A refinement: The MergeState struct contains the value of min_gallop that
controls when we enter galloping mode, initialized to MIN_GALLOP.
merge_lo() and merge_hi() adjust this higher when galloping isn't paying
off, and lower when it is.
Galloping
---------
Still without loss of generality, assume A is the shorter run. In galloping
mode, we first look for A[0] in B. We do this via "galloping", comparing
A[0] in turn to B[0], B[1], B[3], B[7], ..., B[2**j - 1], ..., until finding
the k such that B[2**(k-1) - 1] < A[0] <= B[2**k - 1]. This takes at most
roughly lg(B) comparisons, and, unlike a straight binary search, favors
finding the right spot early in B (more on that later).
After finding such a k, the region of uncertainty is reduced to 2**(k-1) - 1
consecutive elements, and a straight binary search requires exactly k-1
additional comparisons to nail it (see note REGION OF UNCERTAINTY). Then we
copy all the B's up to that point in one chunk, and then copy A[0]. Note
that no matter where A[0] belongs in B, the combination of galloping + binary
search finds it in no more than about 2*lg(B) comparisons.
If we did a straight binary search, we could find it in no more than
ceiling(lg(B+1)) comparisons -- but straight binary search takes that many
comparisons no matter where A[0] belongs. Straight binary search thus loses
to galloping unless the run is quite long, and we simply can't guess
whether it is in advance.
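A sketch of the probe loop in C (the idea only, not CPython's exact
gallop_left, which also takes a hint and guards against overflow):

    Py_ssize_t lastofs = 0, ofs = 1;     /* ofs runs through 1, 2, 4, 8, ... */
    while (ofs < nb && ISLT(b[ofs - 1], key)) {
        lastofs = ofs;                   /* probes b[0], b[1], b[3], b[7], ... */
        ofs <<= 1;
    }
    if (ofs > nb)
        ofs = nb;
    /* Now b[lastofs - 1] < key <= b[ofs - 1], treating b[nb] as an
       imaginary +infinity; a plain binary search over (lastofs, ofs)
       pins down the exact position. */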
If data is random and runs have the same length, A[0] belongs at B[0] half
the time, at B[1] a quarter of the time, and so on: a consecutive winning
sub-run in B of length k occurs with probability 1/2**(k+1). So long
winning sub-runs are extremely unlikely in random data, and guessing that a
winning sub-run is going to be long is a dangerous game.
OTOH, if data is lopsided or lumpy or contains many duplicates, long
stretches of winning sub-runs are very likely, and cutting the number of
comparisons needed to find one from O(B) to O(log B) is a huge win.
Galloping compromises by getting out fast if there isn't a long winning
sub-run, yet finding such very efficiently when they exist.
I first learned about the galloping strategy in a related context; see:
"Adaptive Set Intersections, Unions, and Differences" (2000)
Erik D. Demaine, Alejandro López-Ortiz, J. Ian Munro
and its followup(s). An earlier paper called the same strategy
"exponential search":
"Optimistic Sorting and Information Theoretic Complexity"
Peter McIlroy
SODA (Fourth Annual ACM-SIAM Symposium on Discrete Algorithms), pp
467-474, Austin, Texas, 25-27 January 1993.
and it probably dates back to an earlier paper by Bentley and Yao. The
McIlroy paper in particular has good analysis of a mergesort that's
probably strongly related to this one in its galloping strategy.
Galloping with a Broken Leg
---------------------------
So why don't we always gallop? Because it can lose, on two counts:
1. While we're willing to endure small per-merge overheads, per-comparison
overheads are a different story. Calling Yet Another Function per
comparison is expensive, and gallop_left() and gallop_right() are
too long-winded for sane inlining.
2. Galloping can-- alas --require more comparisons than linear one-at-a-time
search, depending on the data.
#2 requires details. If A[0] belongs before B[0], galloping requires 1
compare to determine that, same as linear search, except it costs more
to call the gallop function. If A[0] belongs right before B[1], galloping
requires 2 compares, again same as linear search. On the third compare,
galloping checks A[0] against B[3], and if it's <=, requires one more
compare to determine whether A[0] belongs at B[2] or B[3]. That's a total
of 4 compares, but if A[0] does belong at B[2], linear search would have
discovered that in only 3 compares, and that's a huge loss! Really. It's
an increase of 33% in the number of compares needed, and comparisons are
expensive in Python.
 index in B where    # compares linear   # gallop   # binary   gallop
 A[0] belongs        search needs        compares   compares   total
 ----------------    -----------------   --------   --------   ------
                0                    1          1          0        1
                1                    2          2          0        2
                2                    3          3          1        4
                3                    4          3          1        4
                4                    5          4          2        6
                5                    6          4          2        6
                6                    7          4          2        6
                7                    8          4          2        6
                8                    9          5          3        8
                9                   10          5          3        8
               10                   11          5          3        8
               11                   12          5          3        8
 ...
In general, if A[0] belongs at B[i], linear search requires i+1 comparisons
to determine that, and galloping a total of 2*floor(lg(i))+2 comparisons.
The advantage of galloping is unbounded as i grows, but it doesn't win at
all until i=6. Before then, it loses twice (at i=2 and i=4), and ties
at the other values. At and after i=6, galloping always wins.
We can't guess in advance when it's going to win, though, so we do one pair
at a time until the evidence seems strong that galloping may pay. MIN_GALLOP
is 7, and that's pretty strong evidence. However, if the data is random, it
simply will trigger galloping mode purely by luck every now and again, and
it's quite likely to hit one of the losing cases next. On the other hand,
in cases like ~sort, galloping always pays, and MIN_GALLOP is larger than it
"should be" then. So the MergeState struct keeps a min_gallop variable
that merge_lo and merge_hi adjust: the longer we stay in galloping mode,
the smaller min_gallop gets, making it easier to transition back to
galloping mode (if we ever leave it in the current merge, and at the
start of the next merge). But whenever the gallop loop doesn't pay,
min_gallop is increased by one, making it harder to transition back
to galloping mode (and again both within a merge and across merges). For
random data, this all but eliminates the gallop penalty: min_gallop grows
large enough that we almost never get into galloping mode. And for cases
like ~sort, min_gallop can fall to as low as 1. This seems to work well,
but in all it's a minor improvement over using a fixed MIN_GALLOP value.
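The adaptation itself is tiny; inside merge_lo it amounts to something
like this simplified sketch (acount/bcount are the current winning-streak
lengths):

    for (;;) {
        /* ... gallop through A, then through B, setting acount/bcount ... */
        min_gallop -= min_gallop > 1;    /* paying off: lower the bar */
        ms->min_gallop = min_gallop;
        if (acount < MIN_GALLOP && bcount < MIN_GALLOP)
            break;                       /* streaks dried up: leave */
    }
    ++min_gallop;                        /* penalize leaving galloping mode */
    ms->min_gallop = min_gallop;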
Galloping Complication
----------------------
The description above was for merge_lo. merge_hi has to merge "from the
other end", and really needs to gallop starting at the last element in a run
instead of the first. Galloping from the first still works, but does more
comparisons than it should (this is significant -- I timed it both ways). For
this reason, the gallop_left() and gallop_right() (see note LEFT OR RIGHT)
functions have a "hint" argument, which is the index at which galloping
should begin. So galloping can actually start at any index, and proceed at
offsets of 1, 3, 7, 15, ... or -1, -3, -7, -15, ... from the starting index.
In the code as I type, it's always called with either 0 or n-1 (where n is
the # of elements in a run). It's tempting to try to do something fancier,
melding galloping with some form of interpolation search; for example, if
we're merging a run of length 1 with a run of length 10000, index 5000 is
probably a better guess at the final result than either 0 or 9999. But
it's unclear how to generalize that intuition usefully, and merging of
wildly unbalanced runs already enjoys excellent performance.
~sort is a good example of when balanced runs could benefit from a better
hint value: to the extent possible, this would like to use a starting
offset equal to the previous value of acount/bcount. Doing so saves about
10% of the compares in ~sort. However, doing so is also a mixed bag,
hurting other cases.
Comparing Average # of Compares on Random Arrays
------------------------------------------------
[NOTE: This was done when the new algorithm used about 0.1% more compares
on random data than does its current incarnation.]
Here list.sort() is samplesort, and list.msort() this sort:
"""
import random
from time import clock as now

def fill(n):
    from random import random
    return [random() for i in range(n)]

def mycmp(x, y):
    global ncmp
    ncmp += 1
    return cmp(x, y)

def timeit(values, method):
    global ncmp
    X = values[:]
    bound = getattr(X, method)
    ncmp = 0
    t1 = now()
    bound(mycmp)
    t2 = now()
    return t2-t1, ncmp

format = "%5s %9.2f %11d"
f2 = "%5s %9.2f %11.2f"

def drive():
    count = sst = sscmp = mst = mscmp = nelts = 0
    while True:
        n = random.randrange(100000)
        nelts += n
        x = fill(n)
        t, c = timeit(x, 'sort')
        sst += t
        sscmp += c
        t, c = timeit(x, 'msort')
        mst += t
        mscmp += c
        count += 1
        if count % 10:
            continue
        print "count", count, "nelts", nelts
        print format % ("sort",  sst,  sscmp)
        print format % ("msort", mst,  mscmp)
        print f2 % ("", (sst-mst)*1e2/mst, (sscmp-mscmp)*1e2/mscmp)

drive()
"""
I ran this on Windows and kept using the computer lightly while it was
running. time.clock() is wall-clock time on Windows, with better than
microsecond resolution. samplesort started with a 1.52% #-of-comparisons
disadvantage, fell quickly to 1.48%, and then fluctuated within that small
range. Here's the last chunk of output before I killed the job:
count 2630 nelts 130906543
 sort   6110.80  1937887573
msort   6002.78  1909389381
           1.80        1.49
We've done nearly 2 billion comparisons apiece at Python speed there, and
that's enough <wink>.
For random arrays of size 2 (yes, there are only 2 interesting ones),
samplesort has a 50%(!) comparison disadvantage. This is a consequence of
samplesort special-casing at most one ascending run at the start, then
falling back to the general case if it doesn't find an ascending run
immediately. The consequence is that it ends up using two compares to sort
[2, 1]. Gratifyingly, timsort doesn't do any special-casing, so had to be
taught how to deal with mixtures of ascending and descending runs
efficiently in all cases.
NOTES
-----
BINSORT
A "binary insertion sort" is just like a textbook insertion sort, but instead
of locating the correct position of the next item via linear (one at a time)
search, an equivalent to Python's bisect.bisect_right is used to find the
correct position in logarithmic time. Most texts don't mention this
variation, and those that do usually say it's not worth the bother: insertion
sort remains quadratic (expected and worst cases) either way. Speeding the
search doesn't reduce the quadratic data movement costs.
But in CPython's case, comparisons are extraordinarily expensive compared to
moving data, and the details matter. Moving objects is just copying
pointers. Comparisons can be arbitrarily expensive (can invoke arbitrary
user-supplied Python code), but even in simple cases (like 3 < 4) _all_
decisions are made at runtime: what's the type of the left comparand? the
type of the right? do they need to be coerced to a common type? where's the
code to compare these types? And so on. Even the simplest Python comparison
triggers a large pile of C-level pointer dereferences, conditionals, and
function calls.
So cutting the number of compares is almost always measurably helpful in
CPython, and the savings swamp the quadratic-time data movement costs for
reasonable minrun values.
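A sketch of binary insertion sort (modeled on binarysort in listobject.c;
a[lo:start] is already sorted on entry and ISLT is the comparison):

    for (; start < hi; ++start) {
        PyObject *pivot = a[start];
        Py_ssize_t l = lo, r = start;
        /* bisect_right: invariant a[lo:l] <= pivot < a[r:start],
           so equal elements keep their order (stability) */
        while (l < r) {
            Py_ssize_t m = l + ((r - l) >> 1);
            if (ISLT(pivot, a[m]))
                r = m;
            else
                l = m + 1;
        }
        /* shift a[l:start] right one slot; just pointer copies */
        memmove(&a[l + 1], &a[l], (start - l) * sizeof(PyObject *));
        a[l] = pivot;
    }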
LEFT OR RIGHT
gallop_left() and gallop_right() are akin to the Python bisect module's
bisect_left() and bisect_right(): they're the same unless the slice they're
searching contains at least one value equal to the value being searched
for. In that case, gallop_left() returns the position immediately before the
leftmost equal value, and gallop_right() the position immediately after the
rightmost equal value. The distinction is needed to preserve stability. In
general, when merging adjacent runs A and B, gallop_left is used to search
thru B for where an element from A belongs, and gallop_right to search thru A
for where an element from B belongs.
REGION OF UNCERTAINTY
Two kinds of confusion seem to be common about the claim that after finding
a k such that
B[2**(k-1) - 1] < A[0] <= B[2**k - 1]
then a binary search requires exactly k-1 tries to find A[0]'s proper
location. For concreteness, say k=3, so B[3] < A[0] <= B[7].
The first confusion takes the form "OK, then the region of uncertainty is at
indices 3, 4, 5, 6 and 7: that's 5 elements, not the claimed 2**(k-1) - 1 =
3"; or the region is viewed as a Python slice and the objection is "but that's
the slice B[3:7], so has 7-3 = 4 elements". Resolution: we've already
compared A[0] against B[3] and against B[7], so A[0]'s correct location is
already known wrt _both_ endpoints. What remains is to find A[0]'s correct
location wrt B[4], B[5] and B[6], which spans 3 elements. Or in general, the
slice (leaving off both endpoints) (2**(k-1)-1)+1 through (2**k-1)-1
inclusive = 2**(k-1) through (2**k-1)-1 inclusive, which has
(2**k-1)-1 - 2**(k-1) + 1 =
2**k-1 - 2**(k-1) =
2*2**(k-1)-1 - 2**(k-1) =
(2-1)*2**(k-1) - 1 =
2**(k-1) - 1
elements.
The second confusion: "k-1 = 2 binary searches can find the correct location
among 2**(k-1) = 4 elements, but you're only applying it to 3 elements: we
could make this more efficient by arranging for the region of uncertainty to
span 2**(k-1) elements." Resolution: that confuses "elements" with
"locations". In a slice with N elements, there are N+1 _locations_. In the
example, with the region of uncertainty B[4], B[5], B[6], there are 4
locations: before B[4], between B[4] and B[5], between B[5] and B[6], and
after B[6]. In general, across 2**(k-1)-1 elements, there are 2**(k-1)
locations. That's why k-1 binary searches are necessary and sufficient.

View file

@@ -1,135 +0,0 @@
All about co_lnotab, the line number table.
Code objects store a field named co_lnotab. This is an array of unsigned bytes
disguised as a Python bytes object. It is used to map bytecode offsets to
source code line #s for tracebacks and to identify line number boundaries for
line tracing.
The array is conceptually a compressed list of
(bytecode offset increment, line number increment)
pairs. The details are important and delicate, best illustrated by example:
    byte code offset    source code line number
         0                    1
         6                    2
        50                    7
       350                  207
       361                  208
Instead of storing these numbers literally, we compress the list by storing only
the difference from one row to the next. Conceptually, the stored list might
look like:
0, 1, 6, 1, 44, 5, 300, 200, 11, 1
The above doesn't really work, but it's a start. An unsigned byte (byte
code offset) can't hold negative values, or values larger than 255; a
signed byte (line number) can't hold values larger than 127 or less than
-128; and the above example contains two such values. (Note that before
3.6, line number was also encoded by an unsigned byte.) So we make two
tweaks:
(a) there's a deep assumption that byte code offsets increase monotonically,
and
(b) if byte code offset jumps by more than 255 from one row to the next, or if
source code line number jumps by more than 127 or less than -128 from one row
to the next, more than one pair is written to the table. In case #b,
there's no way to know from looking at the table later how many were written.
That's the delicate part. A user of co_lnotab desiring to find the source
line number corresponding to a bytecode address A should do something like
this:
    lineno = addr = 0
    for addr_incr, line_incr in co_lnotab:
        addr += addr_incr
        if addr > A:
            return lineno
        if line_incr >= 0x80:
            line_incr -= 0x100
        lineno += line_incr
(In C, this is implemented by PyCode_Addr2Line().) In order for this to work,
when the addr field increments by more than 255, the line # increment in each
pair generated must be 0 until the remaining addr increment is < 256. So, in
the example above, assemble_lnotab in compile.c should not (as was actually done
until 2.2) expand 300, 200 to
255, 255, 45, 45,
but to
255, 0, 45, 127, 0, 73.
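The consumer side in C is essentially PyCode_Addr2Line, lightly
simplified:

    int
    PyCode_Addr2Line(PyCodeObject *co, int addrq)
    {
        Py_ssize_t size = PyBytes_Size(co->co_lnotab) / 2;
        unsigned char *p = (unsigned char *)PyBytes_AsString(co->co_lnotab);
        int line = co->co_firstlineno;
        int addr = 0;
        while (--size >= 0) {
            addr += *p++;
            if (addr > addrq)
                break;
            line += (signed char)*p++;   /* line deltas are signed since 3.6 */
        }
        return line;
    }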
The above is sufficient to reconstruct line numbers for tracebacks, but not for
line tracing. Tracing is handled by PyCode_CheckLineNumber() in codeobject.c
and maybe_call_line_trace() in ceval.c.
*** Tracing ***
To a first approximation, we want to call the tracing function when the line
number of the current instruction changes. Re-computing the current line for
every instruction is a little slow, though, so each time we compute the line
number we save the bytecode indices where it's valid:
*instr_lb <= frame->f_lasti < *instr_ub
is true so long as execution does not change lines. That is, *instr_lb holds
the first bytecode index of the current line, and *instr_ub holds the first
bytecode index of the next line. As long as the above expression is true,
maybe_call_line_trace() does not need to call PyCode_CheckLineNumber(). Note
that the same line may appear multiple times in the lnotab, either because the
bytecode jumped more than 255 indices between line number changes or because
the compiler inserted the same line twice. Even in that case, *instr_ub holds
the first index of the next line.
However, we don't *always* want to call the line trace function when the above
test fails.
Consider this code:
    1: def f(a):
    2:     while a:
    3:         print(1)
    4:         break
    5:     else:
    6:         print(2)
which compiles to this:
  2           0 SETUP_LOOP              26 (to 28)
        >>    2 LOAD_FAST                0 (a)
              4 POP_JUMP_IF_FALSE       18

  3           6 LOAD_GLOBAL              0 (print)
              8 LOAD_CONST               1 (1)
             10 CALL_FUNCTION            1
             12 POP_TOP

  4          14 BREAK_LOOP
             16 JUMP_ABSOLUTE            2
        >>   18 POP_BLOCK

  6          20 LOAD_GLOBAL              0 (print)
             22 LOAD_CONST               2 (2)
             24 CALL_FUNCTION            1
             26 POP_TOP
        >>   28 LOAD_CONST               0 (None)
             30 RETURN_VALUE
If 'a' is false, execution will jump to the POP_BLOCK instruction at offset 18
and the co_lnotab will claim that execution has moved to line 4, which is wrong.
In this case, we could instead associate the POP_BLOCK with line 5, but that
would break jumps around loops without else clauses.
We fix this by only calling the line trace function for a forward jump if the
co_lnotab indicates we have jumped to the *start* of a line, i.e. if the current
instruction offset matches the offset given for the start of a line by the
co_lnotab. For backward jumps, however, we always call the line trace function,
which lets a debugger stop on every evaluation of a loop guard (which usually
won't be the first opcode in a line).
Why do we set f_lineno when tracing, and only just before calling the trace
function? Well, consider the code above when 'a' is true. If stepping through
this with 'n' in pdb, you would stop at line 1 with a "call" type event, then
line events on lines 2, 3, and 4, then a "return" type event -- but because the
code for the return actually falls in the range of the "line 6" opcodes, you
would be shown line 6 during this event. This is a change from the behaviour in
2.2 and before, and I've found it confusing in practice. By setting and using
f_lineno when tracing, one can report a line number different from that
suggested by f_lasti on this one occasion where it's desirable.

View file

@@ -1,13 +1,11 @@
/* clang-format off */
/* Long (arbitrary precision) integer object implementation */
/* XXX The functional organization of this file is terrible */
#include "Python.h"
#include "longintrepr.h"
#include <float.h>
#include <ctype.h>
#include <stddef.h>
#include "third_party/python/Include/Python.h"
#include "libc/log/check.h"
#include "third_party/python/Include/longintrepr.h"
#ifndef NSMALLPOSINTS
#define NSMALLPOSINTS 257
@ -1583,7 +1581,7 @@ long_to_decimal_string_internal(PyObject *aa,
digit *pout, *pin, rem, tenpow;
int negative;
int d;
enum PyUnicode_Kind kind;
enum PyUnicode_Kind kind = -1;
a = (PyLongObject *)aa;
if (a == NULL || !PyLong_Check(a)) {
@ -1675,6 +1673,8 @@ long_to_decimal_string_internal(PyObject *aa,
kind = PyUnicode_KIND(str);
}
CHECK_NE(-1, kind); /* if this fails there's a serious bug upstream */
#define WRITE_DIGITS(p) \
do { \
/* pout[0] through pout[size-2] contribute exactly \
@ -1773,7 +1773,7 @@ long_format_binary(PyObject *aa, int base, int alternate,
PyObject *v = NULL;
Py_ssize_t sz;
Py_ssize_t size_a;
enum PyUnicode_Kind kind;
enum PyUnicode_Kind kind = -1;
int negative;
int bits;
@ -1840,6 +1840,8 @@ long_format_binary(PyObject *aa, int base, int alternate,
kind = PyUnicode_KIND(v);
}
CHECK_NE(-1, kind); /* if this fails there's a serious bug upstream */
#define WRITE_DIGITS(p) \
do { \
if (size_a == 0) { \

View file

@ -1,8 +1,8 @@
/* clang-format off */
/* Memoryview object implementation */
#include "Python.h"
#include "pystrhex.h"
#include <stddef.h>
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/pystrhex.h"
/****************************************************************************/

View file

@ -1,8 +1,9 @@
/* clang-format off */
/* Method object implementation */
#include "Python.h"
#include "structmember.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
/* Free list for method objects to save malloc/free overhead
* The m_self element is used to chain the objects.

View file

@ -1,8 +1,6 @@
/* Module object implementation */
#include "Python.h"
#include "structmember.h"
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
static Py_ssize_t max_module_number;

View file

@ -1,8 +1,6 @@
// namespace object implementation
#include "Python.h"
#include "structmember.h"
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
typedef struct {
PyObject_HEAD

View file

@ -1,12 +1,6 @@
/* Generic object operations; and implementation of None */
#include "Python.h"
#include "frameobject.h"
#ifdef __cplusplus
extern "C" {
#endif
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/frameobject.h"
_Py_IDENTIFIER(Py_Repr);
_Py_IDENTIFIER(__bytes__);
@ -2074,7 +2068,3 @@ _Py_Dealloc(PyObject *op)
(*Py_TYPE(op)->tp_dealloc)(op);
}
#endif
#ifdef __cplusplus
}
#endif

View file

@ -1,6 +1,8 @@
#include "Python.h"
#include <stdbool.h>
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/prot.h"
#include "libc/sysv/consts/prot.h"
/* Defined in tracemalloc.c */
@ -61,14 +63,7 @@ static void _PyMem_DebugCheckAddress(char api_id, const void *p);
#ifdef WITH_PYMALLOC
#ifdef MS_WINDOWS
# include <windows.h>
#elif defined(HAVE_MMAP)
# include <sys/mman.h>
# ifdef MAP_ANONYMOUS
# define ARENAS_USE_MMAP
# endif
#endif
#define ARENAS_USE_MMAP
/* Forward declaration */
static void* _PyObject_Malloc(void *ctx, size_t size);

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* Ordered Dictionary object implementation.
This implementation is necessarily explicitly equivalent to the pure Python
@ -464,11 +465,9 @@ later:
*/
#include "Python.h"
#include "structmember.h"
#include "dict-common.h"
#include <stddef.h>
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
#include "third_party/python/Objects/dict-common.h"
typedef struct _odictnode _ODictNode;

View file

@ -1,7 +1,6 @@
/* Range object implementation */
#include "Python.h"
#include "structmember.h"
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
/* Support objects whose length is > PY_SSIZE_T_MAX.

View file

@ -1,3 +1,6 @@
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
/* set object implementation
@ -24,9 +27,6 @@
NULL if the rich comparison returns an error.
*/
#include "Python.h"
#include "structmember.h"
/* Object used as dummy key to fill deleted entries */
static PyObject _dummy_struct;

View file

@ -1,3 +1,7 @@
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
/*
Written by Jim Hugunin and Chris Chase.
@ -13,9 +17,6 @@ the Py_NoneStruct in that there is no way to create other objects of
this type and there is exactly one in existence.
*/
#include "Python.h"
#include "structmember.h"
static PyObject *
ellipsis_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{

View file

@ -1,3 +1,5 @@
/* clang-format off */
/* clang-format off */
/* this is sort of a hack. there's at least one place (formatting
floats) where some stringlib code takes a different path if it's
compiled as unicode. */

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* stringlib: codec implementations */
#if !STRINGLIB_IS_UNICODE

View file

@ -1,7 +1,8 @@
/* clang-format off */
/* stringlib: count implementation */
#ifndef STRINGLIB_FASTSEARCH_H
#error must include "stringlib/fastsearch.h" before including this module
#error must include fastsearch.inc before including this module
#endif
Py_LOCAL_INLINE(Py_ssize_t)

View file

@ -1,8 +1,9 @@
/* clang-format off */
#if STRINGLIB_IS_UNICODE
# error "ctype.h only compatible with byte-wise strings"
#endif
#include "bytes_methods.h"
#include "third_party/python/Include/bytes_methods.h"
static PyObject*
stringlib_isspace(PyObject *self)

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* Fast unicode equal function optimized for dictobject.c and setobject.c */
/* Return 1 if two unicode objects are equal, 0 if not.

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* stringlib: fastsearch implementation */
#define STRINGLIB_FASTSEARCH_H

View file

@ -1,7 +1,8 @@
/* clang-format off */
/* stringlib: find/index implementation */
#ifndef STRINGLIB_FASTSEARCH_H
#error must include "stringlib/fastsearch.h" before including this module
#error must include fastsearch.inc before including this module
#endif
Py_LOCAL_INLINE(Py_ssize_t)

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* Finding the optimal width of unicode characters in a buffer */
#if !STRINGLIB_IS_UNICODE

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* stringlib: bytes joining implementation */
#if STRINGLIB_IS_UNICODE

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* _PyUnicode_InsertThousandsGrouping() helper functions */
typedef struct {

View file

@ -1,7 +1,8 @@
/* clang-format off */
/* stringlib: partition implementation */
#ifndef STRINGLIB_FASTSEARCH_H
#error must include "stringlib/fastsearch.h" before including this module
#error must include fastsearch.inc before including this module
#endif
Py_LOCAL_INLINE(PyObject*)

View file

@ -1,7 +1,8 @@
/* clang-format off */
/* stringlib: replace implementation */
#ifndef STRINGLIB_FASTSEARCH_H
#error must include "stringlib/fastsearch.h" before including this module
#error must include fastsearch.inc before including this module
#endif
Py_LOCAL_INLINE(void)

View file

@ -1,7 +1,8 @@
/* clang-format off */
/* stringlib: split implementation */
#ifndef STRINGLIB_FASTSEARCH_H
#error must include "stringlib/fastsearch.h" before including this module
#error must include fastsearch.inc before including this module
#endif
/* Overallocate the initial list to reduce the number of reallocs for small

View file

@ -1,3 +1,4 @@
/* clang-format off */
#ifndef STRINGLIB_STRINGDEFS_H
#define STRINGLIB_STRINGDEFS_H

View file

@ -1,3 +1,4 @@
/* clang-format off */
#if STRINGLIB_IS_UNICODE
# error "transmogrify.h only compatible with byte-wise strings"
#endif

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* this is sort of a hack. there's at least one place (formatting
floats) where some stringlib code takes a different path if it's
compiled as unicode. */

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* this is sort of a hack. there's at least one place (formatting
floats) where some stringlib code takes a different path if it's
compiled as unicode. */

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* this is sort of a hack. there's at least one place (formatting
floats) where some stringlib code takes a different path if it's
compiled as unicode. */

View file

@ -1,3 +1,4 @@
/* clang-format off */
#undef FASTSEARCH
#undef STRINGLIB
#undef STRINGLIB_SIZEOF_CHAR

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
unicode_format.h -- implementation of str.format().
*/

View file

@ -1,3 +1,4 @@
/* clang-format off */
#ifndef STRINGLIB_UNICODEDEFS_H
#define STRINGLIB_UNICODEDEFS_H

View file

@ -1,9 +1,10 @@
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
/* Implementation helper: a struct that looks like a tuple. See timemodule
and posixmodule for example uses. */
#include "Python.h"
#include "structmember.h"
static const char visible_length_key[] = "n_sequence_fields";
static const char real_length_key[] = "n_fields";
static const char unnamed_fields_key[] = "n_unnamed_fields";

View file

@ -1,9 +1,9 @@
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/accu.h"
/* Tuple object implementation */
#include "Python.h"
#include "accu.h"
/* Speed optimization to avoid frequent malloc/free of small tuples */
#ifndef PyTuple_MAXSAVESIZE
#define PyTuple_MAXSAVESIZE 20 /* Largest tuple to save on free list */

View file

@ -1,12 +1,10 @@
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/frameobject.h"
#include "third_party/python/Include/structmember.h"
/* Type object implementation */
#include "Python.h"
#include "frameobject.h"
#include "structmember.h"
#include <ctype.h>
/* Support type attribute cache */
/* The cache can keep references to the names alive for longer than

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* Generated by typeslots.py */
0,
0,

View file

@ -1,3 +1,6 @@
/* clang-format off */
#include "third_party/python/Include/Python.h"
/*
Unicode character type helpers.
@ -8,8 +11,6 @@
*/
#include "Python.h"
#define ALPHA_MASK 0x01
#define DECIMAL_MASK 0x02
#define DIGIT_MASK 0x04
@ -40,7 +41,7 @@ typedef struct {
const unsigned short flags;
} _PyUnicode_TypeRecord;
#include "unicodetype_db.h"
#include "third_party/python/Objects/unicodetype_db.inc"
static const _PyUnicode_TypeRecord *
gettyperecord(Py_UCS4 code)

View file

@ -1,3 +1,10 @@
/* clang-format off */
#define PY_SSIZE_T_CLEAN
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/ucnhash.h"
#include "third_party/python/Include/bytes_methods.h"
#include "third_party/python/Objects/stringlib/eq.inc"
/*
Unicode implementation based on original code by Fredrik Lundh,
@ -38,16 +45,6 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#define PY_SSIZE_T_CLEAN
#include "Python.h"
#include "ucnhash.h"
#include "bytes_methods.h"
#include "stringlib/eq.h"
#ifdef MS_WINDOWS
#include <windows.h>
#endif
/*[clinic input]
class str "PyUnicodeObject *" "&PyUnicode_Type"
[clinic start generated code]*/
@ -332,7 +329,7 @@ static const unsigned char ascii_linebreak[] = {
0, 0, 0, 0, 0, 0, 0, 0
};
#include "clinic/unicodeobject.c.h"
#include "third_party/python/Objects/clinic/unicodeobject.inc"
typedef enum {
_Py_ERROR_UNKNOWN=0,
@ -805,50 +802,50 @@ ensure_unicode(PyObject *obj)
/* Compilation of templated routines */
#include "stringlib/asciilib.h"
#include "stringlib/fastsearch.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/asciilib.inc"
#include "third_party/python/Objects/stringlib/fastsearch.inc"
#include "third_party/python/Objects/stringlib/partition.inc"
#include "third_party/python/Objects/stringlib/split.inc"
#include "third_party/python/Objects/stringlib/count.inc"
#include "third_party/python/Objects/stringlib/find.inc"
#include "third_party/python/Objects/stringlib/find_max_char.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
#include "stringlib/ucs1lib.h"
#include "stringlib/fastsearch.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/ucs1lib.inc"
#include "third_party/python/Objects/stringlib/fastsearch.inc"
#include "third_party/python/Objects/stringlib/partition.inc"
#include "third_party/python/Objects/stringlib/split.inc"
#include "third_party/python/Objects/stringlib/count.inc"
#include "third_party/python/Objects/stringlib/find.inc"
#include "third_party/python/Objects/stringlib/replace.inc"
#include "third_party/python/Objects/stringlib/find_max_char.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
#include "stringlib/ucs2lib.h"
#include "stringlib/fastsearch.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/ucs2lib.inc"
#include "third_party/python/Objects/stringlib/fastsearch.inc"
#include "third_party/python/Objects/stringlib/partition.inc"
#include "third_party/python/Objects/stringlib/split.inc"
#include "third_party/python/Objects/stringlib/count.inc"
#include "third_party/python/Objects/stringlib/find.inc"
#include "third_party/python/Objects/stringlib/replace.inc"
#include "third_party/python/Objects/stringlib/find_max_char.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
#include "stringlib/ucs4lib.h"
#include "stringlib/fastsearch.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/ucs4lib.inc"
#include "third_party/python/Objects/stringlib/fastsearch.inc"
#include "third_party/python/Objects/stringlib/partition.inc"
#include "third_party/python/Objects/stringlib/split.inc"
#include "third_party/python/Objects/stringlib/count.inc"
#include "third_party/python/Objects/stringlib/find.inc"
#include "third_party/python/Objects/stringlib/replace.inc"
#include "third_party/python/Objects/stringlib/find_max_char.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
#include "stringlib/unicodedefs.h"
#include "stringlib/fastsearch.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/unicodedefs.inc"
#include "third_party/python/Objects/stringlib/fastsearch.inc"
#include "third_party/python/Objects/stringlib/count.inc"
#include "third_party/python/Objects/stringlib/find.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
/* --- Unicode Object ----------------------------------------------------- */
@ -4971,21 +4968,21 @@ PyUnicode_DecodeUTF8(const char *s,
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
#include "stringlib/asciilib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/asciilib.inc"
#include "third_party/python/Objects/stringlib/codecs.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
#include "stringlib/ucs1lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/ucs1lib.inc"
#include "third_party/python/Objects/stringlib/codecs.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
#include "stringlib/ucs2lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/ucs2lib.inc"
#include "third_party/python/Objects/stringlib/codecs.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
#include "stringlib/ucs4lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/ucs4lib.inc"
#include "third_party/python/Objects/stringlib/codecs.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
/* Mask to quickly check whether a C 'long' contains a
non-ASCII, UTF8-encoded char. */
@ -9436,7 +9433,7 @@ any_find_slice(PyObject* s1, PyObject* s2,
}
/* _PyUnicode_InsertThousandsGrouping() helper functions */
#include "stringlib/localeutil.h"
#include "third_party/python/Objects/stringlib/localeutil.inc"
/**
* InsertThousandsGrouping:
@ -13891,7 +13888,7 @@ _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Py_CLEAR(writer->buffer);
}
#include "stringlib/unicode_format.h"
#include "third_party/python/Objects/stringlib/unicode_format.inc"
PyDoc_STRVAR(format__doc__,
"S.format(*args, **kwargs) -> str\n\

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* this file was generated by Tools/unicode/makeunicodedata.py 3.2 */
/* a list of unique character type descriptors */

View file

@ -1,6 +1,6 @@
#include "Python.h"
#include "structmember.h"
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
#define GET_WEAKREFS_LISTPTR(o) \
((PyWeakReference **) PyObject_GET_WEAKREFS_LISTPTR(o))