Perform build and magnum tuning

Building o//third_party/python now takes 5 seconds on my PC

This change works toward having Python use runtime dispatch where
appropriate. For example, when loading the magnums in the socket
module, it's a good idea to check whether a magnum is zero, because
zero means the local system platform doesn't support it.
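As a sketch of that pattern (illustrative only: the constant and the
module-init call here are examples, not necessarily what this commit
touches):

    /* Only expose a socket magnum when the running platform defines it.
       In Cosmopolitan, magnums like SO_REUSEPORT are runtime constants
       that hold zero on platforms lacking the feature. */
    if (SO_REUSEPORT) {
        PyModule_AddIntConstant(m, "SO_REUSEPORT", SO_REUSEPORT);
    }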
Justine Tunney 2021-08-10 10:26:13 -07:00
parent ee7e296339
commit d26d7ae0e4
1028 changed files with 6576 additions and 172777 deletions

View file

@@ -1,11 +1,9 @@
/* clang-format off */
/* Abstract Object Interface (many thanks to Jim Fulton) */
#include "Python.h"
#include <ctype.h>
#include "structmember.h" /* we need the offsetof() macro from there */
#include "longintrepr.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
#include "third_party/python/Include/longintrepr.h"
/* Shorthands to return certain errors */

View file

@@ -1,7 +1,8 @@
/* clang-format off */
/* Accumulator struct implementation */
#include "Python.h"
#include "accu.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/accu.h"
static PyObject *
join_list_unicode(PyObject *lst)

View file

@@ -1,7 +1,8 @@
/* clang-format off */
/* Boolean type, a subtype of int */
#include "Python.h"
#include "longintrepr.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/longintrepr.h"
/* We define bool_repr to return "False" or "True" */

View file

@@ -1,11 +1,12 @@
/* clang-format off */
/* PyByteArray (bytearray) implementation */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
#include "structmember.h"
#include "bytes_methods.h"
#include "bytesobject.h"
#include "pystrhex.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
#include "third_party/python/Include/bytes_methods.h"
#include "third_party/python/Include/bytesobject.h"
#include "third_party/python/Include/pystrhex.h"
/*[clinic input]
class bytearray "PyByteArrayObject *" "&PyByteArray_Type"
@@ -90,7 +91,7 @@ _canresize(PyByteArrayObject *self)
return 1;
}
#include "clinic/bytearrayobject.c.h"
#include "third_party/python/Objects/clinic/bytearrayobject.inc"
/* Direct API functions */
@@ -1106,14 +1107,14 @@ bytearray_dealloc(PyByteArrayObject *self)
#define STRINGLIB_CHECK_EXACT PyByteArray_CheckExact
#define STRINGLIB_MUTABLE 1
#include "stringlib/fastsearch.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/join.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/ctype.h"
#include "stringlib/transmogrify.h"
#include "third_party/python/Objects/stringlib/fastsearch.inc"
#include "third_party/python/Objects/stringlib/count.inc"
#include "third_party/python/Objects/stringlib/find.inc"
#include "third_party/python/Objects/stringlib/join.inc"
#include "third_party/python/Objects/stringlib/partition.inc"
#include "third_party/python/Objects/stringlib/split.inc"
#include "third_party/python/Objects/stringlib/ctype.inc"
#include "third_party/python/Objects/stringlib/transmogrify.inc"
static PyObject *

View file

@@ -1,6 +1,7 @@
/* clang-format off */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
#include "bytes_methods.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/bytes_methods.h"
PyDoc_STRVAR_shared(_Py_isspace__doc__,
"B.isspace() -> bool\n\
@@ -394,9 +395,9 @@ _Py_bytes_maketrans(Py_buffer *frm, Py_buffer *to)
#define STRINGLIB_CHAR char
#define STRINGLIB_SIZEOF_CHAR 1
#include "stringlib/fastsearch.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "third_party/python/Objects/stringlib/fastsearch.inc"
#include "third_party/python/Objects/stringlib/count.inc"
#include "third_party/python/Objects/stringlib/find.inc"
/*
Wraps stringlib_parse_args_finds() and additionally checks whether the

View file

@@ -1,19 +1,18 @@
/* clang-format off */
/* bytes object implementation */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
#include "bytes_methods.h"
#include "pystrhex.h"
#include <stddef.h>
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/bytes_methods.h"
#include "third_party/python/Include/pystrhex.h"
/*[clinic input]
class bytes "PyBytesObject *" "&PyBytes_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=7a238f965d64892b]*/
#include "clinic/bytesobject.c.h"
#include "third_party/python/Objects/clinic/bytesobject.inc"
#ifdef COUNT_ALLOCS
Py_ssize_t null_strings, one_strings;
@@ -1318,17 +1317,17 @@ PyBytes_AsStringAndSize(PyObject *obj,
/* -------------------------------------------------------------------- */
/* Methods */
#include "stringlib/stringdefs.h"
#include "third_party/python/Objects/stringlib/stringdefs.inc"
#include "stringlib/fastsearch.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/join.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/ctype.h"
#include "third_party/python/Objects/stringlib/fastsearch.inc"
#include "third_party/python/Objects/stringlib/count.inc"
#include "third_party/python/Objects/stringlib/find.inc"
#include "third_party/python/Objects/stringlib/join.inc"
#include "third_party/python/Objects/stringlib/partition.inc"
#include "third_party/python/Objects/stringlib/split.inc"
#include "third_party/python/Objects/stringlib/ctype.inc"
#include "stringlib/transmogrify.h"
#include "third_party/python/Objects/stringlib/transmogrify.inc"
PyObject *
PyBytes_Repr(PyObject *obj, int smartquotes)

View file

@@ -1,6 +1,7 @@
/* clang-format off */
/* Wrap void * pointers to be passed between C modules */
#include "Python.h"
#include "third_party/python/Include/Python.h"
/* Internal structure of PyCapsule */
typedef struct {

View file

@@ -1,6 +1,7 @@
/* clang-format off */
/* Cell object implementation */
#include "Python.h"
#include "third_party/python/Include/Python.h"
PyObject *
PyCell_New(PyObject *obj)

View file

@@ -1,7 +1,8 @@
/* clang-format off */
/* Class object implementation (dead now except for methods) */
#include "Python.h"
#include "structmember.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
#define TP_DESCR_GET(t) ((t)->tp_descr_get)

View file

@@ -1,8 +1,7 @@
#include <stdbool.h>
#include "Python.h"
#include "code.h"
#include "structmember.h"
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/code.h"
#include "third_party/python/Include/structmember.h"
#define NAME_CHARS \
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"

View file

@@ -1,13 +1,11 @@
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
/* Complex object implementation */
/* Borrows heavily from floatobject.c */
/* Submitted by Jim Hugunin */
#include "Python.h"
#include "structmember.h"
/* elementary operations on complex numbers */
static Py_complex c_1 = {1., 0.};

View file

@@ -1,7 +1,8 @@
/* Descriptors -- a new, flexible way to describe attributes */
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
#include "Python.h"
#include "structmember.h" /* Why is this not included in Python.h? */
/* Descriptors -- a new, flexible way to describe attributes */
static void
descr_dealloc(PyDescrObject *descr)

View file

@@ -1,5 +1,6 @@
#ifndef Py_DICT_COMMON_H
#define Py_DICT_COMMON_H
/* clang-format off */
typedef struct {
/* Cached hash code of me_key. */

View file

@@ -1,149 +0,0 @@
NOTES ON OPTIMIZING DICTIONARIES
================================
Principal Use Cases for Dictionaries
------------------------------------
Passing keyword arguments
Typically, one read and one write for 1 to 3 elements.
Occurs frequently in normal python code.
Class method lookup
Dictionaries vary in size with 8 to 16 elements being common.
Usually written once with many lookups.
When base classes are used, there are many failed lookups
followed by a lookup in a base class.
Instance attribute lookup and Global variables
Dictionaries vary in size. 4 to 10 elements are common.
Both reads and writes are common.
Builtins
Frequent reads. Almost never written.
About 150 interned strings (as of Py3.3).
A few keys are accessed much more frequently than others.
Uniquification
Dictionaries of any size. Bulk of work is in creation.
Repeated writes to a smaller set of keys.
Single read of each key.
Some use cases have two consecutive accesses to the same key.
* Removing duplicates from a sequence.
      dict.fromkeys(seqn).keys()
* Counting elements in a sequence.
      for e in seqn:
          d[e] = d.get(e, 0) + 1
* Accumulating references in a dictionary of lists:
      for pagenumber, page in enumerate(pages):
          for word in page:
              d.setdefault(word, []).append(pagenumber)
Note, the second example is a use case characterized by a get and set
to the same key. There are similar use cases with a __contains__
followed by a get, set, or del to the same key. Part of the
justification for d.setdefault is combining the two lookups into one.
Membership Testing
Dictionaries of any size. Created once and then rarely changes.
Single write to each key.
Many calls to __contains__() or has_key().
Similar access patterns occur with replacement dictionaries
such as with the % formatting operator.
Dynamic Mappings
Characterized by deletions interspersed with adds and replacements.
Performance benefits greatly from the re-use of dummy entries.
Data Layout
-----------
Dictionaries are composed of 3 components:
The dictobject struct itself
A dict-keys object (keys & hashes)
A values array
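Roughly, in C (a sketch abridged from the layout this note describes,
circa CPython 3.4's dict-common.h; several fields are omitted):

    typedef struct {
        Py_hash_t me_hash;           /* cached hash code of me_key */
        PyObject *me_key;
        PyObject *me_value;          /* only meaningful in combined tables */
    } PyDictKeyEntry;

    typedef struct {
        Py_ssize_t dk_refcnt;        /* key-tables can be shared by dicts */
        Py_ssize_t dk_size;          /* size of the hash table */
        Py_ssize_t dk_usable;        /* usable slots before a resize */
        PyDictKeyEntry dk_entries[1];
    } PyDictKeysObject;

    typedef struct {
        PyObject_HEAD
        Py_ssize_t ma_used;          /* number of items */
        PyDictKeysObject *ma_keys;   /* keys & hashes */
        PyObject **ma_values;        /* values array; split tables only */
    } PyDictObject;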
Tunable Dictionary Parameters
-----------------------------
See comments for PyDict_MINSIZE_SPLIT, PyDict_MINSIZE_COMBINED,
USABLE_FRACTION and GROWTH_RATE in dictobject.c
Tune-ups should be measured across a broad range of applications and
use cases. A change to any parameter will help in some situations and
hurt in others. The key is to find settings that help the most common
cases and do the least damage to the less common cases. Results will
vary dramatically depending on the exact number of keys, whether the
keys are all strings, whether reads or writes dominate, the exact
hash values of the keys (some sets of values have fewer collisions than
others). Any one test or benchmark is likely to prove misleading.
While making a dictionary more sparse reduces collisions, it impairs
iteration and key listing. Those methods loop over every potential
entry. Doubling the size of a dictionary results in twice as many
non-overlapping memory accesses for keys(), items(), values(),
__iter__(), iterkeys(), iteritems(), itervalues(), and update().
Also, every dictionary iterates at least twice, once for the memset()
when it is created and once by dealloc().
Dictionary operations involving only a single key can be O(1) unless
resizing is possible. By checking for a resize only when the
dictionary can grow (and may *require* resizing), other operations
remain O(1), and the odds of resize thrashing or memory fragmentation
are reduced. In particular, an algorithm that empties a dictionary
by repeatedly invoking .pop will see no resizing, which might
not be necessary at all because the dictionary is eventually
discarded entirely.
The key differences between this implementation and earlier versions are:
1. The table can be split into two parts, the keys and the values.
2. There is an additional key-value combination: (key, NULL).
Unlike (<dummy>, NULL) which represents a deleted value, (key, NULL)
represents a yet-to-be-inserted value. This combination can only occur
when the table is split.
3. No small table embedded in the dict,
as this would make sharing of key-tables impossible.
These changes have the following consequences.
1. General dictionaries are slightly larger.
2. All object dictionaries of a single class can share a single key-table,
saving about 60% memory for such cases.
Results of Cache Locality Experiments
--------------------------------------
Experiments on an earlier design of dictionary, in which all tables were
combined, showed the following:
When an entry is retrieved from memory, several adjacent entries are also
retrieved into a cache line. Since accessing items in cache is *much*
cheaper than a cache miss, an enticing idea is to probe the adjacent
entries as a first step in collision resolution. Unfortunately, the
introduction of any regularity into collision searches results in more
collisions than the current random chaining approach.
Exploiting cache locality at the expense of additional collisions fails
to pay off when the entries are already loaded in cache (the expense
is paid with no compensating benefit). This occurs in small dictionaries
where the whole dictionary fits into a pair of cache lines. It also
occurs frequently in large dictionaries which have a common access pattern
where some keys are accessed much more frequently than others. The
more popular entries *and* their collision chains tend to remain in cache.
To exploit cache locality, change the collision resolution section
in lookdict() and lookdict_string(). Set i^=1 at the top of the
loop and move the i = (i << 2) + i + perturb + 1 to an unrolled
version of the loop.
For split tables, the above will apply to the keys, but the value will
always be in a different cache line from the key.
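For concreteness, the proposed probe loop would look something like this
sketch (the real lookdict also handles dummy entries, free slots, and
comparison errors):

    size_t perturb = (size_t)hash;
    size_t i = (size_t)hash & mask;
    for (;;) {
        /* ... check entry i for a hit or a free slot ... */
        i ^= 1;                      /* partner slot, same cache line */
        /* ... check entry i again ... */
        perturb >>= PERTURB_SHIFT;
        i = ((i << 2) + i + perturb + 1) & mask;  /* i = i*5 + perturb + 1 */
    }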

View file

@@ -1,3 +1,4 @@
/* clang-format off */
/* Dictionary object implementation using a hash table */
/* The distribution includes a separate file, Objects/dictnotes.txt,
@@ -109,9 +110,9 @@ converting the dict to the combined table.
*/
#define PyDict_MINSIZE 8
#include "Python.h"
#include "dict-common.h"
#include "stringlib/eq.h" /* to get unicode_eq() */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Objects/dict-common.h"
#include "third_party/python/Objects/stringlib/eq.inc" /* to get unicode_eq() */
/*[clinic input]
class dict "PyDictObject *" "&PyDict_Type"
@@ -255,7 +256,7 @@ static int numfree = 0;
static PyDictKeysObject *keys_free_list[PyDict_MAXFREELIST];
static int numfreekeys = 0;
#include "clinic/dictobject.c.h"
#include "third_party/python/Objects/clinic/dictobject.inc"
int
PyDict_ClearFreeList(void)

View file

@@ -1,6 +1,7 @@
/* clang-format off */
/* enumerate object */
#include "Python.h"
#include "third_party/python/Include/Python.h"
typedef struct {
PyObject_HEAD

View file

@@ -1,3 +1,4 @@
/* clang-format off */
/*
* New exceptions.c written in Iceland by Richard Jones and Georg Brandl.
*
@@ -5,9 +6,9 @@
*/
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include "structmember.h"
#include "osdefs.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
#include "third_party/python/Include/osdefs.h"
/* Compatibility aliases */

View file

@@ -1,7 +1,8 @@
/* clang-format off */
/* File object implementation (what's left of it -- see io.py) */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
#include "third_party/python/Include/Python.h"
#include "libc/stdio/unlocked.h"
#if defined(HAVE_GETC_UNLOCKED) && !defined(_Py_MEMORY_SANITIZER)
@@ -21,10 +22,6 @@
#define NEWLINE_LF 2 /* \n newline seen */
#define NEWLINE_CRLF 4 /* \r\n newline seen */
#ifdef __cplusplus
extern "C" {
#endif
/* External C interface */
PyObject *
@@ -525,8 +522,3 @@ PyTypeObject PyStdPrinter_Type = {
stdprinter_new, /* tp_new */
PyObject_Del, /* tp_free */
};
#ifdef __cplusplus
}
#endif

View file

@@ -1,15 +1,12 @@
/* clang-format off */
/* Float object implementation */
/* XXX There should be overflow checks here, but it's hard to check
for any kind of float exception without losing portability. */
#include "Python.h"
#include "third_party/python/Include/Python.h"
#include "libc/runtime/fenv.h"
#include <ctype.h>
#include <float.h>
/* Special free list
free_list is a singly-linked list of available PyFloatObjects, linked
via abuse of their ob_type members.

View file

@@ -1,11 +1,11 @@
/* clang-format off */
/* Frame object implementation */
#include "Python.h"
#include "code.h"
#include "frameobject.h"
#include "opcode.h"
#include "structmember.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/code.h"
#include "third_party/python/Include/frameobject.h"
#include "third_party/python/Include/opcode.h"
#include "third_party/python/Include/structmember.h"
#define OFF(x) offsetof(PyFrameObject, x)

View file

@@ -1,9 +1,10 @@
/* clang-format off */
/* Function object implementation */
#include "Python.h"
#include "code.h"
#include "structmember.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/code.h"
#include "third_party/python/Include/structmember.h"
PyObject *
PyFunction_NewWithQualName(PyObject *code, PyObject *globals, PyObject *qualname)

View file

@@ -1,9 +1,10 @@
/* clang-format off */
/* Generator object implementation */
#include "Python.h"
#include "frameobject.h"
#include "structmember.h"
#include "opcode.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/frameobject.h"
#include "third_party/python/Include/structmember.h"
#include "third_party/python/Include/opcode.h"
static PyObject *gen_close(PyGenObject *, PyObject *);
static PyObject *async_gen_asend_new(PyAsyncGenObject *, PyObject *);

View file

@@ -1,6 +1,7 @@
/* clang-format off */
/* Iterator objects */
#include "Python.h"
#include "third_party/python/Include/Python.h"
typedef struct {
PyObject_HEAD

View file

@@ -1,13 +1,8 @@
/* clang-format off */
/* List object implementation */
#include "Python.h"
#include "accu.h"
#ifdef STDC_HEADERS
#include <stddef.h>
#else
#include <sys/types.h> /* For size_t */
#endif
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/accu.h"
/* Ensure ob_item has room for at least newsize elements, and set
* ob_size to newsize. If newsize > ob_size on entry, the content

View file

@@ -1,755 +0,0 @@
Intro
-----
This describes an adaptive, stable, natural mergesort, modestly called
timsort (hey, I earned it <wink>). It has supernatural performance on many
kinds of partially ordered arrays (less than lg(N!) comparisons needed, and
as few as N-1), yet as fast as Python's previous highly tuned samplesort
hybrid on random arrays.
In a nutshell, the main routine marches over the array once, left to right,
alternately identifying the next run, then merging it into the previous
runs "intelligently". Everything else is complication for speed, and some
hard-won measure of memory efficiency.
Comparison with Python's Samplesort Hybrid
------------------------------------------
+ timsort can require a temp array containing as many as N//2 pointers,
which means as many as 2*N extra bytes on 32-bit boxes. It can be
expected to require a temp array this large when sorting random data; on
data with significant structure, it may get away without using any extra
heap memory. This appears to be the strongest argument against it, but
compared to the size of an object, 2 temp bytes worst-case (also expected-
case for random data) doesn't scare me much.
It turns out that Perl is moving to a stable mergesort, and the code for
that appears always to require a temp array with room for at least N
pointers. (Note that I wouldn't want to do that even if space weren't an
issue; I believe its efforts at memory frugality also save timsort
significant pointer-copying costs, and allow it to have a smaller working
set.)
+ Across about four hours of generating random arrays, and sorting them
under both methods, samplesort required about 1.5% more comparisons
(the program is at the end of this file).
+ In real life, this may be faster or slower on random arrays than
samplesort was, depending on platform quirks. Since it does fewer
comparisons on average, it can be expected to do better the more
expensive a comparison function is. OTOH, it does more data movement
(pointer copying) than samplesort, and that may negate its small
comparison advantage (depending on platform quirks) unless comparison
is very expensive.
+ On arrays with many kinds of pre-existing order, this blows samplesort out
of the water. It's significantly faster than samplesort even on some
cases samplesort was special-casing the snot out of. I believe that lists
very often do have exploitable partial order in real life, and this is the
strongest argument in favor of timsort (indeed, samplesort's special cases
for extreme partial order are appreciated by real users, and timsort goes
much deeper than those, in particular naturally covering every case where
someone has suggested "and it would be cool if list.sort() had a special
case for this too ... and for that ...").
+ Here are exact comparison counts across all the tests in sortperf.py,
when run with arguments "15 20 1".
Column Key:
*sort: random data
\sort: descending data
/sort: ascending data
3sort: ascending, then 3 random exchanges
+sort: ascending, then 10 random at the end
%sort: ascending, then randomly replace 1% of elements w/ random values
~sort: many duplicates
=sort: all equal
!sort: worst case scenario
First the trivial cases, trivial for samplesort because it special-cased
them, and trivial for timsort because it naturally works on runs. Within
an "n" block, the first line gives the # of compares done by samplesort,
the second line by timsort, and the third line is the percentage by
which the samplesort count exceeds the timsort count:
      n    \sort    /sort    =sort
-------   ------   ------   ------
  32768    32768    32767    32767   samplesort
           32767    32767    32767   timsort
           0.00%    0.00%    0.00%   (samplesort - timsort) / timsort

  65536    65536    65535    65535
           65535    65535    65535
           0.00%    0.00%    0.00%

 131072   131072   131071   131071
          131071   131071   131071
           0.00%    0.00%    0.00%

 262144   262144   262143   262143
          262143   262143   262143
           0.00%    0.00%    0.00%

 524288   524288   524287   524287
          524287   524287   524287
           0.00%    0.00%    0.00%

1048576  1048576  1048575  1048575
         1048575  1048575  1048575
           0.00%    0.00%    0.00%
The algorithms are effectively identical in these cases, except that
timsort does one less compare in \sort.
Now for the more interesting cases. Where lg(x) is the logarithm of x to
the base 2 (e.g., lg(8)=3), lg(n!) is the information-theoretic limit for
the best any comparison-based sorting algorithm can do on average (across
all permutations). When a method gets significantly below that, it's
either astronomically lucky, or is finding exploitable structure in the
data.
      n    lg(n!)      *sort      3sort      +sort      %sort      ~sort      !sort
-------  --------   --------   --------   --------   --------   --------   --------
  32768    444255     453096     453614      32908     452871     130491     469141   old
                      448885      33016      33007      50426     182083      65534   new
                       0.94%   1273.92%     -0.30%    798.09%    -28.33%    615.87%   %ch from new

  65536    954037     972699     981940      65686     973104     260029    1004607
                      962991      65821      65808     101667     364341     131070
                       1.01%   1391.83%     -0.19%    857.15%    -28.63%    666.47%

 131072   2039137    2101881    2091491     131232    2092894     554790    2161379
                     2057533     131410     131361     206193     728871     262142
                       2.16%   1491.58%     -0.10%    915.02%    -23.88%    724.51%

 262144   4340409    4464460    4403233     262314    4445884    1107842    4584560
                     4377402     262437     262459     416347    1457945     524286
                       1.99%   1577.82%     -0.06%    967.83%    -24.01%    774.44%

 524288   9205096    9453356    9408463     524468    9441930    2218577    9692015
                     9278734     524580     524633     837947    2916107    1048574
                       1.88%   1693.52%     -0.03%   1026.79%    -23.92%    824.30%

1048576  19458756   19950272   19838588    1048766   19912134    4430649   20434212
                    19606028    1048958    1048941    1694896    5832445    2097150
                       1.76%   1791.27%     -0.02%   1074.83%    -24.03%    874.38%
Discussion of cases:
*sort: There's no structure in random data to exploit, so the theoretical
limit is lg(n!). Both methods get close to that, and timsort is hugging
it (indeed, in a *marginal* sense, it's a spectacular improvement --
there's only about 1% left before hitting the wall, and timsort knows
darned well it's doing compares that won't pay on random data -- but so
does the samplesort hybrid). For contrast, Hoare's original random-pivot
quicksort does about 39% more compares than the limit, and the median-of-3
variant about 19% more.
3sort, %sort, and !sort: No contest; there's structure in this data, but
not of the specific kinds samplesort special-cases. Note that structure
in !sort wasn't put there on purpose -- it was crafted as a worst case for
a previous quicksort implementation. That timsort nails it came as a
surprise to me (although it's obvious in retrospect).
+sort: samplesort special-cases this data, and does a few less compares
than timsort. However, timsort runs this case significantly faster on all
boxes we have timings for, because timsort is in the business of merging
runs efficiently, while samplesort does much more data movement in this
(for it) special case.
~sort: samplesort's special cases for large masses of equal elements are
extremely effective on ~sort's specific data pattern, and timsort just
isn't going to get close to that, despite that it's clearly getting a
great deal of benefit out of the duplicates (the # of compares is much less
than lg(n!)). ~sort has a perfectly uniform distribution of just 4
distinct values, and as the distribution gets more skewed, samplesort's
equal-element gimmicks become less effective, while timsort's adaptive
strategies find more to exploit; in a database supplied by Kevin Altis, a
sort on its highly skewed "on which stock exchange does this company's
stock trade?" field ran over twice as fast under timsort.
However, despite that timsort does many more comparisons on ~sort, and
that on several platforms ~sort runs highly significantly slower under
timsort, on other platforms ~sort runs highly significantly faster under
timsort. No other kind of data has shown this wild x-platform behavior,
and we don't have an explanation for it. The only thing I can think of
that could transform what "should be" highly significant slowdowns into
highly significant speedups on some boxes are catastrophic cache effects
in samplesort.
But timsort "should be" slower than samplesort on ~sort, so it's hard
to count that it isn't on some boxes as a strike against it <wink>.
+ Here's the highwater mark for the number of heap-based temp slots (4
bytes each on this box) needed by each test, again with arguments
"15 20 1":
   2**i   *sort  \sort  /sort   3sort  +sort   %sort   ~sort  =sort   !sort
  32768   16384      0      0    6256      0   10821   12288      0   16383
  65536   32766      0      0   21652      0   31276   24576      0   32767
 131072   65534      0      0   17258      0   58112   49152      0   65535
 262144  131072      0      0   35660      0  123561   98304      0  131071
 524288  262142      0      0   31302      0  212057  196608      0  262143
1048576  524286      0      0  312438      0  484942  393216      0  524287
Discussion: The tests that end up doing (close to) perfectly balanced
merges (*sort, !sort) need all N//2 temp slots (or almost all). ~sort
also ends up doing balanced merges, but systematically benefits a lot from
the preliminary pre-merge searches described under "Merge Memory" later.
%sort approaches having a balanced merge at the end because the random
selection of elements to replace is expected to produce an out-of-order
element near the midpoint. \sort, /sort, =sort are the trivial one-run
cases, needing no merging at all. +sort ends up having one very long run
and one very short, and so gets all the temp space it needs from the small
temparray member of the MergeState struct (note that the same would be
true if the new random elements were prefixed to the sorted list instead,
but not if they appeared "in the middle"). 3sort approaches N//3 temp
slots twice, but the run lengths that remain after 3 random exchanges
clearly have very high variance.
A detailed description of timsort follows.
Runs
----
count_run() returns the # of elements in the next run. A run is either
"ascending", which means non-decreasing:
a0 <= a1 <= a2 <= ...
or "descending", which means strictly decreasing:
a0 > a1 > a2 > ...
Note that a run is always at least 2 long, unless we start at the array's
last element.
The definition of descending is strict, because the main routine reverses
a descending run in-place, transforming a descending run into an ascending
run. Reversal is done via the obvious fast "swap elements starting at each
end, and converge at the middle" method, and that can violate stability if
the slice contains any equal elements. Using a strict definition of
descending ensures that a descending run contains distinct elements.
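Sketched in C (modeled on count_run in listobject.c; ISLT stands for the
richcompare-based less-than, and error handling is omitted):

    /* Return the length of the run beginning at lo; set *descending if
       the run strictly decreases, so the caller can reverse it in place. */
    static Py_ssize_t
    count_run(PyObject **lo, PyObject **hi, int *descending)
    {
        Py_ssize_t n = 2;
        *descending = 0;
        if (lo + 1 == hi)
            return 1;
        if (ISLT(lo[1], lo[0])) {            /* strictly decreasing */
            *descending = 1;
            for (; lo + n < hi; ++n)
                if (!ISLT(lo[n], lo[n - 1]))
                    break;
        }
        else {                               /* non-decreasing */
            for (; lo + n < hi; ++n)
                if (ISLT(lo[n], lo[n - 1]))
                    break;
        }
        return n;
    }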
If an array is random, it's very unlikely we'll see long runs. If a natural
run contains less than minrun elements (see next section), the main loop
artificially boosts it to minrun elements, via a stable binary insertion sort
applied to the right number of array elements following the short natural
run. In a random array, *all* runs are likely to be minrun long as a
result. This has two primary good effects:
1. Random data strongly tends then toward perfectly balanced (both runs have
the same length) merges, which is the most efficient way to proceed when
data is random.
2. Because runs are never very short, the rest of the code doesn't make
heroic efforts to shave a few cycles off per-merge overheads. For
example, reasonable use of function calls is made, rather than trying to
inline everything. Since there are no more than N/minrun runs to begin
with, a few "extra" function calls per merge is barely measurable.
Computing minrun
----------------
If N < 64, minrun is N. IOW, binary insertion sort is used for the whole
array then; it's hard to beat that given the overheads of trying something
fancier (see note BINSORT).
When N is a power of 2, testing on random data showed that minrun values of
16, 32, 64 and 128 worked about equally well. At 256 the data-movement cost
in binary insertion sort clearly hurt, and at 8 the increase in the number
of function calls clearly hurt. Picking *some* power of 2 is important
here, so that the merges end up perfectly balanced (see next section). We
pick 32 as a good value in the sweet range; picking a value at the low end
allows the adaptive gimmicks more opportunity to exploit shorter natural
runs.
Because sortperf.py only tries powers of 2, it took a long time to notice
that 32 isn't a good choice for the general case! Consider N=2112:
>>> divmod(2112, 32)
(66, 0)
>>>
If the data is randomly ordered, we're very likely to end up with 66 runs
each of length 32. The first 64 of these trigger a sequence of perfectly
balanced merges (see next section), leaving runs of lengths 2048 and 64 to
merge at the end. The adaptive gimmicks can do that with fewer than 2048+64
compares, but it's still more compares than necessary, and-- mergesort's
bugaboo relative to samplesort --a lot more data movement (O(N) copies just
to get 64 elements into place).
If we take minrun=33 in this case, then we're very likely to end up with 64
runs each of length 33, and then all merges are perfectly balanced. Better!
What we want to avoid is picking minrun such that in
q, r = divmod(N, minrun)
q is a power of 2 and r>0 (then the last merge only gets r elements into
place, and r < minrun is small compared to N), or q a little larger than a
power of 2 regardless of r (then we've got a case similar to "2112", again
leaving too little work for the last merge to do).
Instead we pick a minrun in range(32, 65) such that N/minrun is exactly a
power of 2, or if that isn't possible, is close to, but strictly less than,
a power of 2. This is easier to do than it may sound: take the first 6
bits of N, and add 1 if any of the remaining bits are set. In fact, that
rule covers every case in this section, including small N and exact powers
of 2; merge_compute_minrun() is a deceptively simple function.
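The bit trick is only a few lines of C; merge_compute_minrun in CPython's
listobject.c is essentially:

    static Py_ssize_t
    merge_compute_minrun(Py_ssize_t n)
    {
        Py_ssize_t r = 0;    /* becomes 1 if any 1 bits are shifted off */
        while (n >= 64) {
            r |= n & 1;
            n >>= 1;
        }
        return n + r;
    }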
The Merge Pattern
-----------------
In order to exploit regularities in the data, we're merging on natural
run lengths, and they can become wildly unbalanced. That's a Good Thing
for this sort! It means we have to find a way to manage an assortment of
potentially very different run lengths, though.
Stability constrains permissible merging patterns. For example, if we have
3 consecutive runs of lengths
A:10000 B:20000 C:10000
we dare not merge A with C first, because if A, B and C happen to contain
a common element, it would get out of order wrt its occurrence(s) in B. The
merging must be done as (A+B)+C or A+(B+C) instead.
So merging is always done on two consecutive runs at a time, and in-place,
although this may require some temp memory (more on that later).
When a run is identified, its base address and length are pushed on a stack
in the MergeState struct. merge_collapse() is then called to see whether it
should merge it with preceding run(s). We would like to delay merging as
long as possible in order to exploit patterns that may come up later, but we
like even more to do merging as soon as possible to exploit that the run just
found is still high in the memory hierarchy. We also can't delay merging
"too long" because it consumes memory to remember the runs that are still
unmerged, and the stack has a fixed size.
What turned out to be a good compromise maintains two invariants on the
stack entries, where A, B and C are the lengths of the three rightmost not-yet
merged slices:
1. A > B+C
2. B > C
Note that, by induction, #2 implies the lengths of pending runs form a
decreasing sequence. #1 implies that, reading the lengths right to left,
the pending-run lengths grow at least as fast as the Fibonacci numbers.
Therefore the stack can never grow larger than about log_base_phi(N) entries,
where phi = (1+sqrt(5))/2 ~= 1.618. Thus a small # of stack slots suffice
for very large arrays.
If A <= B+C, the smaller of A and C is merged with B (ties favor C, for the
freshness-in-cache reason), and the new run replaces the A,B or B,C entries;
e.g., if the last 3 entries are
A:30 B:20 C:10
then B is merged with C, leaving
A:30 BC:30
on the stack. Or if they were
A:500 B:400 C:1000
then A is merged with B, leaving
AB:900 C:1000
on the stack.
In both examples, the stack configuration after the merge still violates
invariant #2, and merge_collapse() goes on to continue merging runs until
both invariants are satisfied. As an extreme case, suppose we didn't do the
minrun gimmick, and natural runs were of lengths 128, 64, 32, 16, 8, 4, 2,
and 2. Nothing would get merged until the final 2 was seen, and that would
trigger 7 perfectly balanced merges.
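In C, the invariant-restoring loop is roughly the following (simplified
from merge_collapse; p is the stack of pending runs and merge_at(ms, n)
merges runs p[n] and p[n+1]):

    while (ms->n > 1) {
        Py_ssize_t n = ms->n - 2;
        if (n > 0 && p[n-1].len <= p[n].len + p[n+1].len) {
            if (p[n-1].len < p[n+1].len)
                --n;                     /* merge A with B; ties favor C */
            if (merge_at(ms, n) < 0)
                return -1;
        }
        else if (p[n].len <= p[n+1].len) {
            if (merge_at(ms, n) < 0)     /* restore invariant B > C */
                return -1;
        }
        else
            break;                       /* both invariants hold */
    }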
The thrust of these rules when they trigger merging is to balance the run
lengths as closely as possible, while keeping a low bound on the number of
runs we have to remember. This is maximally effective for random data,
where all runs are likely to be of (artificially forced) length minrun, and
then we get a sequence of perfectly balanced merges (with, perhaps, some
oddballs at the end).
OTOH, one reason this sort is so good for partly ordered data has to do
with wildly unbalanced run lengths.
Merge Memory
------------
Merging adjacent runs of lengths A and B in-place, and in linear time, is
difficult. Theoretical constructions are known that can do it, but they're
too difficult and slow for practical use. But if we have temp memory equal
to min(A, B), it's easy.
If A is smaller (function merge_lo), copy A to a temp array, leave B alone,
and then we can do the obvious merge algorithm left to right, from the temp
area and B, starting the stores into where A used to live. There's always a
free area in the original area comprising a number of elements equal to the
number not yet merged from the temp array (trivially true at the start;
proceed by induction). The only tricky bit is that if a comparison raises an
exception, we have to remember to copy the remaining elements back in from
the temp area, lest the array end up with duplicate entries from B. But
that's exactly the same thing we need to do if we reach the end of B first,
so the exit code is pleasantly common to both the normal and error cases.
If B is smaller (function merge_hi, which is merge_lo's "mirror image"),
much the same, except that we need to merge right to left, copying B into a
temp array and starting the stores at the right end of where B used to live.
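Stripped of galloping and error recovery, the heart of merge_lo is the
familiar two-finger merge (a sketch, not the verbatim code):

    /* A (length na) has been copied to tmp; merge tmp and B into the
       space starting where A used to live. Ties go to A, which keeps
       the sort stable because A is the left run. */
    PyObject **pa = tmp, **pb = ssb, **dest = ssa;
    Py_ssize_t a = na, b = nb;
    while (a > 0 && b > 0) {
        if (ISLT(*pb, *pa)) {
            *dest++ = *pb++; --b;        /* B's element strictly smaller */
        }
        else {
            *dest++ = *pa++; --a;        /* A wins ties */
        }
    }
    while (a-- > 0)
        *dest++ = *pa++;                 /* leftovers from the temp area */
    /* leftover B elements are already in their final positions */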
A refinement: When we're about to merge adjacent runs A and B, we first do
a form of binary search (more on that later) to see where B[0] should end up
in A. Elements in A preceding that point are already in their final
positions, effectively shrinking the size of A. Likewise we also search to
see where A[-1] should end up in B, and elements of B after that point can
also be ignored. This cuts the amount of temp memory needed by the same
amount.
These preliminary searches may not pay off, and can be expected *not* to
repay their cost if the data is random. But they can win huge in all of
time, copying, and memory savings when they do pay, so this is one of the
"per-merge overheads" mentioned above that we're happy to endure because
there is at most one very short run. It's generally true in this algorithm
that we're willing to gamble a little to win a lot, even though the net
expectation is negative for random data.
Merge Algorithms
----------------
merge_lo() and merge_hi() are where the bulk of the time is spent. merge_lo
deals with runs where A <= B, and merge_hi where A > B. They don't know
whether the data is clustered or uniform, but a lovely thing about merging
is that many kinds of clustering "reveal themselves" by how many times in a
row the winning merge element comes from the same run. We'll only discuss
merge_lo here; merge_hi is exactly analogous.
Merging begins in the usual, obvious way, comparing the first element of A
to the first of B, and moving B[0] to the merge area if it's less than A[0],
else moving A[0] to the merge area. Call that the "one pair at a time"
mode. The only twist here is keeping track of how many times in a row "the
winner" comes from the same run.
If that count reaches MIN_GALLOP, we switch to "galloping mode". Here
we *search* B for where A[0] belongs, and move over all the B's before
that point in one chunk to the merge area, then move A[0] to the merge
area. Then we search A for where B[0] belongs, and similarly move a
slice of A in one chunk. Then back to searching B for where A[0] belongs,
etc. We stay in galloping mode until both searches find slices to copy
less than MIN_GALLOP elements long, at which point we go back to one-pair-
at-a-time mode.
A refinement: The MergeState struct contains the value of min_gallop that
controls when we enter galloping mode, initialized to MIN_GALLOP.
merge_lo() and merge_hi() adjust this higher when galloping isn't paying
off, and lower when it is.
Galloping
---------
Still without loss of generality, assume A is the shorter run. In galloping
mode, we first look for A[0] in B. We do this via "galloping", comparing
A[0] in turn to B[0], B[1], B[3], B[7], ..., B[2**j - 1], ..., until finding
the k such that B[2**(k-1) - 1] < A[0] <= B[2**k - 1]. This takes at most
roughly lg(B) comparisons, and, unlike a straight binary search, favors
finding the right spot early in B (more on that later).
After finding such a k, the region of uncertainty is reduced to 2**(k-1) - 1
consecutive elements, and a straight binary search requires exactly k-1
additional comparisons to nail it (see note REGION OF UNCERTAINTY). Then we
copy all the B's up to that point in one chunk, and then copy A[0]. Note
that no matter where A[0] belongs in B, the combination of galloping + binary
search finds it in no more than about 2*lg(B) comparisons.
If we did a straight binary search, we could find it in no more than
ceiling(lg(B+1)) comparisons -- but straight binary search takes that many
comparisons no matter where A[0] belongs. Straight binary search thus loses
to galloping unless the run is quite long, and we simply can't guess
whether it is in advance.
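A sketch of the probe loop in C (the idea only, not CPython's exact
gallop_left, which also takes a hint and guards against overflow):

    Py_ssize_t lastofs = 0, ofs = 1;     /* ofs runs through 1, 2, 4, 8, ... */
    while (ofs < nb && ISLT(b[ofs - 1], key)) {
        lastofs = ofs;                   /* probes b[0], b[1], b[3], b[7], ... */
        ofs <<= 1;
    }
    if (ofs > nb)
        ofs = nb;
    /* Now b[lastofs - 1] < key <= b[ofs - 1], treating b[nb] as an
       imaginary +infinity; a plain binary search over (lastofs, ofs)
       pins down the exact position. */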
If data is random and runs have the same length, A[0] belongs at B[0] half
the time, at B[1] a quarter of the time, and so on: a consecutive winning
sub-run in B of length k occurs with probability 1/2**(k+1). So long
winning sub-runs are extremely unlikely in random data, and guessing that a
winning sub-run is going to be long is a dangerous game.
OTOH, if data is lopsided or lumpy or contains many duplicates, long
stretches of winning sub-runs are very likely, and cutting the number of
comparisons needed to find one from O(B) to O(log B) is a huge win.
Galloping compromises by getting out fast if there isn't a long winning
sub-run, yet finding such very efficiently when they exist.
I first learned about the galloping strategy in a related context; see:
"Adaptive Set Intersections, Unions, and Differences" (2000)
Erik D. Demaine, Alejandro López-Ortiz, J. Ian Munro
and its followup(s). An earlier paper called the same strategy
"exponential search":
"Optimistic Sorting and Information Theoretic Complexity"
Peter McIlroy
SODA (Fourth Annual ACM-SIAM Symposium on Discrete Algorithms), pp
467-474, Austin, Texas, 25-27 January 1993.
and it probably dates back to an earlier paper by Bentley and Yao. The
McIlroy paper in particular has good analysis of a mergesort that's
probably strongly related to this one in its galloping strategy.
Galloping with a Broken Leg
---------------------------
So why don't we always gallop? Because it can lose, on two counts:
1. While we're willing to endure small per-merge overheads, per-comparison
overheads are a different story. Calling Yet Another Function per
comparison is expensive, and gallop_left() and gallop_right() are
too long-winded for sane inlining.
2. Galloping can-- alas --require more comparisons than linear one-at-a-time
search, depending on the data.
#2 requires details. If A[0] belongs before B[0], galloping requires 1
compare to determine that, same as linear search, except it costs more
to call the gallop function. If A[0] belongs right before B[1], galloping
requires 2 compares, again same as linear search. On the third compare,
galloping checks A[0] against B[3], and if it's <=, requires one more
compare to determine whether A[0] belongs at B[2] or B[3]. That's a total
of 4 compares, but if A[0] does belong at B[2], linear search would have
discovered that in only 3 compares, and that's a huge loss! Really. It's
an increase of 33% in the number of compares needed, and comparisons are
expensive in Python.
 index in B where    # compares linear   # gallop   # binary   gallop
 A[0] belongs        search needs        compares   compares   total
 ----------------    -----------------   --------   --------   ------
                0                    1          1          0        1
                1                    2          2          0        2
                2                    3          3          1        4
                3                    4          3          1        4
                4                    5          4          2        6
                5                    6          4          2        6
                6                    7          4          2        6
                7                    8          4          2        6
                8                    9          5          3        8
                9                   10          5          3        8
               10                   11          5          3        8
               11                   12          5          3        8
 ...
In general, if A[0] belongs at B[i], linear search requires i+1 comparisons
to determine that, and galloping a total of 2*floor(lg(i))+2 comparisons.
The advantage of galloping is unbounded as i grows, but it doesn't win at
all until i=6. Before then, it loses twice (at i=2 and i=4), and ties
at the other values. At and after i=6, galloping always wins.
We can't guess in advance when it's going to win, though, so we do one pair
at a time until the evidence seems strong that galloping may pay. MIN_GALLOP
is 7, and that's pretty strong evidence. However, if the data is random, it
simply will trigger galloping mode purely by luck every now and again, and
it's quite likely to hit one of the losing cases next. On the other hand,
in cases like ~sort, galloping always pays, and MIN_GALLOP is larger than it
"should be" then. So the MergeState struct keeps a min_gallop variable
that merge_lo and merge_hi adjust: the longer we stay in galloping mode,
the smaller min_gallop gets, making it easier to transition back to
galloping mode (if we ever leave it in the current merge, and at the
start of the next merge). But whenever the gallop loop doesn't pay,
min_gallop is increased by one, making it harder to transition back
to galloping mode (and again both within a merge and across merges). For
random data, this all but eliminates the gallop penalty: min_gallop grows
large enough that we almost never get into galloping mode. And for cases
like ~sort, min_gallop can fall to as low as 1. This seems to work well,
but in all it's a minor improvement over using a fixed MIN_GALLOP value.
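The adaptation itself is tiny; inside merge_lo it amounts to something
like this simplified sketch (acount/bcount are the current winning-streak
lengths):

    for (;;) {
        /* ... gallop through A, then through B, setting acount/bcount ... */
        min_gallop -= min_gallop > 1;    /* paying off: lower the bar */
        ms->min_gallop = min_gallop;
        if (acount < MIN_GALLOP && bcount < MIN_GALLOP)
            break;                       /* streaks dried up: leave */
    }
    ++min_gallop;                        /* penalize leaving galloping mode */
    ms->min_gallop = min_gallop;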
Galloping Complication
----------------------
The description above was for merge_lo. merge_hi has to merge "from the
other end", and really needs to gallop starting at the last element in a run
instead of the first. Galloping from the first still works, but does more
comparisons than it should (this is significant -- I timed it both ways). For
this reason, the gallop_left() and gallop_right() (see note LEFT OR RIGHT)
functions have a "hint" argument, which is the index at which galloping
should begin. So galloping can actually start at any index, and proceed at
offsets of 1, 3, 7, 15, ... or -1, -3, -7, -15, ... from the starting index.
In the code as I type, it's always called with either 0 or n-1 (where n is
the # of elements in a run). It's tempting to try to do something fancier,
melding galloping with some form of interpolation search; for example, if
we're merging a run of length 1 with a run of length 10000, index 5000 is
probably a better guess at the final result than either 0 or 9999. But
it's unclear how to generalize that intuition usefully, and merging of
wildly unbalanced runs already enjoys excellent performance.
~sort is a good example of when balanced runs could benefit from a better
hint value: to the extent possible, this would like to use a starting
offset equal to the previous value of acount/bcount. Doing so saves about
10% of the compares in ~sort. However, doing so is also a mixed bag,
hurting other cases.
Comparing Average # of Compares on Random Arrays
------------------------------------------------
[NOTE: This was done when the new algorithm used about 0.1% more compares
on random data than does its current incarnation.]
Here list.sort() is samplesort, and list.msort() this sort:
"""
import random
from time import clock as now

def fill(n):
    from random import random
    return [random() for i in range(n)]

def mycmp(x, y):
    global ncmp
    ncmp += 1
    return cmp(x, y)

def timeit(values, method):
    global ncmp
    X = values[:]
    bound = getattr(X, method)
    ncmp = 0
    t1 = now()
    bound(mycmp)
    t2 = now()
    return t2-t1, ncmp

format = "%5s %9.2f %11d"
f2 = "%5s %9.2f %11.2f"

def drive():
    count = sst = sscmp = mst = mscmp = nelts = 0
    while True:
        n = random.randrange(100000)
        nelts += n
        x = fill(n)
        t, c = timeit(x, 'sort')
        sst += t
        sscmp += c
        t, c = timeit(x, 'msort')
        mst += t
        mscmp += c
        count += 1
        if count % 10:
            continue
        print "count", count, "nelts", nelts
        print format % ("sort",  sst,  sscmp)
        print format % ("msort", mst,  mscmp)
        print f2 % ("", (sst-mst)*1e2/mst, (sscmp-mscmp)*1e2/mscmp)

drive()
"""
I ran this on Windows and kept using the computer lightly while it was
running. time.clock() is wall-clock time on Windows, with better than
microsecond resolution. samplesort started with a 1.52% #-of-comparisons
disadvantage, fell quickly to 1.48%, and then fluctuated within that small
range. Here's the last chunk of output before I killed the job:
count 2630 nelts 130906543
 sort   6110.80  1937887573
msort   6002.78  1909389381
           1.80        1.49
We've done nearly 2 billion comparisons apiece at Python speed there, and
that's enough <wink>.
For random arrays of size 2 (yes, there are only 2 interesting ones),
samplesort has a 50%(!) comparison disadvantage. This is a consequence of
samplesort special-casing at most one ascending run at the start, then
falling back to the general case if it doesn't find an ascending run
immediately. The consequence is that it ends up using two compares to sort
[2, 1]. Gratifyingly, timsort doesn't do any special-casing, so had to be
taught how to deal with mixtures of ascending and descending runs
efficiently in all cases.
NOTES
-----
BINSORT
A "binary insertion sort" is just like a textbook insertion sort, but instead
of locating the correct position of the next item via linear (one at a time)
search, an equivalent to Python's bisect.bisect_right is used to find the
correct position in logarithmic time. Most texts don't mention this
variation, and those that do usually say it's not worth the bother: insertion
sort remains quadratic (expected and worst cases) either way. Speeding the
search doesn't reduce the quadratic data movement costs.
But in CPython's case, comparisons are extraordinarily expensive compared to
moving data, and the details matter. Moving objects is just copying
pointers. Comparisons can be arbitrarily expensive (can invoke arbitrary
user-supplied Python code), but even in simple cases (like 3 < 4) _all_
decisions are made at runtime: what's the type of the left comparand? the
type of the right? do they need to be coerced to a common type? where's the
code to compare these types? And so on. Even the simplest Python comparison
triggers a large pile of C-level pointer dereferences, conditionals, and
function calls.
So cutting the number of compares is almost always measurably helpful in
CPython, and the savings swamp the quadratic-time data movement costs for
reasonable minrun values.
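A sketch of binary insertion sort (modeled on binarysort in listobject.c;
a[lo:start] is already sorted on entry and ISLT is the comparison):

    for (; start < hi; ++start) {
        PyObject *pivot = a[start];
        Py_ssize_t l = lo, r = start;
        /* bisect_right: invariant a[lo:l] <= pivot < a[r:start],
           so equal elements keep their order (stability) */
        while (l < r) {
            Py_ssize_t m = l + ((r - l) >> 1);
            if (ISLT(pivot, a[m]))
                r = m;
            else
                l = m + 1;
        }
        /* shift a[l:start] right one slot; just pointer copies */
        memmove(&a[l + 1], &a[l], (start - l) * sizeof(PyObject *));
        a[l] = pivot;
    }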
LEFT OR RIGHT
gallop_left() and gallop_right() are akin to the Python bisect module's
bisect_left() and bisect_right(): they're the same unless the slice they're
searching contains at least one value equal to the value being searched
for. In that case, gallop_left() returns the position immediately before the
leftmost equal value, and gallop_right() the position immediately after the
rightmost equal value. The distinction is needed to preserve stability. In
general, when merging adjacent runs A and B, gallop_left is used to search
thru B for where an element from A belongs, and gallop_right to search thru A
for where an element from B belongs.
REGION OF UNCERTAINTY
Two kinds of confusion seem to be common about the claim that after finding
a k such that
B[2**(k-1) - 1] < A[0] <= B[2**k - 1]
then a binary search requires exactly k-1 tries to find A[0]'s proper
location. For concreteness, say k=3, so B[3] < A[0] <= B[7].
The first confusion takes the form "OK, then the region of uncertainty is at
indices 3, 4, 5, 6 and 7: that's 5 elements, not the claimed 2**(k-1) - 1 =
3"; or the region is viewed as a Python slice and the objection is "but that's
the slice B[3:7], so has 7-3 = 4 elements". Resolution: we've already
compared A[0] against B[3] and against B[7], so A[0]'s correct location is
already known wrt _both_ endpoints. What remains is to find A[0]'s correct
location wrt B[4], B[5] and B[6], which spans 3 elements. Or in general, the
slice (leaving off both endpoints) (2**(k-1)-1)+1 through (2**k-1)-1
inclusive = 2**(k-1) through (2**k-1)-1 inclusive, which has
(2**k-1)-1 - 2**(k-1) + 1 =
2**k-1 - 2**(k-1) =
2*2**(k-1)-1 - 2**(k-1) =
(2-1)*2**(k-1) - 1 =
2**(k-1) - 1
elements.
The second confusion: "k-1 = 2 binary searches can find the correct location
among 2**(k-1) = 4 elements, but you're only applying it to 3 elements: we
could make this more efficient by arranging for the region of uncertainty to
span 2**(k-1) elements." Resolution: that confuses "elements" with
"locations". In a slice with N elements, there are N+1 _locations_. In the
example, with the region of uncertainty B[4], B[5], B[6], there are 4
locations: before B[4], between B[4] and B[5], between B[5] and B[6], and
after B[6]. In general, across 2**(k-1)-1 elements, there are 2**(k-1)
locations. That's why k-1 binary searches are necessary and sufficient.

View file

@@ -1,135 +0,0 @@
All about co_lnotab, the line number table.
Code objects store a field named co_lnotab. This is an array of unsigned bytes
disguised as a Python bytes object. It is used to map bytecode offsets to
source code line #s for tracebacks and to identify line number boundaries for
line tracing.
The array is conceptually a compressed list of
(bytecode offset increment, line number increment)
pairs. The details are important and delicate, best illustrated by example:
    byte code offset    source code line number
         0                    1
         6                    2
        50                    7
       350                  207
       361                  208
Instead of storing these numbers literally, we compress the list by storing only
the difference from one row to the next. Conceptually, the stored list might
look like:
0, 1, 6, 1, 44, 5, 300, 200, 11, 1
The above doesn't really work, but it's a start. An unsigned byte (byte
code offset) can't hold negative values, or values larger than 255; a
signed byte (line number) can't hold values larger than 127 or less than
-128; and the above example contains two such values. (Note that before
3.6, line number was also encoded by an unsigned byte.) So we make two
tweaks:
(a) there's a deep assumption that byte code offsets increase monotonically,
and
(b) if byte code offset jumps by more than 255 from one row to the next, or if
source code line number jumps by more than 127 or less than -128 from one row
to the next, more than one pair is written to the table. In case #b,
there's no way to know from looking at the table later how many were written.
That's the delicate part. A user of co_lnotab desiring to find the source
line number corresponding to a bytecode address A should do something like
this:
    lineno = addr = 0
    for addr_incr, line_incr in co_lnotab:
        addr += addr_incr
        if addr > A:
            return lineno
        if line_incr >= 0x80:
            line_incr -= 0x100
        lineno += line_incr
(In C, this is implemented by PyCode_Addr2Line().) In order for this to work,
when the addr field increments by more than 255, the line # increment in each
pair generated must be 0 until the remaining addr increment is < 256. So, in
the example above, assemble_lnotab in compile.c should not (as was actually done
until 2.2) expand 300, 200 to
255, 255, 45, 45,
but to
255, 0, 45, 127, 0, 73.
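The consumer side in C is essentially PyCode_Addr2Line, lightly
simplified:

    int
    PyCode_Addr2Line(PyCodeObject *co, int addrq)
    {
        Py_ssize_t size = PyBytes_Size(co->co_lnotab) / 2;
        unsigned char *p = (unsigned char *)PyBytes_AsString(co->co_lnotab);
        int line = co->co_firstlineno;
        int addr = 0;
        while (--size >= 0) {
            addr += *p++;
            if (addr > addrq)
                break;
            line += (signed char)*p++;   /* line deltas are signed since 3.6 */
        }
        return line;
    }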
The above is sufficient to reconstruct line numbers for tracebacks, but not for
line tracing. Tracing is handled by PyCode_CheckLineNumber() in codeobject.c
and maybe_call_line_trace() in ceval.c.
*** Tracing ***
To a first approximation, we want to call the tracing function when the line
number of the current instruction changes. Re-computing the current line for
every instruction is a little slow, though, so each time we compute the line
number we save the bytecode indices where it's valid:
*instr_lb <= frame->f_lasti < *instr_ub
is true so long as execution does not change lines. That is, *instr_lb holds
the first bytecode index of the current line, and *instr_ub holds the first
bytecode index of the next line. As long as the above expression is true,
maybe_call_line_trace() does not need to call PyCode_CheckLineNumber(). Note
that the same line may appear multiple times in the lnotab, either because the
bytecode jumped more than 255 indices between line number changes or because
the compiler inserted the same line twice. Even in that case, *instr_ub holds
the first index of the next line.
However, we don't *always* want to call the line trace function when the above
test fails.
Consider this code:
    1: def f(a):
    2:     while a:
    3:         print(1)
    4:         break
    5:     else:
    6:         print(2)
which compiles to this:
  2           0 SETUP_LOOP              26 (to 28)
        >>    2 LOAD_FAST                0 (a)
              4 POP_JUMP_IF_FALSE       18

  3           6 LOAD_GLOBAL              0 (print)
              8 LOAD_CONST               1 (1)
             10 CALL_FUNCTION            1
             12 POP_TOP

  4          14 BREAK_LOOP
             16 JUMP_ABSOLUTE            2
        >>   18 POP_BLOCK

  6          20 LOAD_GLOBAL              0 (print)
             22 LOAD_CONST               2 (2)
             24 CALL_FUNCTION            1
             26 POP_TOP
        >>   28 LOAD_CONST               0 (None)
             30 RETURN_VALUE
If 'a' is false, execution will jump to the POP_BLOCK instruction at offset 18
and the co_lnotab will claim that execution has moved to line 4, which is wrong.
In this case, we could instead associate the POP_BLOCK with line 5, but that
would break jumps around loops without else clauses.
We fix this by only calling the line trace function for a forward jump if the
co_lnotab indicates we have jumped to the *start* of a line, i.e. if the current
instruction offset matches the offset given for the start of a line by the
co_lnotab. For backward jumps, however, we always call the line trace function,
which lets a debugger stop on every evaluation of a loop guard (which usually
won't be the first opcode in a line).
Why do we set f_lineno when tracing, and only just before calling the trace
function? Well, consider the code above when 'a' is true. If stepping through
this with 'n' in pdb, you would stop at line 1 with a "call" type event, then
line events on lines 2, 3, and 4, then a "return" type event -- but because the
code for the return actually falls in the range of the "line 6" opcodes, you
would be shown line 6 during this event. This is a change from the behaviour in
2.2 and before, and I've found it confusing in practice. By setting and using
f_lineno when tracing, one can report a line number different from that
suggested by f_lasti on this one occasion where it's desirable.

View file

@@ -1,13 +1,11 @@
/* clang-format off */
/* Long (arbitrary precision) integer object implementation */
/* XXX The functional organization of this file is terrible */
#include "Python.h"
#include "longintrepr.h"
#include <float.h>
#include <ctype.h>
#include <stddef.h>
#include "third_party/python/Include/Python.h"
#include "libc/log/check.h"
#include "third_party/python/Include/longintrepr.h"
#ifndef NSMALLPOSINTS
#define NSMALLPOSINTS 257
@ -1583,7 +1581,7 @@ long_to_decimal_string_internal(PyObject *aa,
digit *pout, *pin, rem, tenpow;
int negative;
int d;
enum PyUnicode_Kind kind;
enum PyUnicode_Kind kind = -1;
a = (PyLongObject *)aa;
if (a == NULL || !PyLong_Check(a)) {
@ -1675,6 +1673,8 @@ long_to_decimal_string_internal(PyObject *aa,
kind = PyUnicode_KIND(str);
}
CHECK_NE(-1, kind); /* if this fails there's a serious bug upstream */
#define WRITE_DIGITS(p) \
do { \
/* pout[0] through pout[size-2] contribute exactly \
@ -1773,7 +1773,7 @@ long_format_binary(PyObject *aa, int base, int alternate,
PyObject *v = NULL;
Py_ssize_t sz;
Py_ssize_t size_a;
enum PyUnicode_Kind kind;
enum PyUnicode_Kind kind = -1;
int negative;
int bits;
@ -1840,6 +1840,8 @@ long_format_binary(PyObject *aa, int base, int alternate,
kind = PyUnicode_KIND(v);
}
CHECK_NE(-1, kind); /* if this fails there's a serious bug upstream */
#define WRITE_DIGITS(p) \
do { \
if (size_a == 0) { \

View file

@ -1,8 +1,8 @@
/* clang-format off */
/* Memoryview object implementation */
#include "Python.h"
#include "pystrhex.h"
#include <stddef.h>
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/pystrhex.h"
/****************************************************************************/

View file

@ -1,8 +1,9 @@
/* clang-format off */
/* Method object implementation */
#include "Python.h"
#include "structmember.h"
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
/* Free list for method objects to save malloc/free overhead
* The m_self element is used to chain the objects.

View file

@ -1,8 +1,6 @@
/* Module object implementation */
#include "Python.h"
#include "structmember.h"
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
static Py_ssize_t max_module_number;

View file

@ -1,8 +1,6 @@
// namespace object implementation
#include "Python.h"
#include "structmember.h"
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
typedef struct {
PyObject_HEAD

View file

@ -1,12 +1,6 @@
/* Generic object operations; and implementation of None */
#include "Python.h"
#include "frameobject.h"
#ifdef __cplusplus
extern "C" {
#endif
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/frameobject.h"
_Py_IDENTIFIER(Py_Repr);
_Py_IDENTIFIER(__bytes__);
@ -2074,7 +2068,3 @@ _Py_Dealloc(PyObject *op)
(*Py_TYPE(op)->tp_dealloc)(op);
}
#endif
#ifdef __cplusplus
}
#endif

View file

@ -1,6 +1,8 @@
#include "Python.h"
#include <stdbool.h>
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/prot.h"
#include "libc/sysv/consts/prot.h"
/* Defined in tracemalloc.c */
@ -61,14 +63,7 @@ static void _PyMem_DebugCheckAddress(char api_id, const void *p);
#ifdef WITH_PYMALLOC
#ifdef MS_WINDOWS
# include <windows.h>
#elif defined(HAVE_MMAP)
# include <sys/mman.h>
# ifdef MAP_ANONYMOUS
# define ARENAS_USE_MMAP
# endif
#endif
#define ARENAS_USE_MMAP
/* Forward declaration */
static void* _PyObject_Malloc(void *ctx, size_t size);

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* Ordered Dictionary object implementation.
This implementation is necessarily explicitly equivalent to the pure Python
@ -464,11 +465,9 @@ later:
*/
#include "Python.h"
#include "structmember.h"
#include "dict-common.h"
#include <stddef.h>
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
#include "third_party/python/Objects/dict-common.h"
typedef struct _odictnode _ODictNode;

View file

@ -1,7 +1,6 @@
/* Range object implementation */
#include "Python.h"
#include "structmember.h"
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
/* Support objects whose length is > PY_SSIZE_T_MAX.

View file

@ -1,3 +1,6 @@
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
/* set object implementation
@ -24,9 +27,6 @@
NULL if the rich comparison returns an error.
*/
#include "Python.h"
#include "structmember.h"
/* Object used as dummy key to fill deleted entries */
static PyObject _dummy_struct;

View file

@ -1,3 +1,7 @@
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
/*
Written by Jim Hugunin and Chris Chase.
@ -13,9 +17,6 @@ the Py_NoneStruct in that there is no way to create other objects of
this type and there is exactly one in existence.
*/
#include "Python.h"
#include "structmember.h"
static PyObject *
ellipsis_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{

View file

@ -1,3 +1,5 @@
/* clang-format off */
/* clang-format off */
/* this is sort of a hack. there's at least one place (formatting
floats) where some stringlib code takes a different path if it's
compiled as unicode. */

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* stringlib: codec implementations */
#if !STRINGLIB_IS_UNICODE

View file

@ -1,7 +1,8 @@
/* clang-format off */
/* stringlib: count implementation */
#ifndef STRINGLIB_FASTSEARCH_H
#error must include "stringlib/fastsearch.h" before including this module
#error must include fastsearch.inc before including this module
#endif
Py_LOCAL_INLINE(Py_ssize_t)

View file

@ -1,8 +1,9 @@
/* clang-format off */
#if STRINGLIB_IS_UNICODE
# error "ctype.h only compatible with byte-wise strings"
#endif
#include "bytes_methods.h"
#include "third_party/python/Include/bytes_methods.h"
static PyObject*
stringlib_isspace(PyObject *self)

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* Fast unicode equal function optimized for dictobject.c and setobject.c */
/* Return 1 if two unicode objects are equal, 0 if not.

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* stringlib: fastsearch implementation */
#define STRINGLIB_FASTSEARCH_H

View file

@ -1,7 +1,8 @@
/* clang-format off */
/* stringlib: find/index implementation */
#ifndef STRINGLIB_FASTSEARCH_H
#error must include "stringlib/fastsearch.h" before including this module
#error must include fastsearch.inc before including this module
#endif
Py_LOCAL_INLINE(Py_ssize_t)

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* Finding the optimal width of unicode characters in a buffer */
#if !STRINGLIB_IS_UNICODE

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* stringlib: bytes joining implementation */
#if STRINGLIB_IS_UNICODE

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* _PyUnicode_InsertThousandsGrouping() helper functions */
typedef struct {

View file

@ -1,7 +1,8 @@
/* clang-format off */
/* stringlib: partition implementation */
#ifndef STRINGLIB_FASTSEARCH_H
#error must include "stringlib/fastsearch.h" before including this module
#error must include fastsearch.inc before including this module
#endif
Py_LOCAL_INLINE(PyObject*)

View file

@ -1,7 +1,8 @@
/* clang-format off */
/* stringlib: replace implementation */
#ifndef STRINGLIB_FASTSEARCH_H
#error must include "stringlib/fastsearch.h" before including this module
#error must include fastsearch.inc before including this module
#endif
Py_LOCAL_INLINE(void)

View file

@ -1,7 +1,8 @@
/* clang-format off */
/* stringlib: split implementation */
#ifndef STRINGLIB_FASTSEARCH_H
#error must include "stringlib/fastsearch.h" before including this module
#error must include fastsearch.inc before including this module
#endif
/* Overallocate the initial list to reduce the number of reallocs for small

View file

@ -1,3 +1,4 @@
/* clang-format off */
#ifndef STRINGLIB_STRINGDEFS_H
#define STRINGLIB_STRINGDEFS_H

View file

@ -1,3 +1,4 @@
/* clang-format off */
#if STRINGLIB_IS_UNICODE
# error "transmogrify.h only compatible with byte-wise strings"
#endif

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* this is sort of a hack. there's at least one place (formatting
floats) where some stringlib code takes a different path if it's
compiled as unicode. */

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* this is sort of a hack. there's at least one place (formatting
floats) where some stringlib code takes a different path if it's
compiled as unicode. */

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* this is sort of a hack. there's at least one place (formatting
floats) where some stringlib code takes a different path if it's
compiled as unicode. */

View file

@ -1,3 +1,4 @@
/* clang-format off */
#undef FASTSEARCH
#undef STRINGLIB
#undef STRINGLIB_SIZEOF_CHAR

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
unicode_format.h -- implementation of str.format().
*/

View file

@ -1,3 +1,4 @@
/* clang-format off */
#ifndef STRINGLIB_UNICODEDEFS_H
#define STRINGLIB_UNICODEDEFS_H

View file

@ -1,9 +1,10 @@
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
/* Implementation helper: a struct that looks like a tuple. See timemodule
and posixmodule for example uses. */
#include "Python.h"
#include "structmember.h"
static const char visible_length_key[] = "n_sequence_fields";
static const char real_length_key[] = "n_fields";
static const char unnamed_fields_key[] = "n_unnamed_fields";

View file

@ -1,9 +1,9 @@
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/accu.h"
/* Tuple object implementation */
#include "Python.h"
#include "accu.h"
/* Speed optimization to avoid frequent malloc/free of small tuples */
#ifndef PyTuple_MAXSAVESIZE
#define PyTuple_MAXSAVESIZE 20 /* Largest tuple to save on free list */

View file

@ -1,12 +1,10 @@
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/frameobject.h"
#include "third_party/python/Include/structmember.h"
/* Type object implementation */
#include "Python.h"
#include "frameobject.h"
#include "structmember.h"
#include <ctype.h>
/* Support type attribute cache */
/* The cache can keep references to the names alive for longer than

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* Generated by typeslots.py */
0,
0,

View file

@ -1,3 +1,6 @@
/* clang-format off */
#include "third_party/python/Include/Python.h"
/*
Unicode character type helpers.
@ -8,8 +11,6 @@
*/
#include "Python.h"
#define ALPHA_MASK 0x01
#define DECIMAL_MASK 0x02
#define DIGIT_MASK 0x04
@ -40,7 +41,7 @@ typedef struct {
const unsigned short flags;
} _PyUnicode_TypeRecord;
#include "unicodetype_db.h"
#include "third_party/python/Objects/unicodetype_db.inc"
static const _PyUnicode_TypeRecord *
gettyperecord(Py_UCS4 code)

View file

@ -1,3 +1,10 @@
/* clang-format off */
#define PY_SSIZE_T_CLEAN
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/ucnhash.h"
#include "third_party/python/Include/bytes_methods.h"
#include "third_party/python/Objects/stringlib/eq.inc"
/*
Unicode implementation based on original code by Fredrik Lundh,
@ -38,16 +45,6 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#define PY_SSIZE_T_CLEAN
#include "Python.h"
#include "ucnhash.h"
#include "bytes_methods.h"
#include "stringlib/eq.h"
#ifdef MS_WINDOWS
#include <windows.h>
#endif
/*[clinic input]
class str "PyUnicodeObject *" "&PyUnicode_Type"
[clinic start generated code]*/
@ -332,7 +329,7 @@ static const unsigned char ascii_linebreak[] = {
0, 0, 0, 0, 0, 0, 0, 0
};
#include "clinic/unicodeobject.c.h"
#include "third_party/python/Objects/clinic/unicodeobject.inc"
typedef enum {
_Py_ERROR_UNKNOWN=0,
@ -805,50 +802,50 @@ ensure_unicode(PyObject *obj)
/* Compilation of templated routines */
#include "stringlib/asciilib.h"
#include "stringlib/fastsearch.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/asciilib.inc"
#include "third_party/python/Objects/stringlib/fastsearch.inc"
#include "third_party/python/Objects/stringlib/partition.inc"
#include "third_party/python/Objects/stringlib/split.inc"
#include "third_party/python/Objects/stringlib/count.inc"
#include "third_party/python/Objects/stringlib/find.inc"
#include "third_party/python/Objects/stringlib/find_max_char.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
#include "stringlib/ucs1lib.h"
#include "stringlib/fastsearch.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/ucs1lib.inc"
#include "third_party/python/Objects/stringlib/fastsearch.inc"
#include "third_party/python/Objects/stringlib/partition.inc"
#include "third_party/python/Objects/stringlib/split.inc"
#include "third_party/python/Objects/stringlib/count.inc"
#include "third_party/python/Objects/stringlib/find.inc"
#include "third_party/python/Objects/stringlib/replace.inc"
#include "third_party/python/Objects/stringlib/find_max_char.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
#include "stringlib/ucs2lib.h"
#include "stringlib/fastsearch.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/ucs2lib.inc"
#include "third_party/python/Objects/stringlib/fastsearch.inc"
#include "third_party/python/Objects/stringlib/partition.inc"
#include "third_party/python/Objects/stringlib/split.inc"
#include "third_party/python/Objects/stringlib/count.inc"
#include "third_party/python/Objects/stringlib/find.inc"
#include "third_party/python/Objects/stringlib/replace.inc"
#include "third_party/python/Objects/stringlib/find_max_char.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
#include "stringlib/ucs4lib.h"
#include "stringlib/fastsearch.h"
#include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/replace.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/ucs4lib.inc"
#include "third_party/python/Objects/stringlib/fastsearch.inc"
#include "third_party/python/Objects/stringlib/partition.inc"
#include "third_party/python/Objects/stringlib/split.inc"
#include "third_party/python/Objects/stringlib/count.inc"
#include "third_party/python/Objects/stringlib/find.inc"
#include "third_party/python/Objects/stringlib/replace.inc"
#include "third_party/python/Objects/stringlib/find_max_char.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
#include "stringlib/unicodedefs.h"
#include "stringlib/fastsearch.h"
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/unicodedefs.inc"
#include "third_party/python/Objects/stringlib/fastsearch.inc"
#include "third_party/python/Objects/stringlib/count.inc"
#include "third_party/python/Objects/stringlib/find.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
/* --- Unicode Object ----------------------------------------------------- */
@ -4971,21 +4968,21 @@ PyUnicode_DecodeUTF8(const char *s,
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
#include "stringlib/asciilib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/asciilib.inc"
#include "third_party/python/Objects/stringlib/codecs.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
#include "stringlib/ucs1lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/ucs1lib.inc"
#include "third_party/python/Objects/stringlib/codecs.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
#include "stringlib/ucs2lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/ucs2lib.inc"
#include "third_party/python/Objects/stringlib/codecs.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
#include "stringlib/ucs4lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"
#include "third_party/python/Objects/stringlib/ucs4lib.inc"
#include "third_party/python/Objects/stringlib/codecs.inc"
#include "third_party/python/Objects/stringlib/undef.inc"
/* Mask to quickly check whether a C 'long' contains a
non-ASCII, UTF8-encoded char. */
@ -9436,7 +9433,7 @@ any_find_slice(PyObject* s1, PyObject* s2,
}
/* _PyUnicode_InsertThousandsGrouping() helper functions */
#include "stringlib/localeutil.h"
#include "third_party/python/Objects/stringlib/localeutil.inc"
/**
* InsertThousandsGrouping:
@ -13891,7 +13888,7 @@ _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Py_CLEAR(writer->buffer);
}
#include "stringlib/unicode_format.h"
#include "third_party/python/Objects/stringlib/unicode_format.inc"
PyDoc_STRVAR(format__doc__,
"S.format(*args, **kwargs) -> str\n\

View file

@ -1,3 +1,4 @@
/* clang-format off */
/* this file was generated by Tools/unicode/makeunicodedata.py 3.2 */
/* a list of unique character type descriptors */

View file

@ -1,6 +1,6 @@
#include "Python.h"
#include "structmember.h"
/* clang-format off */
#include "third_party/python/Include/Python.h"
#include "third_party/python/Include/structmember.h"
#define GET_WEAKREFS_LISTPTR(o) \
((PyWeakReference **) PyObject_GET_WEAKREFS_LISTPTR(o))