Make numerous improvements

- Python static hello world now 1.8mb - Python static fully loaded now 10mb - Python HTTPS client now uses MbedTLS - Python REPL now completes import stmts - Increase stack size for Python for now - Begin synthesizing posixpath and ntpath - Restore Python \N{UNICODE NAME} support - Restore Python NFKD symbol normalization - Add optimized code path for Intel SHA-NI - Get more Python unit tests passing faster - Get Python help() pagination working on NT - Python hashlib now supports MbedTLS PBKDF2 - Make memcpy/memmove/memcmp/bcmp/etc. faster - Add Mersenne Twister and Vigna to LIBC_RAND - Provide privileged __printf() for error code - Fix zipos opendir() so that it reports ENOTDIR - Add basic chmod() implementation for Windows NT - Add Cosmo's best functions to Python cosmo module - Pin function trace indent depth to that of caller - Show memory diagram on invalid access in MODE=dbg - Differentiate stack overflow on crash in MODE=dbg - Add stb_truetype and tools for analyzing font files - Upgrade to UNICODE 13 and reduce its binary footprint - COMPILE.COM now logs resource usage of build commands - Start implementing basic poll() support on bare metal - Set getauxval(AT_EXECFN) to GetModuleFileName() on NT - Add descriptions to strerror() in non-TINY build modes - Add COUNTBRANCH() macro to help with micro-optimizations - Make error / backtrace / asan / memory code more unbreakable - Add fast perfect C implementation of μ-Law and a-Law audio codecs - Make strtol() functions consistent with other libc implementations - Improve Linenoise implementation (see also github.com/jart/bestline) - COMPILE.COM now suppresses stdout/stderr of successful build commands
2025-10-22 01:15:58 +00:00 · 2021-09-27 22:58:51 -07:00 · 2021-09-27 22:58:51 -07:00 · 39bf41f4eb
commit 39bf41f4eb
parent fa7b4f5bd1
806 changed files with 77494 additions and 63859 deletions
--- a/third_party/python/Modules/_decimal/libmpdec/fourstep.c
+++ b/third_party/python/Modules/_decimal/libmpdec/fourstep.c
@ -40,12 +40,261 @@ libmpdec (BSD-2)\\n\
 Copyright 2008-2016 Stefan Krah\"");
 asm(".include \"libc/disclaimer.inc\"");

-
-/* Bignum: Cache efficient Matrix Fourier Transform for arrays of the
-   form 3 * 2**n (See literature/matrix-transform.txt). */
+/*
+                 Cache Efficient Matrix Fourier Transform
+                        for arrays of form 3×2ⁿ


-#ifndef PPRO
+The Matrix Fourier Transform
+════════════════════════════
+
+In libmpdec, the Matrix Fourier Transform [1] is called four-step
+transform after a variant that appears in [2]. The algorithm requires
+that the input array can be viewed as an R*C matrix.
+
+All operations are done modulo p. For readability, the proofs drop all
+instances of (mod p).
+
+
+Algorithm four-step (forward transform)
+───────────────────────────────────────
+
+  a := input array
+  d := len(a) = R * C
+  p := prime
+  w := primitive root of unity of the prime field
+  r := w**((p-1)/d)
+  A := output array
+
+  1) Apply a length R FNT to each column.
+
+  2) Multiply each matrix element (addressed by j*C+m) by r**(j*m).
+
+  3) Apply a length C FNT to each row.
+
+  4) Transpose the matrix.
+
+
+Proof (forward transform)
+─────────────────────────
+
+  The algorithm can be derived starting from the regular definition of
+  the finite-field transform of length d:
+
+            d-1
+           ,────
+           \
+    A[k] =  |  a[l]  × r**(k × l)
+           /
+           `────
+           l = 0
+
+
+  The sum can be rearranged into the sum of the sums of columns:
+
+            C-1     R-1
+           ,────   ,────
+           \       \
+         =  |       |  a[i × C + j] × r**(k × (i × C + j))
+           /       /
+           `────   `────
+           j = 0   i = 0
+
+
+  Extracting a constant from the inner sum:
+
+            C-1           R-1
+           ,────         ,────
+           \             \
+         =  |  rᵏ×j    ×  |  a[i × C + j] × r**(k × i × C)
+           /             /
+           `────         `────
+           j = 0         i = 0
+
+
+  Without any loss of generality, let k = n × R + m,
+  where n < C and m < R:
+
+                C-1                          R-1
+               ,────                        ,────
+               \                            \
+    A[n×R+m] =  |  r**(R×n×j) × r**(m×j)  ×  |  a[i×C+j] × r**(R×C×n×i) × r**(C×m×i)
+               /                            /
+               `────                        `────
+               j = 0                        i = 0
+
+
+  Since r = w ** ((p-1) / (R×C)):
+
+     a) r**(R×C×n×i) = w**((p-1)×n×i) = 1
+
+     b) r**(C×m×i) = w**((p-1) / R) ** (m×i) = r_R ** (m×i)
+
+     c) r**(R×n×j) = w**((p-1) / C) ** (n×j) = r_C ** (n×j)
+
+     r_R := root of the subfield of length R.
+     r_C := root of the subfield of length C.
+
+
+                C-1                             R-1
+               ,────                           ,────
+               \                               \
+    A[n×R+m] =  |  r_C**(n×j) × [ r**(m×j)  ×   |  a[i×C+j] × r_R**(m×i) ]
+               /                     ^         /
+               `────                 |         `────    1) transform the columns
+               j = 0                 |         i = 0
+                 ^                   |
+                 |                   `-- 2) multiply
+                 |
+                 `-- 3) transform the rows
+
+
+   Note that the entire RHS is a function of n and m and that the results
+   for each pair (n, m) are stored in Fortran order.
+
+   Let the term in square brackets be 𝑓(m, j). Step 1) and 2) precalculate
+   the term for all (m, j). After that, the original matrix is now a lookup
+   table with the mth element in the jth column at location m × C + j.
+
+   Let the complete RHS be g(m, n). Step 3) does an in-place transform of
+   length n on all rows. After that, the original matrix is now a lookup
+   table with the mth element in the nth column at location m × C + n.
+
+   But each (m, n) pair should be written to location n × R + m. Therefore,
+   step 4) transposes the result of step 3).
+
+
+
+Algorithm four-step (inverse transform)
+───────────────────────────────────────
+
+  A  := input array
+  d  := len(A) = R × C
+  p  := prime
+  d′ := d⁽ᵖ⁻²⁾               # inverse of d
+  w  := primitive root of unity of the prime field
+  r  := w**((p-1)/d)         # root of the subfield
+  r′ := w**((p-1) - (p-1)/d) # inverse of r
+  a  := output array
+
+  0) View the matrix as a C×R matrix.
+
+  1) Transpose the matrix, producing an R×C matrix.
+
+  2) Apply a length C FNT to each row.
+
+  3) Multiply each matrix element (addressed by i×C+n) by r**(i×n).
+
+  4) Apply a length R FNT to each column.
+
+
+Proof (inverse transform)
+─────────────────────────
+
+  The algorithm can be derived starting from the regular definition of
+  the finite-field inverse transform of length d:
+
+                  d-1
+                 ,────
+                 \
+    a[k] =  d′ ×  |  A[l]  × r′ ** (k × l)
+                 /
+                 `────
+                 l = 0
+
+
+  The sum can be rearranged into the sum of the sums of columns. Note
+  that at this stage we still have a C*R matrix, so C denotes the number
+  of rows:
+
+                  R-1     C-1
+                 ,────   ,────
+                 \       \
+         =  d′ ×  |       |  a[j × R + i] × r′ ** (k × (j × R + i))
+                 /       /
+                 `────   `────
+                 i = 0   j = 0
+
+
+  Extracting a constant from the inner sum:
+
+                  R-1                C-1
+                 ,────              ,────
+                 \                  \
+         =  d′ ×  |  r′ ** (k×i)  ×  |  a[j × R + i] × r′ ** (k × j × R)
+                 /                  /
+                 `────              `────
+                 i = 0              j = 0
+
+
+  Without any loss of generality, let k = m * C + n,
+  where m < R and n < C:
+
+                     R-1                                  C-1
+                    ,────                                ,────
+                    \                                    \
+    A[m×C+n] = d′ ×  |  r′ ** (C×m×i) ×  r′ ** (n×i)   ×  |  a[j×R+i] × r′ ** (R×C×m×j) × r′ ** (R×n×j)
+                    /                                    /
+                    `────                                `────
+                    i = 0                                j = 0
+
+
+  Since r′ = w**((p-1) - (p-1)/d) and d = R×C:
+
+     a) r′ ** (R×C×m×j) = w**((p-1)×R×C×m×j - (p-1)×m×j) = 1
+
+     b) r′ ** (C×m×i) = w**((p-1)×C - (p-1)/R) ** (m×i) = r_R′ ** (m×i)
+
+     c) r′ ** (R×n×j) = r_C′ ** (n×j)
+
+     d) d′ = d⁽ᵖ⁻²⁾ = (R×C)⁽ᵖ⁻²⁾ = R⁽ᵖ⁻²⁾ × C⁽ᵖ⁻²⁾ = R′ × C′
+
+     r_R′ := inverse of the root of the subfield of length R.
+     r_C′ := inverse of the root of the subfield of length C.
+     R′   := inverse of R
+     C′   := inverse of C
+
+
+                     R-1                                      C-1
+                    ,────                                    ,────  2) transform the rows of a^T
+                    \                                        \
+    A[m×C+n] = R′ ×  |  r_R′ ** (m×i) × [ r′ ** (n×i) × C′ ×  |  a[j×R+i] × r_C′ ** (n×j) ]
+                    /                           ^            /       ^
+                    `────                       |            `────   |
+                    i = 0                       |            j = 0   |
+                      ^                         |                    `── 1) Transpose input matrix
+                      |                         `── 3) multiply             to address elements by
+                      |                                                     i × C + j
+                      `── 3) transform the columns
+
+
+
+   Note that the entire RHS is a function of m and n and that the results
+   for each pair (m, n) are stored in C order.
+
+   Let the term in square brackets be 𝑓(n, i). Without step 1), the sum
+   would perform a length C transform on the columns of the input matrix.
+   This is a) inefficient and b) the results are needed in C order, so
+   step 1) exchanges rows and columns.
+
+   Step 2) and 3) precalculate 𝑓(n, i) for all (n, i). After that, the
+   original matrix is now a lookup table with the ith element in the nth
+   column at location i × C + n.
+
+   Let the complete RHS be g(m, n). Step 4) does an in-place transform of
+   length m on all columns. After that, the original matrix is now a lookup
+   table with the mth element in the nth column at location m × C + n,
+   which means that all A[k] = A[m × C + n] are in the correct order.
+
+
+──
+
+  [1] Joerg Arndt: "Matters Computational"
+      http://www.jjj.de/fxt/
+  [2] David H. Bailey: FFTs in External or Hierarchical Memory
+      http://crd.lbl.gov/~dhbailey/dhbpapers/
+*/
+
 static inline void
 std_size3_ntt(mpd_uint_t *x1, mpd_uint_t *x2, mpd_uint_t *x3,
              mpd_uint_t w3table[3], mpd_uint_t umod)
@ -53,90 +302,32 @@ std_size3_ntt(mpd_uint_t *x1, mpd_uint_t *x2, mpd_uint_t *x3,
    mpd_uint_t r1, r2;
    mpd_uint_t w;
    mpd_uint_t s, tmp;
-
-
    /* k = 0 -> w = 1 */
    s = *x1;
    s = addmod(s, *x2, umod);
    s = addmod(s, *x3, umod);
-
    r1 = s;
-
    /* k = 1 */
    s = *x1;
-
    w = w3table[1];
    tmp = MULMOD(*x2, w);
    s = addmod(s, tmp, umod);
-
    w = w3table[2];
    tmp = MULMOD(*x3, w);
    s = addmod(s, tmp, umod);
-
    r2 = s;
-
    /* k = 2 */
    s = *x1;
-
    w = w3table[2];
    tmp = MULMOD(*x2, w);
    s = addmod(s, tmp, umod);
-
    w = w3table[1];
    tmp = MULMOD(*x3, w);
    s = addmod(s, tmp, umod);
-
    *x3 = s;
    *x2 = r2;
    *x1 = r1;
 }
-#else /* PPRO */
-static inline void
-ppro_size3_ntt(mpd_uint_t *x1, mpd_uint_t *x2, mpd_uint_t *x3, mpd_uint_t w3table[3],
-               mpd_uint_t umod, double *dmod, uint32_t dinvmod[3])
-{
-    mpd_uint_t r1, r2;
-    mpd_uint_t w;
-    mpd_uint_t s, tmp;
-
-
-    /* k = 0 -> w = 1 */
-    s = *x1;
-    s = addmod(s, *x2, umod);
-    s = addmod(s, *x3, umod);
-
-    r1 = s;
-
-    /* k = 1 */
-    s = *x1;
-
-    w = w3table[1];
-    tmp = ppro_mulmod(*x2, w, dmod, dinvmod);
-    s = addmod(s, tmp, umod);
-
-    w = w3table[2];
-    tmp = ppro_mulmod(*x3, w, dmod, dinvmod);
-    s = addmod(s, tmp, umod);
-
-    r2 = s;
-
-    /* k = 2 */
-    s = *x1;
-
-    w = w3table[2];
-    tmp = ppro_mulmod(*x2, w, dmod, dinvmod);
-    s = addmod(s, tmp, umod);
-
-    w = w3table[1];
-    tmp = ppro_mulmod(*x3, w, dmod, dinvmod);
-    s = addmod(s, tmp, umod);
-
-    *x3 = s;
-    *x2 = r2;
-    *x1 = r1;
-}
-#endif
-

 /* forward transform, sign = -1; transform length = 3 * 2**n */
 int
@ -148,25 +339,15 @@ four_step_fnt(mpd_uint_t *a, mpd_size_t n, int modnum)
    mpd_uint_t kernel, w0, w1, wstep;
    mpd_uint_t *s, *p0, *p1, *p2;
    mpd_uint_t umod;
-#ifdef PPRO
-    double dmod;
-    uint32_t dinvmod[3];
-#endif
    mpd_size_t i, k;
-
-
    assert(n >= 48);
    assert(n <= 3*MPD_MAXTRANSFORM_2N);
-
-
    /* Length R transform on the columns. */
    SETMODULUS(modnum);
    _mpd_init_w3table(w3table, -1, modnum);
    for (p0=a, p1=p0+C, p2=p0+2*C; p0<a+C; p0++,p1++,p2++) {
-
        SIZE3_NTT(p0, p1, p2, w3table);
    }
-
    /* Multiply each matrix element (addressed by i*C+k) by r**(i*k). */
    kernel = _mpd_getkernel(n, -1, modnum);
    for (i = 1; i < R; i++) {
@ -182,20 +363,17 @@ four_step_fnt(mpd_uint_t *a, mpd_size_t n, int modnum)
            a[i*C+k+1] = x1;
        }
    }
-
    /* Length C transform on the rows. */
    for (s = a; s < a+n; s += C) {
        if (!six_step_fnt(s, C, modnum)) {
            return 0;
        }
    }
-
 #if 0
    /* An unordered transform is sufficient for convolution. */
    /* Transpose the matrix. */
    transpose_3xpow2(a, R, C);
 #endif
-
    return 1;
 }

@ -209,30 +387,20 @@ inv_four_step_fnt(mpd_uint_t *a, mpd_size_t n, int modnum)
    mpd_uint_t kernel, w0, w1, wstep;
    mpd_uint_t *s, *p0, *p1, *p2;
    mpd_uint_t umod;
-#ifdef PPRO
-    double dmod;
-    uint32_t dinvmod[3];
-#endif
    mpd_size_t i, k;
-
-
    assert(n >= 48);
    assert(n <= 3*MPD_MAXTRANSFORM_2N);
-
-
 #if 0
    /* An unordered transform is sufficient for convolution. */
    /* Transpose the matrix, producing an R*C matrix. */
    transpose_3xpow2(a, C, R);
 #endif
-
    /* Length C transform on the rows. */
    for (s = a; s < a+n; s += C) {
        if (!inv_six_step_fnt(s, C, modnum)) {
            return 0;
        }
    }
-
    /* Multiply each matrix element (addressed by i*C+k) by r**(i*k). */
    SETMODULUS(modnum);
    kernel = _mpd_getkernel(n, 1, modnum);
@ -249,13 +417,10 @@ inv_four_step_fnt(mpd_uint_t *a, mpd_size_t n, int modnum)
            a[i*C+k+1] = x1;
        }
    }
-
    /* Length R transform on the columns. */
    _mpd_init_w3table(w3table, 1, modnum);
    for (p0=a, p1=p0+C, p2=p0+2*C; p0<a+C; p0++,p1++,p2++) {
-
        SIZE3_NTT(p0, p1, p2, w3table);
    }
-
    return 1;
 }