Perform build and magnum tuning

Building o//third_party/python now takes 5 seconds on my PC

This change works towards modifying Python to use runtime dispatching
when appropriate. For example, when loading the magnums in the socket
module, it's a good idea to check if the magnum is zero, because that
means the local system platform doesn't support it.
This commit is contained in:
Justine Tunney 2021-08-10 10:26:13 -07:00
parent ee7e296339
commit d26d7ae0e4
1028 changed files with 6576 additions and 172777 deletions

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -27,10 +28,6 @@
#include "mpdecimal.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "constants.h"
#include "typearith.h"
#include "basearith.h"

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -31,7 +32,6 @@
#include "mpdecimal.h"
#include <stdio.h>
#include "typearith.h"

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -31,7 +32,6 @@
#include "mpdecimal.h"
#include <stdio.h>
/* Check if n is a power of 2. */

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -27,7 +28,6 @@
#include "mpdecimal.h"
#include <stdio.h>
#include "constants.h"

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -26,10 +27,9 @@
*/
#include "libc/calls/calls.h"
#include "libc/sysv/consts/sig.h"
#include "mpdecimal.h"
#include <stdio.h>
#include <string.h>
#include <signal.h>
void

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -27,7 +28,6 @@
#include "mpdecimal.h"
#include <stdio.h>
#include "bits.h"
#include "constants.h"
#include "fnt.h"

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -31,7 +32,6 @@
#include "mpdecimal.h"
#include <stdio.h>
/* Internal header file: all symbols have local scope in the DSO */

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -27,8 +28,6 @@
#include "mpdecimal.h"
#include <stdio.h>
#include <assert.h>
#include "numbertheory.h"
#include "umodarith.h"
#include "crt.h"

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -31,7 +32,6 @@
#include "mpdecimal.h"
#include <stdio.h>
/* Internal header file: all symbols have local scope in the DSO */

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -27,8 +28,6 @@
#include "mpdecimal.h"
#include <stdio.h>
#include <assert.h>
#include "bits.h"
#include "numbertheory.h"
#include "umodarith.h"

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -31,7 +32,6 @@
#include "mpdecimal.h"
#include <stdio.h>
#include "numbertheory.h"

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -27,12 +28,10 @@
#include "mpdecimal.h"
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "bits.h"
#include "difradix2.h"
#include "numbertheory.h"
#include "libc/assert.h"
#include "fnt.h"

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -31,7 +32,6 @@
#include "mpdecimal.h"
#include <stdio.h>
/* Internal header file: all symbols have local scope in the DSO */

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -27,7 +28,6 @@
#include "mpdecimal.h"
#include <assert.h>
#include "numbertheory.h"
#include "sixstep.h"
#include "transpose.h"

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -31,7 +32,6 @@
#include "mpdecimal.h"
#include <stdio.h>
/* Internal header file: all symbols have local scope in the DSO */

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -27,17 +28,12 @@
#include "mpdecimal.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <assert.h>
#include <errno.h>
#include "libc/unicode/locale.h"
#include "bits.h"
#include "constants.h"
#include "typearith.h"
#include "libc/errno.h"
#include "libc/fmt/fmt.h"
#include "io.h"

View file

@ -1,38 +1,7 @@
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef IO_H
#define IO_H
#include <errno.h>
#include "mpdecimal.h"
/* clang-format off */
#if SIZE_MAX == MPD_SIZE_MAX
#define mpd_strtossize _mpd_strtossize
@ -55,5 +24,4 @@ mpd_strtossize(const char *s, char **end, int base)
}
#endif
#endif

View file

@ -1,51 +0,0 @@
This document contains links to the literature used in the process of
creating the library. The list is probably not complete.
Mike Cowlishaw: General Decimal Arithmetic Specification
http://speleotrove.com/decimal/decarith.html
Jean-Michel Muller: On the definition of ulp (x)
lara.inist.fr/bitstream/2332/518/1/LIP-RR2005-09.pdf
T. E. Hull, A. Abrham: Properly rounded variable precision square root
http://portal.acm.org/citation.cfm?id=214413
T. E. Hull, A. Abrham: Variable precision exponential function
http://portal.acm.org/citation.cfm?id=6498
Roman E. Maeder: Storage allocation for the Karatsuba integer multiplication
algorithm. http://www.springerlink.com/content/w15058mj6v59t565/
J. M. Pollard: The fast Fourier transform in a finite field
http://www.ams.org/journals/mcom/1971-25-114/S0025-5718-1971-0301966-0/home.html
David H. Bailey: FFTs in External or Hierarchical Memory
http://crd.lbl.gov/~dhbailey/dhbpapers/
W. Morven Gentleman: Matrix Multiplication and Fast Fourier Transforms
http://www.alcatel-lucent.com/bstj/vol47-1968/articles/bstj47-6-1099.pdf
Mikko Tommila: Apfloat documentation
http://www.apfloat.org/apfloat/2.41/apfloat.pdf
Joerg Arndt: "Matters Computational"
http://www.jjj.de/fxt/
Karl Hasselstrom: Fast Division of Large Integers
www.treskal.com/kalle/exjobb/original-report.pdf

View file

@ -1,83 +0,0 @@
Bignum support (Fast Number Theoretic Transform or FNT):
========================================================
Bignum arithmetic in libmpdec uses the scheme for fast convolution
of integer sequences from:
J. M. Pollard: The fast Fourier transform in a finite field
http://www.ams.org/journals/mcom/1971-25-114/S0025-5718-1971-0301966-0/home.html
The transform in a finite field can be used for convolution in the same
way as the Fourier Transform. The main advantages of the Number Theoretic
Transform are that it is both exact and very memory efficient.
Convolution in pseudo-code:
~~~~~~~~~~~~~~~~~~~~~~~~~~~
fnt_convolute(a, b):
x = fnt(a) # forward transform of a
y = fnt(b) # forward transform of b
z = pairwise multiply x[i] and y[i]
result = inv_fnt(z) # backward transform of z.
Extending the maximum transform length (Chinese Remainder Theorem):
-------------------------------------------------------------------
The maximum transform length is quite limited when using a single
prime field. However, it is possible to use multiple primes and
recover the result using the Chinese Remainder Theorem.
Multiplication in pseudo-code:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
_mpd_fntmul(u, v):
c1 = fnt_convolute(u, v, P1) # convolute modulo prime1
c2 = fnt_convolute(u, v, P2) # convolute modulo prime2
c3 = fnt_convolute(u, v, P3) # convolute modulo prime3
result = crt3(c1, c2, c3) # Chinese Remainder Theorem
Optimized transform functions:
------------------------------
There are three different fnt() functions:
std_fnt: "standard" decimation in frequency transform for array lengths
of 2**n. Performs well up to 1024 words.
sixstep: Cache-friendly algorithm for array lengths of 2**n. Outperforms
std_fnt for large arrays.
fourstep: Algorithm for array lengths of 3 * 2**n. Also cache friendly
in large parts.
List of bignum-only files:
--------------------------
Functions from these files are only used in _mpd_fntmul().
umodarith.h -> fast low level routines for unsigned modular arithmetic
numbertheory.c -> routines for setting up the FNT
difradix2.c -> decimation in frequency transform, used as the
"base case" by the following three files:
fnt.c -> standard transform for smaller arrays
sixstep.c -> transform large arrays of length 2**n
fourstep.c -> transform arrays of length 3 * 2**n
convolute.c -> do the actual fast convolution, using one of
the three transform functions.
transpose.c -> transpositions needed for the sixstep algorithm.
crt.c -> Chinese Remainder Theorem: use information from three
transforms modulo three different primes to get the
final result.

View file

@ -1,208 +0,0 @@
#
# Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
######################################################################
# This file lists and checks some of the constants and limits used #
# in libmpdec's Number Theoretic Transform. At the end of the file #
# there is an example function for the plain DFT transform. #
######################################################################
#
# Number theoretic transforms are done in subfields of F(p). P[i]
# are the primes, D[i] = P[i] - 1 are highly composite and w[i]
# are the respective primitive roots of F(p).
#
# The strategy is to convolute two coefficients modulo all three
# primes, then use the Chinese Remainder Theorem on the three
# result arrays to recover the result in the usual base RADIX
# form.
#
# ======================================================================
# Primitive roots
# ======================================================================
#
# Verify primitive roots:
#
# For a prime field, r is a primitive root if and only if for all prime
# factors f of p-1, r**((p-1)/f) =/= 1 (mod p).
#
def prod(F, E):
"""Check that the factorization of P-1 is correct. F is the list of
factors of P-1, E lists the number of occurrences of each factor."""
x = 1
for y, z in zip(F, E):
x *= y**z
return x
def is_primitive_root(r, p, factors, exponents):
"""Check if r is a primitive root of F(p)."""
if p != prod(factors, exponents) + 1:
return False
for f in factors:
q, control = divmod(p-1, f)
if control != 0:
return False
if pow(r, q, p) == 1:
return False
return True
# =================================================================
# Constants and limits for the 64-bit version
# =================================================================
RADIX = 10**19
# Primes P1, P2 and P3:
P = [2**64-2**32+1, 2**64-2**34+1, 2**64-2**40+1]
# P-1, highly composite. The transform length d is variable and
# must divide D = P-1. Since all D are divisible by 3 * 2**32,
# transform lengths can be 2**n or 3 * 2**n (where n <= 32).
D = [2**32 * 3 * (5 * 17 * 257 * 65537),
2**34 * 3**2 * (7 * 11 * 31 * 151 * 331),
2**40 * 3**2 * (5 * 7 * 13 * 17 * 241)]
# Prime factors of P-1 and their exponents:
F = [(2,3,5,17,257,65537), (2,3,7,11,31,151,331), (2,3,5,7,13,17,241)]
E = [(32,1,1,1,1,1), (34,2,1,1,1,1,1), (40,2,1,1,1,1,1)]
# Maximum transform length for 2**n. Above that only 3 * 2**31
# or 3 * 2**32 are possible.
MPD_MAXTRANSFORM_2N = 2**32
# Limits in the terminology of Pollard's paper:
m2 = (MPD_MAXTRANSFORM_2N * 3) // 2 # Maximum length of the smaller array.
M1 = M2 = RADIX-1 # Maximum value per single word.
L = m2 * M1 * M2
P[0] * P[1] * P[2] > 2 * L
# Primitive roots of F(P1), F(P2) and F(P3):
w = [7, 10, 19]
# The primitive roots are correct:
for i in range(3):
if not is_primitive_root(w[i], P[i], F[i], E[i]):
print("FAIL")
# =================================================================
# Constants and limits for the 32-bit version
# =================================================================
RADIX = 10**9
# Primes P1, P2 and P3:
P = [2113929217, 2013265921, 1811939329]
# P-1, highly composite. All D = P-1 are divisible by 3 * 2**25,
# allowing for transform lengths up to 3 * 2**25 words.
D = [2**25 * 3**2 * 7,
2**27 * 3 * 5,
2**26 * 3**3]
# Prime factors of P-1 and their exponents:
F = [(2,3,7), (2,3,5), (2,3)]
E = [(25,2,1), (27,1,1), (26,3)]
# Maximum transform length for 2**n. Above that only 3 * 2**24 or
# 3 * 2**25 are possible.
MPD_MAXTRANSFORM_2N = 2**25
# Limits in the terminology of Pollard's paper:
m2 = (MPD_MAXTRANSFORM_2N * 3) // 2 # Maximum length of the smaller array.
M1 = M2 = RADIX-1 # Maximum value per single word.
L = m2 * M1 * M2
P[0] * P[1] * P[2] > 2 * L
# Primitive roots of F(P1), F(P2) and F(P3):
w = [5, 31, 13]
# The primitive roots are correct:
for i in range(3):
if not is_primitive_root(w[i], P[i], F[i], E[i]):
print("FAIL")
# ======================================================================
# Example transform using a single prime
# ======================================================================
def ntt(lst, dir):
"""Perform a transform on the elements of lst. len(lst) must
be 2**n or 3 * 2**n, where n <= 25. This is the slow DFT."""
p = 2113929217 # prime
d = len(lst) # transform length
d_prime = pow(d, (p-2), p) # inverse of d
xi = (p-1)//d
w = 5 # primitive root of F(p)
r = pow(w, xi, p) # primitive root of the subfield
r_prime = pow(w, (p-1-xi), p) # inverse of r
if dir == 1: # forward transform
a = lst # input array
A = [0] * d # transformed values
for i in range(d):
s = 0
for j in range(d):
s += a[j] * pow(r, i*j, p)
A[i] = s % p
return A
elif dir == -1: # backward transform
A = lst # input array
a = [0] * d # transformed values
for j in range(d):
s = 0
for i in range(d):
s += A[i] * pow(r_prime, i*j, p)
a[j] = (d_prime * s) % p
return a
def ntt_convolute(a, b):
"""convolute arrays a and b."""
assert(len(a) == len(b))
x = ntt(a, 1)
y = ntt(b, 1)
for i in range(len(a)):
y[i] = y[i] * x[i]
r = ntt(y, -1)
return r
# Example: Two arrays representing 21 and 81 in little-endian:
a = [1, 2, 0, 0]
b = [1, 8, 0, 0]
assert(ntt_convolute(a, b) == [1, 10, 16, 0])
assert(21 * 81 == (1*10**0 + 10*10**1 + 16*10**2 + 0*10**3))

View file

@ -1,256 +0,0 @@
(* Copyright (c) 2011 Stefan Krah. All rights reserved. *)
The Matrix Fourier Transform:
=============================
In libmpdec, the Matrix Fourier Transform [1] is called four-step transform
after a variant that appears in [2]. The algorithm requires that the input
array can be viewed as an R*C matrix.
All operations are done modulo p. For readability, the proofs drop all
instances of (mod p).
Algorithm four-step (forward transform):
----------------------------------------
a := input array
d := len(a) = R * C
p := prime
w := primitive root of unity of the prime field
r := w**((p-1)/d)
A := output array
1) Apply a length R FNT to each column.
2) Multiply each matrix element (addressed by j*C+m) by r**(j*m).
3) Apply a length C FNT to each row.
4) Transpose the matrix.
Proof (forward transform):
--------------------------
The algorithm can be derived starting from the regular definition of
the finite-field transform of length d:
d-1
,----
\
A[k] = | a[l] * r**(k * l)
/
`----
l = 0
The sum can be rearranged into the sum of the sums of columns:
C-1 R-1
,---- ,----
\ \
= | | a[i * C + j] * r**(k * (i * C + j))
/ /
`---- `----
j = 0 i = 0
Extracting a constant from the inner sum:
C-1 R-1
,---- ,----
\ \
= | r**k*j * | a[i * C + j] * r**(k * i * C)
/ /
`---- `----
j = 0 i = 0
Without any loss of generality, let k = n * R + m,
where n < C and m < R:
C-1 R-1
,---- ,----
\ \
A[n*R+m] = | r**(R*n*j) * r**(m*j) * | a[i*C+j] * r**(R*C*n*i) * r**(C*m*i)
/ /
`---- `----
j = 0 i = 0
Since r = w ** ((p-1) / (R*C)):
a) r**(R*C*n*i) = w**((p-1)*n*i) = 1
b) r**(C*m*i) = w**((p-1) / R) ** (m*i) = r_R ** (m*i)
c) r**(R*n*j) = w**((p-1) / C) ** (n*j) = r_C ** (n*j)
r_R := root of the subfield of length R.
r_C := root of the subfield of length C.
C-1 R-1
,---- ,----
\ \
A[n*R+m] = | r_C**(n*j) * [ r**(m*j) * | a[i*C+j] * r_R**(m*i) ]
/ ^ /
`---- | `---- 1) transform the columns
j = 0 | i = 0
^ |
| `-- 2) multiply
|
`-- 3) transform the rows
Note that the entire RHS is a function of n and m and that the results
for each pair (n, m) are stored in Fortran order.
Let the term in square brackets be f(m, j). Step 1) and 2) precalculate
the term for all (m, j). After that, the original matrix is now a lookup
table with the mth element in the jth column at location m * C + j.
Let the complete RHS be g(m, n). Step 3) does an in-place transform of
length n on all rows. After that, the original matrix is now a lookup
table with the mth element in the nth column at location m * C + n.
But each (m, n) pair should be written to location n * R + m. Therefore,
step 4) transposes the result of step 3).
Algorithm four-step (inverse transform):
----------------------------------------
A := input array
d := len(A) = R * C
p := prime
d' := d**(p-2) # inverse of d
w := primitive root of unity of the prime field
r := w**((p-1)/d) # root of the subfield
r' := w**((p-1) - (p-1)/d) # inverse of r
a := output array
0) View the matrix as a C*R matrix.
1) Transpose the matrix, producing an R*C matrix.
2) Apply a length C FNT to each row.
3) Multiply each matrix element (addressed by i*C+n) by r**(i*n).
4) Apply a length R FNT to each column.
Proof (inverse transform):
--------------------------
The algorithm can be derived starting from the regular definition of
the finite-field inverse transform of length d:
d-1
,----
\
a[k] = d' * | A[l] * r' ** (k * l)
/
`----
l = 0
The sum can be rearranged into the sum of the sums of columns. Note
that at this stage we still have a C*R matrix, so C denotes the number
of rows:
R-1 C-1
,---- ,----
\ \
= d' * | | a[j * R + i] * r' ** (k * (j * R + i))
/ /
`---- `----
i = 0 j = 0
Extracting a constant from the inner sum:
R-1 C-1
,---- ,----
\ \
= d' * | r' ** (k*i) * | a[j * R + i] * r' ** (k * j * R)
/ /
`---- `----
i = 0 j = 0
Without any loss of generality, let k = m * C + n,
where m < R and n < C:
R-1 C-1
,---- ,----
\ \
A[m*C+n] = d' * | r' ** (C*m*i) * r' ** (n*i) * | a[j*R+i] * r' ** (R*C*m*j) * r' ** (R*n*j)
/ /
`---- `----
i = 0 j = 0
Since r' = w**((p-1) - (p-1)/d) and d = R*C:
a) r' ** (R*C*m*j) = w**((p-1)*R*C*m*j - (p-1)*m*j) = 1
b) r' ** (C*m*i) = w**((p-1)*C - (p-1)/R) ** (m*i) = r_R' ** (m*i)
c) r' ** (R*n*j) = r_C' ** (n*j)
d) d' = d**(p-2) = (R*C) ** (p-2) = R**(p-2) * C**(p-2) = R' * C'
r_R' := inverse of the root of the subfield of length R.
r_C' := inverse of the root of the subfield of length C.
R' := inverse of R
C' := inverse of C
R-1 C-1
,---- ,---- 2) transform the rows of a^T
\ \
A[m*C+n] = R' * | r_R' ** (m*i) * [ r' ** (n*i) * C' * | a[j*R+i] * r_C' ** (n*j) ]
/ ^ / ^
`---- | `---- |
i = 0 | j = 0 |
^ | `-- 1) Transpose input matrix
| `-- 3) multiply to address elements by
| i * C + j
`-- 3) transform the columns
Note that the entire RHS is a function of m and n and that the results
for each pair (m, n) are stored in C order.
Let the term in square brackets be f(n, i). Without step 1), the sum
would perform a length C transform on the columns of the input matrix.
This is a) inefficient and b) the results are needed in C order, so
step 1) exchanges rows and columns.
Step 2) and 3) precalculate f(n, i) for all (n, i). After that, the
original matrix is now a lookup table with the ith element in the nth
column at location i * C + n.
Let the complete RHS be g(m, n). Step 4) does an in-place transform of
length m on all columns. After that, the original matrix is now a lookup
table with the mth element in the nth column at location m * C + n,
which means that all A[k] = A[m * C + n] are in the correct order.
--
[1] Joerg Arndt: "Matters Computational"
http://www.jjj.de/fxt/
[2] David H. Bailey: FFTs in External or Hierarchical Memory
http://crd.lbl.gov/~dhbailey/dhbpapers/

View file

@ -1,127 +0,0 @@
(* Copyright (c) 2011 Stefan Krah. All rights reserved. *)
==========================================================================
Calculate (a * b) % p using special primes
==========================================================================
A description of the algorithm can be found in the apfloat manual by
Tommila [1].
Definitions:
------------
In the whole document, "==" stands for "is congruent with".
Result of a * b in terms of high/low words:
(1) hi * 2**64 + lo = a * b
Special primes:
(2) p = 2**64 - z + 1, where z = 2**n
Single step modular reduction:
(3) R(hi, lo) = hi * z - hi + lo
Strategy:
---------
a) Set (hi, lo) to the result of a * b.
b) Set (hi', lo') to the result of R(hi, lo).
c) Repeat step b) until 0 <= hi' * 2**64 + lo' < 2*p.
d) If the result is less than p, return lo'. Otherwise return lo' - p.
The reduction step b) preserves congruence:
-------------------------------------------
hi * 2**64 + lo == hi * z - hi + lo (mod p)
Proof:
~~~~~~
hi * 2**64 + lo = (2**64 - z + 1) * hi + z * hi - hi + lo
= p * hi + z * hi - hi + lo
== z * hi - hi + lo (mod p)
Maximum numbers of step b):
---------------------------
# To avoid unnecessary formalism, define:
def R(hi, lo, z):
return divmod(hi * z - hi + lo, 2**64)
# For simplicity, assume hi=2**64-1, lo=2**64-1 after the
# initial multiplication a * b. This is of course impossible
# but certainly covers all cases.
# Then, for p1:
hi=2**64-1; lo=2**64-1; z=2**32
p1 = 2**64 - z + 1
hi, lo = R(hi, lo, z) # First reduction
hi, lo = R(hi, lo, z) # Second reduction
hi * 2**64 + lo < 2 * p1 # True
# For p2:
hi=2**64-1; lo=2**64-1; z=2**34
p2 = 2**64 - z + 1
hi, lo = R(hi, lo, z) # First reduction
hi, lo = R(hi, lo, z) # Second reduction
hi, lo = R(hi, lo, z) # Third reduction
hi * 2**64 + lo < 2 * p2 # True
# For p3:
hi=2**64-1; lo=2**64-1; z=2**40
p3 = 2**64 - z + 1
hi, lo = R(hi, lo, z) # First reduction
hi, lo = R(hi, lo, z) # Second reduction
hi, lo = R(hi, lo, z) # Third reduction
hi * 2**64 + lo < 2 * p3 # True
Step d) preserves congruence and yields a result < p:
-----------------------------------------------------
Case hi = 0:
Case lo < p: trivial.
Case lo >= p:
lo == lo - p (mod p) # result is congruent
p <= lo < 2*p -> 0 <= lo - p < p # result is in the correct range
Case hi = 1:
p < 2**64 /\ 2**64 + lo < 2*p -> lo < p # lo is always less than p
2**64 + lo == 2**64 + (lo - p) (mod p) # result is congruent
= lo - p # exactly the same value as the previous RHS
# in uint64_t arithmetic.
p < 2**64 + lo < 2*p -> 0 < 2**64 + (lo - p) < p # correct range
[1] http://www.apfloat.org/apfloat/2.40/apfloat.pdf

View file

@ -1,269 +0,0 @@
(* Copyright (c) 2011 Stefan Krah. All rights reserved. *)
========================================================================
Calculate (a * b) % p using the 80-bit x87 FPU
========================================================================
A description of the algorithm can be found in the apfloat manual by
Tommila [1].
The proof follows an argument made by Granlund/Montgomery in [2].
Definitions and assumptions:
----------------------------
The 80-bit extended precision format uses 64 bits for the significand:
(1) F = 64
The modulus is prime and less than 2**31:
(2) 2 <= p < 2**31
The factors are less than p:
(3) 0 <= a < p
(4) 0 <= b < p
The product a * b is less than 2**62 and is thus exact in 64 bits:
(5) n = a * b
The product can be represented in terms of quotient and remainder:
(6) n = q * p + r
Using (3), (4) and the fact that p is prime, the remainder is always
greater than zero:
(7) 0 <= q < p /\ 1 <= r < p
Strategy:
---------
Precalculate the 80-bit long double inverse of p, with a maximum
relative error of 2**(1-F):
(8) pinv = (long double)1.0 / p
Calculate an estimate for q = floor(n/p). The multiplication has another
maximum relative error of 2**(1-F):
(9) qest = n * pinv
If we can show that q < qest < q+1, then trunc(qest) = q. It is then
easy to recover the remainder r. The complete algorithm is:
a) Set the control word to 64-bit precision and truncation mode.
b) n = a * b # Calculate exact product.
c) qest = n * pinv # Calculate estimate for the quotient.
d) q = (qest+2**63)-2**63 # Truncate qest to the exact quotient.
f) r = n - q * p # Calculate remainder.
Proof for q < qest < q+1:
-------------------------
Using the cumulative error, the error bounds for qest are:
n n * (1 + 2**(1-F))**2
(9) --------------------- <= qest <= ---------------------
p * (1 + 2**(1-F))**2 p
Lemma 1:
--------
n q * p + r
(10) q < --------------------- = ---------------------
p * (1 + 2**(1-F))**2 p * (1 + 2**(1-F))**2
Proof:
~~~~~~
(I) q * p * (1 + 2**(1-F))**2 < q * p + r
(II) q * p * 2**(2-F) + q * p * 2**(2-2*F) < r
Using (1) and (7), it is sufficient to show that:
(III) q * p * 2**(-62) + q * p * 2**(-126) < 1 <= r
(III) can easily be verified by substituting the largest possible
values p = 2**31-1 and q = 2**31-2.
The critical cases occur when r = 1, n = m * p + 1. These cases
can be exhaustively verified with a test program.
Lemma 2:
--------
n * (1 + 2**(1-F))**2 (q * p + r) * (1 + 2**(1-F))**2
(11) --------------------- = ------------------------------- < q + 1
p p
Proof:
~~~~~~
(I) (q * p + r) + (q * p + r) * 2**(2-F) + (q * p + r) * 2**(2-2*F) < q * p + p
(II) (q * p + r) * 2**(2-F) + (q * p + r) * 2**(2-2*F) < p - r
Using (1) and (7), it is sufficient to show that:
(III) (q * p + r) * 2**(-62) + (q * p + r) * 2**(-126) < 1 <= p - r
(III) can easily be verified by substituting the largest possible
values p = 2**31-1, q = 2**31-2 and r = 2**31-2.
The critical cases occur when r = (p - 1), n = m * p - 1. These cases
can be exhaustively verified with a test program.
[1] http://www.apfloat.org/apfloat/2.40/apfloat.pdf
[2] http://gmplib.org/~tege/divcnst-pldi94.pdf
[Section 7: "Use of floating point"]
(* Coq proof for (10) and (11) *)
Require Import ZArith.
Require Import QArith.
Require Import Qpower.
Require Import Qabs.
Require Import Psatz.
Open Scope Q_scope.
Ltac qreduce T :=
rewrite <- (Qred_correct (T)); simpl (Qred (T)).
Theorem Qlt_move_right :
forall x y z:Q, x + z < y <-> x < y - z.
Proof.
intros.
split.
intros.
psatzl Q.
intros.
psatzl Q.
Qed.
Theorem Qlt_mult_by_z :
forall x y z:Q, 0 < z -> (x < y <-> x * z < y * z).
Proof.
intros.
split.
intros.
apply Qmult_lt_compat_r. trivial. trivial.
intros.
rewrite <- (Qdiv_mult_l x z). rewrite <- (Qdiv_mult_l y z).
apply Qmult_lt_compat_r.
apply Qlt_shift_inv_l.
trivial. psatzl Q. trivial. psatzl Q. psatzl Q.
Qed.
Theorem Qle_mult_quad :
forall (a b c d:Q),
0 <= a -> a <= c ->
0 <= b -> b <= d ->
a * b <= c * d.
intros.
psatz Q.
Qed.
Theorem q_lt_qest:
forall (p q r:Q),
(0 < p) -> (p <= (2#1)^31 - 1) ->
(0 <= q) -> (q <= p - 1) ->
(1 <= r) -> (r <= p - 1) ->
q < (q * p + r) / (p * (1 + (2#1)^(-63))^2).
Proof.
intros.
rewrite Qlt_mult_by_z with (z := (p * (1 + (2#1)^(-63))^2)).
unfold Qdiv.
rewrite <- Qmult_assoc.
rewrite (Qmult_comm (/ (p * (1 + (2 # 1) ^ (-63)) ^ 2)) (p * (1 + (2 # 1) ^ (-63)) ^ 2)).
rewrite Qmult_inv_r.
rewrite Qmult_1_r.
assert (q * (p * (1 + (2 # 1) ^ (-63)) ^ 2) == q * p + (q * p) * ((2 # 1) ^ (-62) + (2 # 1) ^ (-126))).
qreduce ((1 + (2 # 1) ^ (-63)) ^ 2).
qreduce ((2 # 1) ^ (-62) + (2 # 1) ^ (-126)).
ring_simplify.
reflexivity.
rewrite H5.
rewrite Qplus_comm.
rewrite Qlt_move_right.
ring_simplify (q * p + r - q * p).
qreduce ((2 # 1) ^ (-62) + (2 # 1) ^ (-126)).
apply Qlt_le_trans with (y := 1).
rewrite Qlt_mult_by_z with (z := 85070591730234615865843651857942052864 # 18446744073709551617).
ring_simplify.
apply Qle_lt_trans with (y := ((2 # 1) ^ 31 - (2#1)) * ((2 # 1) ^ 31 - 1)).
apply Qle_mult_quad.
assumption. psatzl Q. psatzl Q. psatzl Q. psatzl Q. psatzl Q. assumption. psatzl Q. psatzl Q.
Qed.
Theorem qest_lt_qplus1:
forall (p q r:Q),
(0 < p) -> (p <= (2#1)^31 - 1) ->
(0 <= q) -> (q <= p - 1) ->
(1 <= r) -> (r <= p - 1) ->
((q * p + r) * (1 + (2#1)^(-63))^2) / p < q + 1.
Proof.
intros.
rewrite Qlt_mult_by_z with (z := p).
unfold Qdiv.
rewrite <- Qmult_assoc.
rewrite (Qmult_comm (/ p) p).
rewrite Qmult_inv_r.
rewrite Qmult_1_r.
assert ((q * p + r) * (1 + (2 # 1) ^ (-63)) ^ 2 == q * p + r + (q * p + r) * ((2 # 1) ^ (-62) + (2 # 1) ^ (-126))).
qreduce ((1 + (2 # 1) ^ (-63)) ^ 2).
qreduce ((2 # 1) ^ (-62) + (2 # 1) ^ (-126)).
ring_simplify. reflexivity.
rewrite H5.
rewrite <- Qplus_assoc. rewrite <- Qplus_comm. rewrite Qlt_move_right.
ring_simplify ((q + 1) * p - q * p).
rewrite <- Qplus_comm. rewrite Qlt_move_right.
apply Qlt_le_trans with (y := 1).
qreduce ((2 # 1) ^ (-62) + (2 # 1) ^ (-126)).
rewrite Qlt_mult_by_z with (z := 85070591730234615865843651857942052864 # 18446744073709551617).
ring_simplify.
ring_simplify in H0.
apply Qle_lt_trans with (y := (2147483646 # 1) * (2147483647 # 1) + (2147483646 # 1)).
apply Qplus_le_compat.
apply Qle_mult_quad.
assumption. psatzl Q. auto with qarith. assumption. psatzl Q.
auto with qarith. auto with qarith.
psatzl Q. psatzl Q. assumption.
Qed.

View file

@ -1,63 +0,0 @@
(* Copyright (c) 2011 Stefan Krah. All rights reserved. *)
The Six Step Transform:
=======================
In libmpdec, the six-step transform is the Matrix Fourier Transform (See
matrix-transform.txt) in disguise. It is called six-step transform after
a variant that appears in [1]. The algorithm requires that the input
array can be viewed as an R*C matrix.
Algorithm six-step (forward transform):
---------------------------------------
1a) Transpose the matrix.
1b) Apply a length R FNT to each row.
1c) Transpose the matrix.
2) Multiply each matrix element (addressed by j*C+m) by r**(j*m).
3) Apply a length C FNT to each row.
4) Transpose the matrix.
Note that steps 1a) - 1c) are exactly equivalent to step 1) of the Matrix
Fourier Transform. For large R, it is faster to transpose twice and do
a transform on the rows than to perform a column transpose directly.
Algorithm six-step (inverse transform):
---------------------------------------
0) View the matrix as a C*R matrix.
1) Transpose the matrix, producing an R*C matrix.
2) Apply a length C FNT to each row.
3) Multiply each matrix element (addressed by i*C+n) by r**(i*n).
4a) Transpose the matrix.
4b) Apply a length R FNT to each row.
4c) Transpose the matrix.
Again, steps 4a) - 4c) are equivalent to step 4) of the Matrix Fourier
Transform.
--
[1] David H. Bailey: FFTs in External or Hierarchical Memory
http://crd.lbl.gov/~dhbailey/dhbpapers/

View file

@ -1,692 +0,0 @@
;
; Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; 1. Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
; OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
; HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
; LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
; OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
; SUCH DAMAGE.
;
(in-package "ACL2")
(include-book "arithmetic/top-with-meta" :dir :system)
(include-book "arithmetic-2/floor-mod/floor-mod" :dir :system)
;; =====================================================================
;; Proofs for several functions in umodarith.h
;; =====================================================================
;; =====================================================================
;; Helper theorems
;; =====================================================================
(defthm elim-mod-m<x<2*m
(implies (and (<= m x)
(< x (* 2 m))
(rationalp x) (rationalp m))
(equal (mod x m)
(+ x (- m)))))
(defthm modaux-1a
(implies (and (< x m) (< 0 x) (< 0 m)
(rationalp x) (rationalp m))
(equal (mod (- x) m)
(+ (- x) m))))
(defthm modaux-1b
(implies (and (< (- x) m) (< x 0) (< 0 m)
(rationalp x) (rationalp m))
(equal (mod x m)
(+ x m)))
:hints (("Goal" :use ((:instance modaux-1a
(x (- x)))))))
(defthm modaux-1c
(implies (and (< x m) (< 0 x) (< 0 m)
(rationalp x) (rationalp m))
(equal (mod x m)
x)))
(defthm modaux-2a
(implies (and (< 0 b) (< b m)
(natp x) (natp b) (natp m)
(< (mod (+ b x) m) b))
(equal (mod (+ (- m) b x) m)
(+ (- m) b (mod x m)))))
(defthm modaux-2b
(implies (and (< 0 b) (< b m)
(natp x) (natp b) (natp m)
(< (mod (+ b x) m) b))
(equal (mod (+ b x) m)
(+ (- m) b (mod x m))))
:hints (("Goal" :use (modaux-2a))))
(defthm linear-mod-1
(implies (and (< x m) (< b m)
(natp x) (natp b)
(rationalp m))
(equal (< x (mod (+ (- b) x) m))
(< x b)))
:hints (("Goal" :use ((:instance modaux-1a
(x (+ b (- x))))))))
(defthm linear-mod-2
(implies (and (< 0 b) (< b m)
(natp x) (natp b)
(natp m))
(equal (< (mod x m)
(mod (+ (- b) x) m))
(< (mod x m) b))))
(defthm linear-mod-3
(implies (and (< x m) (< b m)
(natp x) (natp b)
(rationalp m))
(equal (<= b (mod (+ b x) m))
(< (+ b x) m)))
:hints (("Goal" :use ((:instance elim-mod-m<x<2*m
(x (+ b x)))))))
(defthm modaux-2c
(implies (and (< 0 b) (< b m)
(natp x) (natp b) (natp m)
(<= b (mod (+ b x) m)))
(equal (mod (+ b x) m)
(+ b (mod x m))))
:hints (("Subgoal *1/8''" :use (linear-mod-3))))
(defthmd modaux-2d
(implies (and (< x m) (< 0 x) (< 0 m)
(< (- m) b) (< b 0) (rationalp m)
(<= x (mod (+ b x) m))
(rationalp x) (rationalp b))
(equal (+ (- m) (mod (+ b x) m))
(+ b x)))
:hints (("Goal" :cases ((<= 0 (+ b x))))
("Subgoal 2'" :use ((:instance modaux-1b
(x (+ b x)))))))
(defthm mod-m-b
(implies (and (< 0 x) (< 0 b) (< 0 m)
(< x b) (< b m)
(natp x) (natp b) (natp m))
(equal (mod (+ (mod (- x) m) b) m)
(mod (- x) b))))
;; =====================================================================
;; addmod, submod
;; =====================================================================
(defun addmod (a b m base)
(let* ((s (mod (+ a b) base))
(s (if (< s a) (mod (- s m) base) s))
(s (if (>= s m) (mod (- s m) base) s)))
s))
(defthmd addmod-correct
(implies (and (< 0 m) (< m base)
(< a m) (<= b m)
(natp m) (natp base)
(natp a) (natp b))
(equal (addmod a b m base)
(mod (+ a b) m)))
:hints (("Goal" :cases ((<= base (+ a b))))
("Subgoal 2.1'" :use ((:instance elim-mod-m<x<2*m
(x (+ a b)))))))
(defun submod (a b m base)
(let* ((d (mod (- a b) base))
(d (if (< a d) (mod (+ d m) base) d)))
d))
(defthmd submod-aux1
(implies (and (< a (mod (+ a (- b)) base))
(< 0 base) (< a base) (<= b base)
(natp base) (natp a) (natp b))
(< a b))
:rule-classes :forward-chaining)
(defthmd submod-aux2
(implies (and (<= (mod (+ a (- b)) base) a)
(< 0 base) (< a base) (< b base)
(natp base) (natp a) (natp b))
(<= b a))
:rule-classes :forward-chaining)
(defthmd submod-correct
(implies (and (< 0 m) (< m base)
(< a m) (<= b m)
(natp m) (natp base)
(natp a) (natp b))
(equal (submod a b m base)
(mod (- a b) m)))
:hints (("Goal" :cases ((<= base (+ a b))))
("Subgoal 2.2" :use ((:instance submod-aux1)))
("Subgoal 2.2'''" :cases ((and (< 0 (+ a (- b) m))
(< (+ a (- b) m) m))))
("Subgoal 2.1" :use ((:instance submod-aux2)))
("Subgoal 1.2" :use ((:instance submod-aux1)))
("Subgoal 1.1" :use ((:instance submod-aux2)))))
(defun submod-2 (a b m base)
(let* ((d (mod (- a b) base))
(d (if (< a b) (mod (+ d m) base) d)))
d))
(defthm submod-2-correct
(implies (and (< 0 m) (< m base)
(< a m) (<= b m)
(natp m) (natp base)
(natp a) (natp b))
(equal (submod-2 a b m base)
(mod (- a b) m)))
:hints (("Subgoal 2'" :cases ((and (< 0 (+ a (- b) m))
(< (+ a (- b) m) m))))))
;; =========================================================================
;; ext-submod is correct
;; =========================================================================
; a < 2*m, b < 2*m
(defun ext-submod (a b m base)
(let* ((a (if (>= a m) (- a m) a))
(b (if (>= b m) (- b m) b))
(d (mod (- a b) base))
(d (if (< a b) (mod (+ d m) base) d)))
d))
; a < 2*m, b < 2*m
(defun ext-submod-2 (a b m base)
(let* ((a (mod a m))
(b (mod b m))
(d (mod (- a b) base))
(d (if (< a b) (mod (+ d m) base) d)))
d))
(defthmd ext-submod-ext-submod-2-equal
(implies (and (< 0 m) (< m base)
(< a (* 2 m)) (< b (* 2 m))
(natp m) (natp base)
(natp a) (natp b))
(equal (ext-submod a b m base)
(ext-submod-2 a b m base))))
(defthmd ext-submod-2-correct
(implies (and (< 0 m) (< m base)
(< a (* 2 m)) (< b (* 2 m))
(natp m) (natp base)
(natp a) (natp b))
(equal (ext-submod-2 a b m base)
(mod (- a b) m))))
;; =========================================================================
;; dw-reduce is correct
;; =========================================================================
(defun dw-reduce (hi lo m base)
(let* ((r1 (mod hi m))
(r2 (mod (+ (* r1 base) lo) m)))
r2))
(defthmd dw-reduce-correct
(implies (and (< 0 m) (< m base)
(< hi base) (< lo base)
(natp m) (natp base)
(natp hi) (natp lo))
(equal (dw-reduce hi lo m base)
(mod (+ (* hi base) lo) m))))
(defthmd <=-multiply-both-sides-by-z
(implies (and (rationalp x) (rationalp y)
(< 0 z) (rationalp z))
(equal (<= x y)
(<= (* z x) (* z y)))))
(defthmd dw-reduce-aux1
(implies (and (< 0 m) (< m base)
(natp m) (natp base)
(< lo base) (natp lo)
(< x m) (natp x))
(< (+ lo (* base x)) (* base m)))
:hints (("Goal" :cases ((<= (+ x 1) m)))
("Subgoal 1''" :cases ((<= (* base (+ x 1)) (* base m))))
("subgoal 1.2" :use ((:instance <=-multiply-both-sides-by-z
(x (+ 1 x))
(y m)
(z base))))))
(defthm dw-reduce-aux2
(implies (and (< x (* base m))
(< 0 m) (< m base)
(natp m) (natp base) (natp x))
(< (floor x m) base)))
;; This is the necessary condition for using _mpd_div_words().
(defthmd dw-reduce-second-quotient-fits-in-single-word
(implies (and (< 0 m) (< m base)
(< hi base) (< lo base)
(natp m) (natp base)
(natp hi) (natp lo)
(equal r1 (mod hi m)))
(< (floor (+ (* r1 base) lo) m)
base))
:hints (("Goal" :cases ((< r1 m)))
("Subgoal 1''" :cases ((< (+ lo (* base (mod hi m))) (* base m))))
("Subgoal 1.2" :use ((:instance dw-reduce-aux1
(x (mod hi m)))))))
;; =========================================================================
;; dw-submod is correct
;; =========================================================================
(defun dw-submod (a hi lo m base)
(let* ((r (dw-reduce hi lo m base))
(d (mod (- a r) base))
(d (if (< a r) (mod (+ d m) base) d)))
d))
(defthmd dw-submod-aux1
(implies (and (natp a) (< 0 m) (natp m)
(natp x) (equal r (mod x m)))
(equal (mod (- a x) m)
(mod (- a r) m))))
(defthmd dw-submod-correct
(implies (and (< 0 m) (< m base)
(natp a) (< a m)
(< hi base) (< lo base)
(natp m) (natp base)
(natp hi) (natp lo))
(equal (dw-submod a hi lo m base)
(mod (- a (+ (* base hi) lo)) m)))
:hints (("Goal" :in-theory (disable dw-reduce)
:use ((:instance dw-submod-aux1
(x (+ lo (* base hi)))
(r (dw-reduce hi lo m base)))
(:instance dw-reduce-correct)))))
;; =========================================================================
;; ANSI C arithmetic for uint64_t
;; =========================================================================
(defun add (a b)
(mod (+ a b)
(expt 2 64)))
(defun sub (a b)
(mod (- a b)
(expt 2 64)))
(defun << (w n)
(mod (* w (expt 2 n))
(expt 2 64)))
(defun >> (w n)
(floor w (expt 2 n)))
;; join upper and lower half of a double word, yielding a 128 bit number
(defun join (hi lo)
(+ (* (expt 2 64) hi) lo))
;; =============================================================================
;; Fast modular reduction
;; =============================================================================
;; These are the three primes used in the Number Theoretic Transform.
;; A fast modular reduction scheme exists for all of them.
(defmacro p1 ()
(+ (expt 2 64) (- (expt 2 32)) 1))
(defmacro p2 ()
(+ (expt 2 64) (- (expt 2 34)) 1))
(defmacro p3 ()
(+ (expt 2 64) (- (expt 2 40)) 1))
;; reduce the double word number hi*2**64 + lo (mod p1)
(defun simple-mod-reduce-p1 (hi lo)
(+ (* (expt 2 32) hi) (- hi) lo))
;; reduce the double word number hi*2**64 + lo (mod p2)
(defun simple-mod-reduce-p2 (hi lo)
(+ (* (expt 2 34) hi) (- hi) lo))
;; reduce the double word number hi*2**64 + lo (mod p3)
(defun simple-mod-reduce-p3 (hi lo)
(+ (* (expt 2 40) hi) (- hi) lo))
; ----------------------------------------------------------
; The modular reductions given above are correct
; ----------------------------------------------------------
(defthmd congruence-p1-aux
(equal (* (expt 2 64) hi)
(+ (* (p1) hi)
(* (expt 2 32) hi)
(- hi))))
(defthmd congruence-p2-aux
(equal (* (expt 2 64) hi)
(+ (* (p2) hi)
(* (expt 2 34) hi)
(- hi))))
(defthmd congruence-p3-aux
(equal (* (expt 2 64) hi)
(+ (* (p3) hi)
(* (expt 2 40) hi)
(- hi))))
(defthmd mod-augment
(implies (and (rationalp x)
(rationalp y)
(rationalp m))
(equal (mod (+ x y) m)
(mod (+ x (mod y m)) m))))
(defthmd simple-mod-reduce-p1-congruent
(implies (and (integerp hi)
(integerp lo))
(equal (mod (simple-mod-reduce-p1 hi lo) (p1))
(mod (join hi lo) (p1))))
:hints (("Goal''" :use ((:instance congruence-p1-aux)
(:instance mod-augment
(m (p1))
(x (+ (- hi) lo (* (expt 2 32) hi)))
(y (* (p1) hi)))))))
(defthmd simple-mod-reduce-p2-congruent
(implies (and (integerp hi)
(integerp lo))
(equal (mod (simple-mod-reduce-p2 hi lo) (p2))
(mod (join hi lo) (p2))))
:hints (("Goal''" :use ((:instance congruence-p2-aux)
(:instance mod-augment
(m (p2))
(x (+ (- hi) lo (* (expt 2 34) hi)))
(y (* (p2) hi)))))))
(defthmd simple-mod-reduce-p3-congruent
(implies (and (integerp hi)
(integerp lo))
(equal (mod (simple-mod-reduce-p3 hi lo) (p3))
(mod (join hi lo) (p3))))
:hints (("Goal''" :use ((:instance congruence-p3-aux)
(:instance mod-augment
(m (p3))
(x (+ (- hi) lo (* (expt 2 40) hi)))
(y (* (p3) hi)))))))
; ---------------------------------------------------------------------
; We need a number less than 2*p, so that we can use the trick from
; elim-mod-m<x<2*m for the final reduction.
; For p1, two modular reductions are sufficient, for p2 and p3 three.
; ---------------------------------------------------------------------
;; p1: the first reduction is less than 2**96
(defthmd simple-mod-reduce-p1-<-2**96
(implies (and (< hi (expt 2 64))
(< lo (expt 2 64))
(natp hi) (natp lo))
(< (simple-mod-reduce-p1 hi lo)
(expt 2 96))))
;; p1: the second reduction is less than 2*p1
(defthmd simple-mod-reduce-p1-<-2*p1
(implies (and (< hi (expt 2 64))
(< lo (expt 2 64))
(< (join hi lo) (expt 2 96))
(natp hi) (natp lo))
(< (simple-mod-reduce-p1 hi lo)
(* 2 (p1)))))
;; p2: the first reduction is less than 2**98
(defthmd simple-mod-reduce-p2-<-2**98
(implies (and (< hi (expt 2 64))
(< lo (expt 2 64))
(natp hi) (natp lo))
(< (simple-mod-reduce-p2 hi lo)
(expt 2 98))))
;; p2: the second reduction is less than 2**69
(defthmd simple-mod-reduce-p2-<-2*69
(implies (and (< hi (expt 2 64))
(< lo (expt 2 64))
(< (join hi lo) (expt 2 98))
(natp hi) (natp lo))
(< (simple-mod-reduce-p2 hi lo)
(expt 2 69))))
;; p3: the third reduction is less than 2*p2
(defthmd simple-mod-reduce-p2-<-2*p2
(implies (and (< hi (expt 2 64))
(< lo (expt 2 64))
(< (join hi lo) (expt 2 69))
(natp hi) (natp lo))
(< (simple-mod-reduce-p2 hi lo)
(* 2 (p2)))))
;; p3: the first reduction is less than 2**104
(defthmd simple-mod-reduce-p3-<-2**104
(implies (and (< hi (expt 2 64))
(< lo (expt 2 64))
(natp hi) (natp lo))
(< (simple-mod-reduce-p3 hi lo)
(expt 2 104))))
;; p3: the second reduction is less than 2**81
(defthmd simple-mod-reduce-p3-<-2**81
(implies (and (< hi (expt 2 64))
(< lo (expt 2 64))
(< (join hi lo) (expt 2 104))
(natp hi) (natp lo))
(< (simple-mod-reduce-p3 hi lo)
(expt 2 81))))
;; p3: the third reduction is less than 2*p3
(defthmd simple-mod-reduce-p3-<-2*p3
(implies (and (< hi (expt 2 64))
(< lo (expt 2 64))
(< (join hi lo) (expt 2 81))
(natp hi) (natp lo))
(< (simple-mod-reduce-p3 hi lo)
(* 2 (p3)))))
; -------------------------------------------------------------------------
; The simple modular reductions, adapted for compiler friendly C
; -------------------------------------------------------------------------
(defun mod-reduce-p1 (hi lo)
(let* ((y hi)
(x y)
(hi (>> hi 32))
(x (sub lo x))
(hi (if (> x lo) (+ hi -1) hi))
(y (<< y 32))
(lo (add y x))
(hi (if (< lo y) (+ hi 1) hi)))
(+ (* hi (expt 2 64)) lo)))
(defun mod-reduce-p2 (hi lo)
(let* ((y hi)
(x y)
(hi (>> hi 30))
(x (sub lo x))
(hi (if (> x lo) (+ hi -1) hi))
(y (<< y 34))
(lo (add y x))
(hi (if (< lo y) (+ hi 1) hi)))
(+ (* hi (expt 2 64)) lo)))
(defun mod-reduce-p3 (hi lo)
(let* ((y hi)
(x y)
(hi (>> hi 24))
(x (sub lo x))
(hi (if (> x lo) (+ hi -1) hi))
(y (<< y 40))
(lo (add y x))
(hi (if (< lo y) (+ hi 1) hi)))
(+ (* hi (expt 2 64)) lo)))
; -------------------------------------------------------------------------
; The compiler friendly versions are equal to the simple versions
; -------------------------------------------------------------------------
(defthm mod-reduce-aux1
(implies (and (<= 0 a) (natp a) (natp m)
(< (- m) b) (<= b 0)
(integerp b)
(< (mod (+ b a) m)
(mod a m)))
(equal (mod (+ b a) m)
(+ b (mod a m))))
:hints (("Subgoal 2" :use ((:instance modaux-1b
(x (+ a b)))))))
(defthm mod-reduce-aux2
(implies (and (<= 0 a) (natp a) (natp m)
(< b m) (natp b)
(< (mod (+ b a) m)
(mod a m)))
(equal (+ m (mod (+ b a) m))
(+ b (mod a m)))))
(defthm mod-reduce-aux3
(implies (and (< 0 a) (natp a) (natp m)
(< (- m) b) (< b 0)
(integerp b)
(<= (mod a m)
(mod (+ b a) m)))
(equal (+ (- m) (mod (+ b a) m))
(+ b (mod a m))))
:hints (("Subgoal 1.2'" :use ((:instance modaux-1b
(x b))))
("Subgoal 1''" :use ((:instance modaux-2d
(x I))))))
(defthm mod-reduce-aux4
(implies (and (< 0 a) (natp a) (natp m)
(< b m) (natp b)
(<= (mod a m)
(mod (+ b a) m)))
(equal (mod (+ b a) m)
(+ b (mod a m)))))
(defthm mod-reduce-p1==simple-mod-reduce-p1
(implies (and (< hi (expt 2 64))
(< lo (expt 2 64))
(natp hi) (natp lo))
(equal (mod-reduce-p1 hi lo)
(simple-mod-reduce-p1 hi lo)))
:hints (("Goal" :in-theory (disable expt)
:cases ((< 0 hi)))
("Subgoal 1.2.2'" :use ((:instance mod-reduce-aux1
(m (expt 2 64))
(b (+ (- HI) LO))
(a (* (expt 2 32) hi)))))
("Subgoal 1.2.1'" :use ((:instance mod-reduce-aux3
(m (expt 2 64))
(b (+ (- HI) LO))
(a (* (expt 2 32) hi)))))
("Subgoal 1.1.2'" :use ((:instance mod-reduce-aux2
(m (expt 2 64))
(b (+ (- HI) LO))
(a (* (expt 2 32) hi)))))
("Subgoal 1.1.1'" :use ((:instance mod-reduce-aux4
(m (expt 2 64))
(b (+ (- HI) LO))
(a (* (expt 2 32) hi)))))))
(defthm mod-reduce-p2==simple-mod-reduce-p2
(implies (and (< hi (expt 2 64))
(< lo (expt 2 64))
(natp hi) (natp lo))
(equal (mod-reduce-p2 hi lo)
(simple-mod-reduce-p2 hi lo)))
:hints (("Goal" :cases ((< 0 hi)))
("Subgoal 1.2.2'" :use ((:instance mod-reduce-aux1
(m (expt 2 64))
(b (+ (- HI) LO))
(a (* (expt 2 34) hi)))))
("Subgoal 1.2.1'" :use ((:instance mod-reduce-aux3
(m (expt 2 64))
(b (+ (- HI) LO))
(a (* (expt 2 34) hi)))))
("Subgoal 1.1.2'" :use ((:instance mod-reduce-aux2
(m (expt 2 64))
(b (+ (- HI) LO))
(a (* (expt 2 34) hi)))))
("Subgoal 1.1.1'" :use ((:instance mod-reduce-aux4
(m (expt 2 64))
(b (+ (- HI) LO))
(a (* (expt 2 34) hi)))))))
(defthm mod-reduce-p3==simple-mod-reduce-p3
(implies (and (< hi (expt 2 64))
(< lo (expt 2 64))
(natp hi) (natp lo))
(equal (mod-reduce-p3 hi lo)
(simple-mod-reduce-p3 hi lo)))
:hints (("Goal" :cases ((< 0 hi)))
("Subgoal 1.2.2'" :use ((:instance mod-reduce-aux1
(m (expt 2 64))
(b (+ (- HI) LO))
(a (* (expt 2 40) hi)))))
("Subgoal 1.2.1'" :use ((:instance mod-reduce-aux3
(m (expt 2 64))
(b (+ (- HI) LO))
(a (* (expt 2 40) hi)))))
("Subgoal 1.1.2'" :use ((:instance mod-reduce-aux2
(m (expt 2 64))
(b (+ (- HI) LO))
(a (* (expt 2 40) hi)))))
("Subgoal 1.1.1'" :use ((:instance mod-reduce-aux4
(m (expt 2 64))
(b (+ (- HI) LO))
(a (* (expt 2 40) hi)))))))

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -27,8 +28,6 @@
#include "mpdecimal.h"
#include <stdio.h>
#include <stdlib.h>
#include "typearith.h"
#include "mpalloc.h"

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -27,17 +28,13 @@
#include "mpdecimal.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <math.h>
#include "basearith.h"
#include "bits.h"
#include "convolute.h"
#include "crt.h"
#include "mpalloc.h"
#include "typearith.h"
#include "libc/math.h"
#include "umodarith.h"
#ifdef PPRO

View file

@ -1,34 +1,11 @@
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef MPDECIMAL_H
#define MPDECIMAL_H
#include "libc/fmt/conv.h"
#include "libc/inttypes.h"
#include "libc/limits.h"
#include "libc/stdio/stdio.h"
#include "third_party/python/pyconfig.h"
/* clang-format off */
#ifdef __cplusplus
extern "C" {
@ -38,66 +15,31 @@ extern "C" {
#endif
#endif
#ifndef _MSC_VER
#include "pyconfig.h"
#ifndef __GNUC_STDC_INLINE__
#define __GNUC_STDC_INLINE__ 1
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
#include <stdint.h>
#include <inttypes.h>
#ifdef _MSC_VER
#include "vccompat.h"
#ifndef UNUSED
#define UNUSED
#endif
#define MPD_PRAGMA(x)
#define MPD_HIDE_SYMBOLS_START
#define MPD_HIDE_SYMBOLS_END
#define EXTINLINE extern inline
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
#define UNUSED __attribute__((__unused__))
#else
#ifndef __GNUC_STDC_INLINE__
#define __GNUC_STDC_INLINE__ 1
#endif
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
#define UNUSED __attribute__((unused))
#else
#define UNUSED
#endif
#if (defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__)) && \
defined(__GNUC__) && __GNUC__ >= 4 && !defined(__INTEL_COMPILER)
#define MPD_PRAGMA(x) _Pragma(x)
#define MPD_HIDE_SYMBOLS_START "GCC visibility push(hidden)"
#define MPD_HIDE_SYMBOLS_END "GCC visibility pop"
#else
#define MPD_PRAGMA(x)
#define MPD_HIDE_SYMBOLS_START
#define MPD_HIDE_SYMBOLS_END
#endif
#define EXTINLINE
#define UNUSED
#endif
#if (defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__)) && \
defined(__GNUC__) && __GNUC__ >= 4 && !defined(__INTEL_COMPILER)
#define MPD_PRAGMA(x) _Pragma(x)
#define MPD_HIDE_SYMBOLS_START "GCC visibility push(hidden)"
#define MPD_HIDE_SYMBOLS_END "GCC visibility pop"
#else
#define MPD_PRAGMA(x)
#define MPD_HIDE_SYMBOLS_START
#define MPD_HIDE_SYMBOLS_END
#endif
#define EXTINLINE
/* This header file is internal for the purpose of building _decimal.so.
* All symbols should have local scope in the DSO. */
MPD_PRAGMA(MPD_HIDE_SYMBOLS_START)
#if !defined(LEGACY_COMPILER)
#if !defined(UINT64_MAX)
/* The following #error is just a warning. If the compiler indeed does
* not have uint64_t, it is perfectly safe to comment out the #error. */
#error "Warning: Compiler without uint64_t. Comment out this line."
#define LEGACY_COMPILER
#endif
#endif
/******************************************************************************/
/* Version */
/******************************************************************************/

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -27,8 +28,6 @@
#include "mpdecimal.h"
#include <stdlib.h>
#include <assert.h>
#include "bits.h"
#include "umodarith.h"
#include "numbertheory.h"

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -27,9 +28,6 @@
#include "mpdecimal.h"
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "bits.h"
#include "difradix2.h"
#include "numbertheory.h"

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -31,7 +32,6 @@
#include "mpdecimal.h"
#include <stdio.h>
/* Internal header file: all symbols have local scope in the DSO */

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -27,11 +28,6 @@
#include "mpdecimal.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
#include "bits.h"
#include "constants.h"
#include "typearith.h"

View file

@ -1,38 +1,7 @@
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef TRANSPOSE_H
#define TRANSPOSE_H
#include "mpdecimal.h"
#include <stdio.h>
/* clang-format off */
/* Internal header file: all symbols have local scope in the DSO */
MPD_PRAGMA(MPD_HIDE_SYMBOLS_START)

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*
@ -30,6 +31,7 @@
#define TYPEARITH_H
#include "libc/assert.h"
#include "mpdecimal.h"

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*

View file

@ -1,3 +1,4 @@
/* clang-format off */
/*
* Copyright (c) 2008-2016 Stefan Krah. All rights reserved.
*

View file

@ -1,3 +1,4 @@
/* clang-format off */
// ISO C9x compliant stdint.h for Microsoft Visual Studio
// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
//