mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-05-22 21:32:31 +00:00
Make numerous improvements
- Python static hello world now 1.8mb - Python static fully loaded now 10mb - Python HTTPS client now uses MbedTLS - Python REPL now completes import stmts - Increase stack size for Python for now - Begin synthesizing posixpath and ntpath - Restore Python \N{UNICODE NAME} support - Restore Python NFKD symbol normalization - Add optimized code path for Intel SHA-NI - Get more Python unit tests passing faster - Get Python help() pagination working on NT - Python hashlib now supports MbedTLS PBKDF2 - Make memcpy/memmove/memcmp/bcmp/etc. faster - Add Mersenne Twister and Vigna to LIBC_RAND - Provide privileged __printf() for error code - Fix zipos opendir() so that it reports ENOTDIR - Add basic chmod() implementation for Windows NT - Add Cosmo's best functions to Python cosmo module - Pin function trace indent depth to that of caller - Show memory diagram on invalid access in MODE=dbg - Differentiate stack overflow on crash in MODE=dbg - Add stb_truetype and tools for analyzing font files - Upgrade to UNICODE 13 and reduce its binary footprint - COMPILE.COM now logs resource usage of build commands - Start implementing basic poll() support on bare metal - Set getauxval(AT_EXECFN) to GetModuleFileName() on NT - Add descriptions to strerror() in non-TINY build modes - Add COUNTBRANCH() macro to help with micro-optimizations - Make error / backtrace / asan / memory code more unbreakable - Add fast perfect C implementation of μ-Law and a-Law audio codecs - Make strtol() functions consistent with other libc implementations - Improve Linenoise implementation (see also github.com/jart/bestline) - COMPILE.COM now suppresses stdout/stderr of successful build commands
This commit is contained in:
parent
fa7b4f5bd1
commit
39bf41f4eb
806 changed files with 77494 additions and 63859 deletions
|
@ -1,20 +1,14 @@
|
|||
#ifndef UMODARITH_H
|
||||
#define UMODARITH_H
|
||||
#include "libc/log/libfatal.internal.h"
|
||||
#include "third_party/python/Modules/_decimal/libmpdec/constants.h"
|
||||
#include "third_party/python/Modules/_decimal/libmpdec/mpdecimal.h"
|
||||
#include "third_party/python/Modules/_decimal/libmpdec/typearith.h"
|
||||
/* clang-format off */
|
||||
|
||||
|
||||
/* Bignum: Low level routines for unsigned modular arithmetic. These are
|
||||
used in the fast convolution functions for very large coefficients. */
|
||||
|
||||
|
||||
/**************************************************************************/
|
||||
/* ANSI modular arithmetic */
|
||||
/**************************************************************************/
|
||||
|
||||
|
||||
/*
|
||||
* Restrictions: a < m and b < m
|
||||
* ACL2 proof: umodarith.lisp: addmod-correct
|
||||
|
@ -23,11 +17,9 @@ static inline mpd_uint_t
|
|||
addmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
|
||||
{
|
||||
mpd_uint_t s;
|
||||
|
||||
s = a + b;
|
||||
s = (s < a) ? s - m : s;
|
||||
s = (s >= m) ? s - m : s;
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
|
@ -39,10 +31,8 @@ static inline mpd_uint_t
|
|||
submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
|
||||
{
|
||||
mpd_uint_t d;
|
||||
|
||||
d = a - b;
|
||||
d = (a < b) ? d + m : d;
|
||||
|
||||
return d;
|
||||
}
|
||||
|
||||
|
@ -54,13 +44,10 @@ static inline mpd_uint_t
|
|||
ext_submod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
|
||||
{
|
||||
mpd_uint_t d;
|
||||
|
||||
a = (a >= m) ? a - m : a;
|
||||
b = (b >= m) ? b - m : b;
|
||||
|
||||
d = a - b;
|
||||
d = (a < b) ? d + m : d;
|
||||
|
||||
return d;
|
||||
}
|
||||
|
||||
|
@ -73,10 +60,8 @@ static inline mpd_uint_t
|
|||
dw_reduce(mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m)
|
||||
{
|
||||
mpd_uint_t r1, r2, w;
|
||||
|
||||
_mpd_div_word(&w, &r1, hi, m);
|
||||
_mpd_div_words(&w, &r2, r1, lo, m);
|
||||
|
||||
return r2;
|
||||
}
|
||||
|
||||
|
@ -89,142 +74,213 @@ static inline mpd_uint_t
|
|||
dw_submod(mpd_uint_t a, mpd_uint_t hi, mpd_uint_t lo, mpd_uint_t m)
|
||||
{
|
||||
mpd_uint_t d, r;
|
||||
|
||||
r = dw_reduce(hi, lo, m);
|
||||
d = a - r;
|
||||
d = (a < r) ? d + m : d;
|
||||
|
||||
return d;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_64
|
||||
|
||||
/**************************************************************************/
|
||||
/* 64-bit modular arithmetic */
|
||||
/**************************************************************************/
|
||||
|
||||
/*
|
||||
* A proof of the algorithm is in literature/mulmod-64.txt. An ACL2
|
||||
* proof is in umodarith.lisp: section "Fast modular reduction".
|
||||
/**
|
||||
* Calculates (a × b) % 𝑝 where 𝑝 is special
|
||||
*
|
||||
* Algorithm: calculate (a * b) % p:
|
||||
* In the whole comment, "⩭" stands for "is congruent with".
|
||||
*
|
||||
* a) hi, lo <- a * b # Calculate a * b.
|
||||
* Result of a × b in terms of high/low words:
|
||||
*
|
||||
* b) hi, lo <- R(hi, lo) # Reduce modulo p.
|
||||
* (1) hi × 2⁶⁴ + lo = a × b
|
||||
*
|
||||
* c) Repeat step b) until 0 <= hi * 2**64 + lo < 2*p.
|
||||
* Special primes:
|
||||
*
|
||||
* d) If the result is less than p, return lo. Otherwise return lo - p.
|
||||
* (2) 𝑝 = 2⁶⁴ - z + 1, where z = 2ⁿ
|
||||
*
|
||||
* i.e. 0xfffffffffffffc01
|
||||
* 0xfffffffffffff001
|
||||
* 0xffffffffff000001
|
||||
* 0xffffffff00000001
|
||||
* 0xfffffffc00000001
|
||||
* 0xffffff0000000001
|
||||
* 0xffffff0000000001
|
||||
*
|
||||
* Single step modular reduction:
|
||||
*
|
||||
* (3) R(hi, lo) = hi × z - hi + lo
|
||||
*
|
||||
*
|
||||
* Strategy
|
||||
* --------
|
||||
*
|
||||
* a) Set (hi, lo) to the result of a × b.
|
||||
*
|
||||
* b) Set (hi′, lo′) to the result of R(hi, lo).
|
||||
*
|
||||
* c) Repeat step b) until 0 ≤ hi′ × 2⁶⁴ + lo′ < 𝟸×𝑝.
|
||||
*
|
||||
* d) If the result is less than 𝑝, return lo′. Otherwise return lo′ - 𝑝.
|
||||
*
|
||||
*
|
||||
* The reduction step b) preserves congruence
|
||||
* ------------------------------------------
|
||||
*
|
||||
* hi × 2⁶⁴ + lo ⩭ hi × z - hi + lo (mod 𝑝)
|
||||
*
|
||||
* Proof:
|
||||
* ~~~~~~
|
||||
*
|
||||
* hi × 2⁶⁴ + lo = (2⁶⁴ - z + 1) × hi + z × hi - hi + lo
|
||||
*
|
||||
* = 𝑝 × hi + z × hi - hi + lo
|
||||
*
|
||||
* ⩭ z × hi - hi + lo (mod 𝑝)
|
||||
*
|
||||
*
|
||||
* Maximum numbers of step b)
|
||||
* --------------------------
|
||||
*
|
||||
* To avoid unnecessary formalism, define:
|
||||
*
|
||||
* def R(hi, lo, z):
|
||||
* return divmod(hi * z - hi + lo, 2**64)
|
||||
*
|
||||
* For simplicity, assume hi=2⁶⁴-1, lo=2⁶⁴-1 after the
|
||||
* initial multiplication a × b. This is of course impossible
|
||||
* but certainly covers all cases.
|
||||
*
|
||||
* Then, for p1:
|
||||
*
|
||||
* z = 2³²
|
||||
* hi = 2⁶⁴-1
|
||||
* lo = 2⁶⁴-1
|
||||
* p1 = 2⁶⁴ - z + 1
|
||||
* hi, lo = R(hi, lo, z) # First reduction
|
||||
* hi, lo = R(hi, lo, z) # Second reduction
|
||||
* hi × 2⁶⁴ + lo < 2 × p1 # True
|
||||
*
|
||||
* For p2:
|
||||
*
|
||||
* z = 2³⁴
|
||||
* hi = 2⁶⁴-1
|
||||
* lo = 2⁶⁴-1
|
||||
* p2 = 2⁶⁴ - z + 1
|
||||
* hi, lo = R(hi, lo, z) # First reduction
|
||||
* hi, lo = R(hi, lo, z) # Second reduction
|
||||
* hi, lo = R(hi, lo, z) # Third reduction
|
||||
* hi × 2⁶⁴ + lo < 2 × p2 # True
|
||||
*
|
||||
* For p3:
|
||||
*
|
||||
* z = 2⁴⁰
|
||||
* hi = 2⁶⁴-1
|
||||
* lo = 2⁶⁴-1
|
||||
* p3 = 2⁶⁴ - z + 1
|
||||
* hi, lo = R(hi, lo, z) # First reduction
|
||||
* hi, lo = R(hi, lo, z) # Second reduction
|
||||
* hi, lo = R(hi, lo, z) # Third reduction
|
||||
* hi × 2⁶⁴ + lo < 2 × p3 # True
|
||||
*
|
||||
* Step d) preserves congruence and yields a result < 𝑝
|
||||
* ----------------------------------------------------
|
||||
*
|
||||
* Case hi = 0:
|
||||
*
|
||||
* Case lo < 𝑝: trivial.
|
||||
*
|
||||
* Case lo ≥ 𝑝:
|
||||
*
|
||||
* lo ⩭ lo - 𝑝 (mod 𝑝) # result is congruent
|
||||
*
|
||||
* 𝑝 ≤ lo < 𝟸×𝑝 → 0 ≤ lo - 𝑝 < 𝑝 # result is in the correct range
|
||||
*
|
||||
* Case hi = 1:
|
||||
*
|
||||
* 𝑝 < 2⁶⁴ Λ 2⁶⁴ + lo < 𝟸×𝑝 → lo < 𝑝 # lo is always less than 𝑝
|
||||
*
|
||||
* 2⁶⁴ + lo ⩭ 2⁶⁴ + (lo - 𝑝) (mod 𝑝) # result is congruent
|
||||
*
|
||||
* = lo - 𝑝 # exactly the same value as the previous RHS
|
||||
* # in uint64_t arithmetic.
|
||||
*
|
||||
* 𝑝 < 2⁶⁴ + lo < 𝟸×𝑝 → 0 < 2⁶⁴ + (lo - 𝑝) < 𝑝 # correct range
|
||||
*
|
||||
*
|
||||
* [1] http://www.apfloat.org/apfloat/2.40/apfloat.pdf
|
||||
*/
|
||||
|
||||
static inline mpd_uint_t
|
||||
x64_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
|
||||
{
|
||||
mpd_uint_t hi, lo, x, y;
|
||||
|
||||
|
||||
_mpd_mul_words(&hi, &lo, a, b);
|
||||
|
||||
if (m & (1ULL<<32)) { /* P1 */
|
||||
|
||||
/* first reduction */
|
||||
x = y = hi;
|
||||
hi >>= 32;
|
||||
|
||||
x = lo - x;
|
||||
if (x > lo) hi--;
|
||||
|
||||
y <<= 32;
|
||||
lo = y + x;
|
||||
if (lo < y) hi++;
|
||||
|
||||
/* second reduction */
|
||||
x = y = hi;
|
||||
hi >>= 32;
|
||||
|
||||
x = lo - x;
|
||||
if (x > lo) hi--;
|
||||
|
||||
y <<= 32;
|
||||
lo = y + x;
|
||||
if (lo < y) hi++;
|
||||
|
||||
return (hi || lo >= m ? lo - m : lo);
|
||||
return hi || lo >= m ? lo - m : lo;
|
||||
}
|
||||
else if (m & (1ULL<<34)) { /* P2 */
|
||||
|
||||
/* first reduction */
|
||||
x = y = hi;
|
||||
hi >>= 30;
|
||||
|
||||
x = lo - x;
|
||||
if (x > lo) hi--;
|
||||
|
||||
y <<= 34;
|
||||
lo = y + x;
|
||||
if (lo < y) hi++;
|
||||
|
||||
/* second reduction */
|
||||
x = y = hi;
|
||||
hi >>= 30;
|
||||
|
||||
x = lo - x;
|
||||
if (x > lo) hi--;
|
||||
|
||||
y <<= 34;
|
||||
lo = y + x;
|
||||
if (lo < y) hi++;
|
||||
|
||||
/* third reduction */
|
||||
x = y = hi;
|
||||
hi >>= 30;
|
||||
|
||||
x = lo - x;
|
||||
if (x > lo) hi--;
|
||||
|
||||
y <<= 34;
|
||||
lo = y + x;
|
||||
if (lo < y) hi++;
|
||||
|
||||
return (hi || lo >= m ? lo - m : lo);
|
||||
return hi || lo >= m ? lo - m : lo;
|
||||
}
|
||||
else { /* P3 */
|
||||
|
||||
/* first reduction */
|
||||
x = y = hi;
|
||||
hi >>= 24;
|
||||
|
||||
x = lo - x;
|
||||
if (x > lo) hi--;
|
||||
|
||||
y <<= 40;
|
||||
lo = y + x;
|
||||
if (lo < y) hi++;
|
||||
|
||||
/* second reduction */
|
||||
x = y = hi;
|
||||
hi >>= 24;
|
||||
|
||||
x = lo - x;
|
||||
if (x > lo) hi--;
|
||||
|
||||
y <<= 40;
|
||||
lo = y + x;
|
||||
if (lo < y) hi++;
|
||||
|
||||
/* third reduction */
|
||||
x = y = hi;
|
||||
hi >>= 24;
|
||||
|
||||
x = lo - x;
|
||||
if (x > lo) hi--;
|
||||
|
||||
y <<= 40;
|
||||
lo = y + x;
|
||||
if (lo < y) hi++;
|
||||
|
||||
return (hi || lo >= m ? lo - m : lo);
|
||||
return hi || lo >= m ? lo - m : lo;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -247,375 +303,13 @@ static inline mpd_uint_t
|
|||
x64_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod)
|
||||
{
|
||||
mpd_uint_t r = 1;
|
||||
|
||||
while (exp > 0) {
|
||||
if (exp & 1)
|
||||
r = x64_mulmod(r, base, umod);
|
||||
base = x64_mulmod(base, base, umod);
|
||||
exp >>= 1;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
/* END CONFIG_64 */
|
||||
#else /* CONFIG_32 */
|
||||
|
||||
|
||||
/**************************************************************************/
|
||||
/* 32-bit modular arithmetic */
|
||||
/**************************************************************************/
|
||||
|
||||
#if defined(ANSI)
|
||||
#if !defined(LEGACY_COMPILER)
|
||||
/* HAVE_UINT64_T */
|
||||
static inline mpd_uint_t
|
||||
std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
|
||||
{
|
||||
return ((mpd_uuint_t) a * b) % m;
|
||||
}
|
||||
|
||||
static inline void
|
||||
std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
|
||||
{
|
||||
*a = ((mpd_uuint_t) *a * w) % m;
|
||||
*b = ((mpd_uuint_t) *b * w) % m;
|
||||
}
|
||||
|
||||
static inline void
|
||||
std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
|
||||
mpd_uint_t m)
|
||||
{
|
||||
*a0 = ((mpd_uuint_t) *a0 * b0) % m;
|
||||
*a1 = ((mpd_uuint_t) *a1 * b1) % m;
|
||||
}
|
||||
/* END HAVE_UINT64_T */
|
||||
#else
|
||||
/* LEGACY_COMPILER */
|
||||
static inline mpd_uint_t
|
||||
std_mulmod(mpd_uint_t a, mpd_uint_t b, mpd_uint_t m)
|
||||
{
|
||||
mpd_uint_t hi, lo, q, r;
|
||||
_mpd_mul_words(&hi, &lo, a, b);
|
||||
_mpd_div_words(&q, &r, hi, lo, m);
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline void
|
||||
std_mulmod2c(mpd_uint_t *a, mpd_uint_t *b, mpd_uint_t w, mpd_uint_t m)
|
||||
{
|
||||
*a = std_mulmod(*a, w, m);
|
||||
*b = std_mulmod(*b, w, m);
|
||||
}
|
||||
|
||||
static inline void
|
||||
std_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
|
||||
mpd_uint_t m)
|
||||
{
|
||||
*a0 = std_mulmod(*a0, b0, m);
|
||||
*a1 = std_mulmod(*a1, b1, m);
|
||||
}
|
||||
/* END LEGACY_COMPILER */
|
||||
#endif
|
||||
|
||||
static inline mpd_uint_t
|
||||
std_powmod(mpd_uint_t base, mpd_uint_t exp, mpd_uint_t umod)
|
||||
{
|
||||
mpd_uint_t r = 1;
|
||||
|
||||
while (exp > 0) {
|
||||
if (exp & 1)
|
||||
r = std_mulmod(r, base, umod);
|
||||
base = std_mulmod(base, base, umod);
|
||||
exp >>= 1;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
#endif /* ANSI CONFIG_32 */
|
||||
|
||||
|
||||
/**************************************************************************/
|
||||
/* Pentium Pro modular arithmetic */
|
||||
/**************************************************************************/
|
||||
|
||||
/*
|
||||
* A proof of the algorithm is in literature/mulmod-ppro.txt. The FPU
|
||||
* control word must be set to 64-bit precision and truncation mode
|
||||
* prior to using these functions.
|
||||
*
|
||||
* Algorithm: calculate (a * b) % p:
|
||||
*
|
||||
* p := prime < 2**31
|
||||
* pinv := (long double)1.0 / p (precalculated)
|
||||
*
|
||||
* a) n = a * b # Calculate exact product.
|
||||
* b) qest = n * pinv # Calculate estimate for q = n / p.
|
||||
* c) q = (qest+2**63)-2**63 # Truncate qest to the exact quotient.
|
||||
* d) r = n - q * p # Calculate remainder.
|
||||
*
|
||||
* Remarks:
|
||||
*
|
||||
* - p = dmod and pinv = dinvmod.
|
||||
* - dinvmod points to an array of three uint32_t, which is interpreted
|
||||
* as an 80 bit long double by fldt.
|
||||
* - Intel compilers prior to version 11 do not seem to handle the
|
||||
* __GNUC__ inline assembly correctly.
|
||||
* - random tests are provided in tests/extended/ppro_mulmod.c
|
||||
*/
|
||||
|
||||
#if defined(PPRO)
|
||||
#if defined(ASM)
|
||||
|
||||
/* Return (a * b) % dmod */
|
||||
static inline mpd_uint_t
|
||||
ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod)
|
||||
{
|
||||
mpd_uint_t retval;
|
||||
|
||||
__asm__ (
|
||||
"fildl %2\n\t"
|
||||
"fildl %1\n\t"
|
||||
"fmulp %%st, %%st(1)\n\t"
|
||||
"fldt (%4)\n\t"
|
||||
"fmul %%st(1), %%st\n\t"
|
||||
"flds %5\n\t"
|
||||
"fadd %%st, %%st(1)\n\t"
|
||||
"fsubrp %%st, %%st(1)\n\t"
|
||||
"fldl (%3)\n\t"
|
||||
"fmulp %%st, %%st(1)\n\t"
|
||||
"fsubrp %%st, %%st(1)\n\t"
|
||||
"fistpl %0\n\t"
|
||||
: "=m" (retval)
|
||||
: "m" (a), "m" (b), "r" (dmod), "r" (dinvmod), "m" (MPD_TWO63)
|
||||
: "st", "memory"
|
||||
);
|
||||
|
||||
return retval;
|
||||
}
|
||||
|
||||
/*
|
||||
* Two modular multiplications in parallel:
|
||||
* *a0 = (*a0 * w) % dmod
|
||||
* *a1 = (*a1 * w) % dmod
|
||||
*/
|
||||
static inline void
|
||||
ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w,
|
||||
double *dmod, uint32_t *dinvmod)
|
||||
{
|
||||
__asm__ (
|
||||
"fildl %2\n\t"
|
||||
"fildl (%1)\n\t"
|
||||
"fmul %%st(1), %%st\n\t"
|
||||
"fxch %%st(1)\n\t"
|
||||
"fildl (%0)\n\t"
|
||||
"fmulp %%st, %%st(1) \n\t"
|
||||
"fldt (%4)\n\t"
|
||||
"flds %5\n\t"
|
||||
"fld %%st(2)\n\t"
|
||||
"fmul %%st(2)\n\t"
|
||||
"fadd %%st(1)\n\t"
|
||||
"fsub %%st(1)\n\t"
|
||||
"fmull (%3)\n\t"
|
||||
"fsubrp %%st, %%st(3)\n\t"
|
||||
"fxch %%st(2)\n\t"
|
||||
"fistpl (%0)\n\t"
|
||||
"fmul %%st(2)\n\t"
|
||||
"fadd %%st(1)\n\t"
|
||||
"fsubp %%st, %%st(1)\n\t"
|
||||
"fmull (%3)\n\t"
|
||||
"fsubrp %%st, %%st(1)\n\t"
|
||||
"fistpl (%1)\n\t"
|
||||
: : "r" (a0), "r" (a1), "m" (w),
|
||||
"r" (dmod), "r" (dinvmod),
|
||||
"m" (MPD_TWO63)
|
||||
: "st", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
/*
|
||||
* Two modular multiplications in parallel:
|
||||
* *a0 = (*a0 * b0) % dmod
|
||||
* *a1 = (*a1 * b1) % dmod
|
||||
*/
|
||||
static inline void
|
||||
ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
|
||||
double *dmod, uint32_t *dinvmod)
|
||||
{
|
||||
__asm__ (
|
||||
"fildl %3\n\t"
|
||||
"fildl (%2)\n\t"
|
||||
"fmulp %%st, %%st(1)\n\t"
|
||||
"fildl %1\n\t"
|
||||
"fildl (%0)\n\t"
|
||||
"fmulp %%st, %%st(1)\n\t"
|
||||
"fldt (%5)\n\t"
|
||||
"fld %%st(2)\n\t"
|
||||
"fmul %%st(1), %%st\n\t"
|
||||
"fxch %%st(1)\n\t"
|
||||
"fmul %%st(2), %%st\n\t"
|
||||
"flds %6\n\t"
|
||||
"fldl (%4)\n\t"
|
||||
"fxch %%st(3)\n\t"
|
||||
"fadd %%st(1), %%st\n\t"
|
||||
"fxch %%st(2)\n\t"
|
||||
"fadd %%st(1), %%st\n\t"
|
||||
"fxch %%st(2)\n\t"
|
||||
"fsub %%st(1), %%st\n\t"
|
||||
"fxch %%st(2)\n\t"
|
||||
"fsubp %%st, %%st(1)\n\t"
|
||||
"fxch %%st(1)\n\t"
|
||||
"fmul %%st(2), %%st\n\t"
|
||||
"fxch %%st(1)\n\t"
|
||||
"fmulp %%st, %%st(2)\n\t"
|
||||
"fsubrp %%st, %%st(3)\n\t"
|
||||
"fsubrp %%st, %%st(1)\n\t"
|
||||
"fxch %%st(1)\n\t"
|
||||
"fistpl (%2)\n\t"
|
||||
"fistpl (%0)\n\t"
|
||||
: : "r" (a0), "m" (b0), "r" (a1), "m" (b1),
|
||||
"r" (dmod), "r" (dinvmod),
|
||||
"m" (MPD_TWO63)
|
||||
: "st", "memory"
|
||||
);
|
||||
}
|
||||
/* END PPRO GCC ASM */
|
||||
#elif defined(MASM)
|
||||
|
||||
/* Return (a * b) % dmod */
|
||||
static inline mpd_uint_t __cdecl
|
||||
ppro_mulmod(mpd_uint_t a, mpd_uint_t b, double *dmod, uint32_t *dinvmod)
|
||||
{
|
||||
mpd_uint_t retval;
|
||||
|
||||
__asm {
|
||||
mov eax, dinvmod
|
||||
mov edx, dmod
|
||||
fild b
|
||||
fild a
|
||||
fmulp st(1), st
|
||||
fld TBYTE PTR [eax]
|
||||
fmul st, st(1)
|
||||
fld MPD_TWO63
|
||||
fadd st(1), st
|
||||
fsubp st(1), st
|
||||
fld QWORD PTR [edx]
|
||||
fmulp st(1), st
|
||||
fsubp st(1), st
|
||||
fistp retval
|
||||
}
|
||||
|
||||
return retval;
|
||||
}
|
||||
|
||||
/*
|
||||
* Two modular multiplications in parallel:
|
||||
* *a0 = (*a0 * w) % dmod
|
||||
* *a1 = (*a1 * w) % dmod
|
||||
*/
|
||||
static inline mpd_uint_t __cdecl
|
||||
ppro_mulmod2c(mpd_uint_t *a0, mpd_uint_t *a1, mpd_uint_t w,
|
||||
double *dmod, uint32_t *dinvmod)
|
||||
{
|
||||
__asm {
|
||||
mov ecx, dmod
|
||||
mov edx, a1
|
||||
mov ebx, dinvmod
|
||||
mov eax, a0
|
||||
fild w
|
||||
fild DWORD PTR [edx]
|
||||
fmul st, st(1)
|
||||
fxch st(1)
|
||||
fild DWORD PTR [eax]
|
||||
fmulp st(1), st
|
||||
fld TBYTE PTR [ebx]
|
||||
fld MPD_TWO63
|
||||
fld st(2)
|
||||
fmul st, st(2)
|
||||
fadd st, st(1)
|
||||
fsub st, st(1)
|
||||
fmul QWORD PTR [ecx]
|
||||
fsubp st(3), st
|
||||
fxch st(2)
|
||||
fistp DWORD PTR [eax]
|
||||
fmul st, st(2)
|
||||
fadd st, st(1)
|
||||
fsubrp st(1), st
|
||||
fmul QWORD PTR [ecx]
|
||||
fsubp st(1), st
|
||||
fistp DWORD PTR [edx]
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Two modular multiplications in parallel:
|
||||
* *a0 = (*a0 * b0) % dmod
|
||||
* *a1 = (*a1 * b1) % dmod
|
||||
*/
|
||||
static inline void __cdecl
|
||||
ppro_mulmod2(mpd_uint_t *a0, mpd_uint_t b0, mpd_uint_t *a1, mpd_uint_t b1,
|
||||
double *dmod, uint32_t *dinvmod)
|
||||
{
|
||||
__asm {
|
||||
mov ecx, dmod
|
||||
mov edx, a1
|
||||
mov ebx, dinvmod
|
||||
mov eax, a0
|
||||
fild b1
|
||||
fild DWORD PTR [edx]
|
||||
fmulp st(1), st
|
||||
fild b0
|
||||
fild DWORD PTR [eax]
|
||||
fmulp st(1), st
|
||||
fld TBYTE PTR [ebx]
|
||||
fld st(2)
|
||||
fmul st, st(1)
|
||||
fxch st(1)
|
||||
fmul st, st(2)
|
||||
fld DWORD PTR MPD_TWO63
|
||||
fld QWORD PTR [ecx]
|
||||
fxch st(3)
|
||||
fadd st, st(1)
|
||||
fxch st(2)
|
||||
fadd st, st(1)
|
||||
fxch st(2)
|
||||
fsub st, st(1)
|
||||
fxch st(2)
|
||||
fsubrp st(1), st
|
||||
fxch st(1)
|
||||
fmul st, st(2)
|
||||
fxch st(1)
|
||||
fmulp st(2), st
|
||||
fsubp st(3), st
|
||||
fsubp st(1), st
|
||||
fxch st(1)
|
||||
fistp DWORD PTR [edx]
|
||||
fistp DWORD PTR [eax]
|
||||
}
|
||||
}
|
||||
#endif /* PPRO MASM (_MSC_VER) */
|
||||
|
||||
|
||||
/* Return (base ** exp) % dmod */
|
||||
static inline mpd_uint_t
|
||||
ppro_powmod(mpd_uint_t base, mpd_uint_t exp, double *dmod, uint32_t *dinvmod)
|
||||
{
|
||||
mpd_uint_t r = 1;
|
||||
|
||||
while (exp > 0) {
|
||||
if (exp & 1)
|
||||
r = ppro_mulmod(r, base, dmod, dinvmod);
|
||||
base = ppro_mulmod(base, base, dmod, dinvmod);
|
||||
exp >>= 1;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
#endif /* PPRO */
|
||||
#endif /* CONFIG_32 */
|
||||
|
||||
|
||||
#endif /* UMODARITH_H */
|
||||
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue