From ea83cc0ad03763607a2a272afccd9d39d3cde40d Mon Sep 17 00:00:00 2001
From: Justine Tunney <jtunney@gmail.com>
Date: Mon, 26 Jul 2021 15:16:43 -0700
Subject: [PATCH] Make stronger crypto nearly as fast
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

One of the disadvantages of x25519 and ℘256 is they only provide 126 bits
of security, so that seems like a weak link in the chain, if we're using
ECDHE-ECDSA-AES256-GCM-SHA384. The U.S. government wants classified data
to be encrypted using a curve at least as strong as ℘384, which provides
192 bits of security, but if you read the consensus of Stack Exchange it
would give you the impression that ℘384 is three times slower.

This change (as well as the previous one) makes ℘384 three times as fast
by tuning its modulus and multiplication subroutines with new tests that
should convincingly show: the optimized code behaves the same way as the
old code. Some of the diff noise from the previous change is now removed
too, so that our vendored fork can be more easily compared with upstream
sources. So you can now have stronger cryptography without compromises.

℘384 modulus Justine                        l:         28𝑐          9𝑛𝑠
℘384 modulus MbedTLS NIST                   l:        127𝑐         41𝑛𝑠
℘384 modulus MbedTLS MPI                    l:      1,850𝑐        597𝑛𝑠

The benchmarks above show the improvements made by secp384r1() which is
an important function since it needs to be called 13,000 times whenever
someone establishes a connection to your web server. The same's true of
Mul6x6Adx() which is able to multiply 384-bit numbers in 73 cycles, but
only if your CPU was purchased after 2014 when Broadwell was introduced.
---
 libc/nexgen32e/adc.S                         |   39 -
 libc/nexgen32e/mul4x4adx.S                   |  140 +-
 libc/nexgen32e/mul6x6adx.S                   |  208 +-
 libc/nexgen32e/mul8x8.S                      |  483 ----
 libc/nexgen32e/mul8x8adx.S                   |  495 ++++
 libc/nexgen32e/sub.S                         |   41 -
 test/net/https/mbedtls_test.c                |   58 +-
 third_party/mbedtls/bignum.c                 | 2369 +++++++++---------
 third_party/mbedtls/config.h                 |    4 +-
 third_party/mbedtls/ecdh.h                   |    2 +-
 third_party/mbedtls/ecdh_everest.c           |  279 +++
 third_party/mbedtls/ecdh_everest.h           |   43 +
 third_party/mbedtls/ecdsa.c                  |   25 +-
 third_party/mbedtls/ecp.c                    |   12 +-
 third_party/mbedtls/ecp256.c                 |  145 +-
 third_party/mbedtls/ecp384.c                 |  180 +-
 third_party/mbedtls/ecp_curves.c             |  142 +-
 third_party/mbedtls/everest.c                | 1361 ++--------
 third_party/mbedtls/everest.h                |   54 +-
 third_party/mbedtls/mbedtls.mk               |    7 +-
 third_party/mbedtls/secp256r1.c              |    2 +-
 third_party/mbedtls/secp384r1.c              |  269 +-
 third_party/mbedtls/ssl_ciphersuites.c       |    1 -
 third_party/mbedtls/test/everest_test.c      |   77 +
 third_party/mbedtls/test/everest_unravaged.c |  899 +++++++
 third_party/mbedtls/test/secp384r1_test.c    |  294 +++
 third_party/mbedtls/test/test.mk             |   23 +-
 27 files changed, 4291 insertions(+), 3361 deletions(-)
 delete mode 100644 libc/nexgen32e/adc.S
 delete mode 100644 libc/nexgen32e/mul8x8.S
 create mode 100644 libc/nexgen32e/mul8x8adx.S
 delete mode 100644 libc/nexgen32e/sub.S
 create mode 100644 third_party/mbedtls/ecdh_everest.c
 create mode 100644 third_party/mbedtls/ecdh_everest.h
 create mode 100644 third_party/mbedtls/test/everest_test.c
 create mode 100644 third_party/mbedtls/test/everest_unravaged.c
 create mode 100644 third_party/mbedtls/test/secp384r1_test.c

diff --git a/libc/nexgen32e/adc.S b/libc/nexgen32e/adc.S
deleted file mode 100644
index d58f7089b..000000000
--- a/libc/nexgen32e/adc.S
+++ /dev/null
@@ -1,39 +0,0 @@
-/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
-│vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.internal.h"
-
-//	Computes C = A + B
-//
-//	@param	rdi is C
-//	@param	rsi is A
-//	@param	rdx is B
-//	@param	rcx is number of additions
-//	@return	al has carry
-adc:	.leafprologue
-	test	%ecx,%ecx
-	jz	1f
-	xor	%r9d,%r9d
-0:	mov	(%rsi,%r9,8),%rax
-	adc	(%rdx,%r9,8),%rax
-	mov	%rax,(%rdi,%r9,8)
-	inc	%r9d
-	loop	0b
-1:	setb	%al
-	.leafepilogue
-	.endfn	adc,globl
diff --git a/libc/nexgen32e/mul4x4adx.S b/libc/nexgen32e/mul4x4adx.S
index 268d91668..86a02797f 100644
--- a/libc/nexgen32e/mul4x4adx.S
+++ b/libc/nexgen32e/mul4x4adx.S
@@ -18,34 +18,47 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/macros.internal.h"
 
+//	Computes 512-bit product of 256-bit and 256-bit numbers.
+//
+//		Instructions:        88
+//		Total Cycles:        36
+//		Total uOps:         120
+//		uOps Per Cycle:    3.33
+//		IPC:               2.44
+//		Block RThroughput: 20.0
+//
+//	@param	rdi receives 8 quadword result
+//	@param	rsi is left hand side which must have 4 quadwords
+//	@param	rdx is right hand side which must have 4 quadwords
+//	@note	words are host endian while array is little endian
+//	@mayalias
 Mul4x4Adx:
 	push	%rbp
 	mov	%rsp,%rbp
 	.profilable
-	push	%r15
-	push	%r14
-	push	%r13
-	push	%r12
+	sub	$56,%rsp
+	mov	%r15,-8(%rbp)
+	mov	%r14,-16(%rbp)
+	mov	%r13,-24(%rbp)
+	mov	%r12,-32(%rbp)
+	mov	%rbx,-40(%rbp)
 	mov	%rdx,%r12
-	push	%rbx
-	sub	$16,%rsp
 	mov	(%rdx),%rdx
 	mov	(%rsi),%rax
 	mov	16(%rsi),%r11
 	mov	24(%rsi),%r10
-	xor	%r13d,%r13d
 	mulx	%rax,%rbx,%rax
 	mov	%rbx,-48(%rbp)
 	mov	8(%rsi),%rbx
 	mulx	%rbx,%rdx,%rcx
-	adox	%rdx,%rax
+	add	%rdx,%rax
 	mov	(%r12),%rdx
 	mulx	%r11,%rdx,%r9
-	adox	%rdx,%rcx
+	adc	%rdx,%rcx
 	mov	(%r12),%rdx
 	mulx	%r10,%rdx,%r8
-	adox	%rdx,%r9
-	adox	%r13,%r8
+	adc	%rdx,%r9
+	adc	$0,%r8
 	xor	%r13d,%r13d
 	mov	(%rsi),%r14
 	mov	8(%r12),%rdx
@@ -105,12 +118,103 @@ Mul4x4Adx:
 	adox	%r14,%r10
 	mov	%rsi,(%rdi)
 	mov	%r10,56(%rdi)
-	add	$16,%rsp
-	pop	%rbx
-	pop	%r12
-	pop	%r13
-	pop	%r14
-	pop	%r15
-	pop	%rbp
+	mov	-8(%rbp),%r15
+	mov	-16(%rbp),%r14
+	mov	-24(%rbp),%r13
+	mov	-32(%rbp),%r12
+	mov	-40(%rbp),%rbx
+	leave
 	ret
 	.endfn	Mul4x4Adx,globl
+
+	.end
+TIMELINE VIEW       0123456789          012345
+Index     0123456789          0123456789
+[0,0]     DeER .    .    .    .    .    .    .   subq	$56, %rsp
+[0,1]     DeER .    .    .    .    .    .    .   movq	%r15, -8(%rbp)
+[0,2]     D=eER.    .    .    .    .    .    .   movq	%r14, -16(%rbp)
+[0,3]     D==eER    .    .    .    .    .    .   movq	%r13, -24(%rbp)
+[0,4]     D===eER   .    .    .    .    .    .   movq	%r12, -32(%rbp)
+[0,5]     D====eER  .    .    .    .    .    .   movq	%rbx, -40(%rbp)
+[0,6]     .DeE---R  .    .    .    .    .    .   movq	%rdx, %r12
+[0,7]     .DeeeeeER .    .    .    .    .    .   movq	(%rdx), %rdx
+[0,8]     .D=eeeeeER.    .    .    .    .    .   movq	(%rsi), %rax
+[0,9]     .D=eeeeeER.    .    .    .    .    .   movq	16(%rsi), %r11
+[0,10]    .D==eeeeeER    .    .    .    .    .   movq	24(%rsi), %r10
+[0,11]    . D=====eeeeER .    .    .    .    .   mulxq	%rax, %rbx, %rax
+[0,12]    . D========eER .    .    .    .    .   movq	%rbx, -48(%rbp)
+[0,13]    . D=eeeeeE---R .    .    .    .    .   movq	8(%rsi), %rbx
+[0,14]    .  D=====eeeeER.    .    .    .    .   mulxq	%rbx, %rdx, %rcx
+[0,15]    .  D========eER.    .    .    .    .   addq	%rdx, %rax
+[0,16]    .  D=eeeeeE---R.    .    .    .    .   movq	(%r12), %rdx
+[0,17]    .   D=====eeeeER    .    .    .    .   mulxq	%r11, %rdx, %r9
+[0,18]    .   D========eER    .    .    .    .   adcq	%rdx, %rcx
+[0,19]    .   DeeeeeE----R    .    .    .    .   movq	(%r12), %rdx
+[0,20]    .    D=====eeeeER   .    .    .    .   mulxq	%r10, %rdx, %r8
+[0,21]    .    D========eER   .    .    .    .   adcq	%rdx, %r9
+[0,22]    .    D=========eER  .    .    .    .   adcq	$0, %r8
+[0,23]    .    D-----------R  .    .    .    .   xorl	%r13d, %r13d
+[0,24]    .    .DeeeeeE----R  .    .    .    .   movq	(%rsi), %r14
+[0,25]    .    .DeeeeeE----R  .    .    .    .   movq	8(%r12), %rdx
+[0,26]    .    .D=====eeeeER  .    .    .    .   mulxq	%r14, %r14, %r15
+[0,27]    .    .D========eER  .    .    .    .   adoxq	%r14, %rax
+[0,28]    .    . D========eER .    .    .    .   adcxq	%r15, %rcx
+[0,29]    .    . D========eER .    .    .    .   movq	%rax, -56(%rbp)
+[0,30]    .    . D=====eeeeER .    .    .    .   mulxq	%rbx, %r14, %rax
+[0,31]    .    . D=========eER.    .    .    .   adoxq	%r14, %rcx
+[0,32]    .    .  D=========eER    .    .    .   adcxq	%rax, %r9
+[0,33]    .    .  D=====eeeeE-R    .    .    .   mulxq	%r11, %r14, %rax
+[0,34]    .    .  D==========eER   .    .    .   adoxq	%r14, %r9
+[0,35]    .    .  D===========eER  .    .    .   adcxq	%rax, %r8
+[0,36]    .    .   D=====eeeeE--R  .    .    .   mulxq	%r10, %rdx, %rax
+[0,37]    .    .   D===========eER .    .    .   adoxq	%rdx, %r8
+[0,38]    .    .   DeeeeeE-------R .    .    .   movq	16(%r12), %rdx
+[0,39]    .    .   D============eER.    .    .   adcxq	%r13, %rax
+[0,40]    .    .    D============eER    .    .   adoxq	%r13, %rax
+[0,41]    .    .    DeeeeeE--------R    .    .   movq	(%rsi), %r13
+[0,42]    .    .    D=====E--------R    .    .   xorl	%r15d, %r15d
+[0,43]    .    .    D=====eeeeE----R    .    .   mulxq	%r13, %r13, %r14
+[0,44]    .    .    .D=======eE----R    .    .   adoxq	%r13, %rcx
+[0,45]    .    .    .D========eE---R    .    .   adcxq	%r14, %r9
+[0,46]    .    .    .D=====eeeeE---R    .    .   mulxq	%rbx, %r14, %r13
+[0,47]    .    .    .D=========eE--R    .    .   adoxq	%r14, %r9
+[0,48]    .    .    . D=========eE-R    .    .   adcxq	%r13, %r8
+[0,49]    .    .    . D=====eeeeE--R    .    .   mulxq	%r11, %r14, %r13
+[0,50]    .    .    . D==========eER    .    .   adoxq	%r14, %r8
+[0,51]    .    .    . D===========eER   .    .   adcxq	%r13, %rax
+[0,52]    .    .    .  DeeeeeE------R   .    .   movq	(%rsi), %rsi
+[0,53]    .    .    .  D=====eeeeE--R   .    .   mulxq	%r10, %rdx, %r13
+[0,54]    .    .    .  D===========eER  .    .   adoxq	%rdx, %rax
+[0,55]    .    .    .  D============eER .    .   adcxq	%r15, %r13
+[0,56]    .    .    .   DeeeeeE-------R .    .   movq	24(%r12), %rdx
+[0,57]    .    .    .   D============eER.    .   adoxq	%r15, %r13
+[0,58]    .    .    .   D=====eeeeE----R.    .   mulxq	%rsi, %r12, %rsi
+[0,59]    .    .    .   D======E-------R.    .   xorl	%r14d, %r14d
+[0,60]    .    .    .    D========eE---R.    .   adoxq	%r12, %r9
+[0,61]    .    .    .    D=========eE--R.    .   adcxq	%rsi, %r8
+[0,62]    .    .    .    D=====eeeeE---R.    .   mulxq	%rbx, %rsi, %rbx
+[0,63]    .    .    .    D==========eE-R.    .   adoxq	%rsi, %r8
+[0,64]    .    .    .    .D==========eER.    .   adcxq	%rbx, %rax
+[0,65]    .    .    .    .D=====eeeeE--R.    .   mulxq	%r11, %r11, %rsi
+[0,66]    .    .    .    .DeeeeeE------R.    .   movq	-56(%rbp), %rbx
+[0,67]    .    .    .    .D===eE-------R.    .   movq	%rcx, 16(%rdi)
+[0,68]    .    .    .    . D==========eER    .   adcxq	%rsi, %r13
+[0,69]    .    .    .    . DeeeeeE------R    .   movq	-48(%rbp), %rsi
+[0,70]    .    .    .    . D====eE------R    .   movq	%rbx, 8(%rdi)
+[0,71]    .    .    .    . D===========eER   .   adoxq	%r11, %rax
+[0,72]    .    .    .    . D=======eE----R   .   movq	%r9, 24(%rdi)
+[0,73]    .    .    .    . D=========eE--R   .   movq	%r8, 32(%rdi)
+[0,74]    .    .    .    .  D===========eER  .   movq	%rax, 40(%rdi)
+[0,75]    .    .    .    .  D====eeeeE----R  .   mulxq	%r10, %rdx, %r10
+[0,76]    .    .    .    .  D===========eER  .   adoxq	%rdx, %r13
+[0,77]    .    .    .    .  D============eER .   adcxq	%r14, %r10
+[0,78]    .    .    .    .   D===========eER .   movq	%r13, 48(%rdi)
+[0,79]    .    .    .    .   D============eER.   adoxq	%r14, %r10
+[0,80]    .    .    .    .   D============eER.   movq	%rsi, (%rdi)
+[0,81]    .    .    .    .   D=============eER   movq	%r10, 56(%rdi)
+[0,82]    .    .    .    .   DeeeeeE---------R   movq	-8(%rbp), %r15
+[0,83]    .    .    .    .   DeeeeeE---------R   movq	-16(%rbp), %r14
+[0,84]    .    .    .    .    DeeeeeE--------R   movq	-24(%rbp), %r13
+[0,85]    .    .    .    .    DeeeeeE--------R   movq	-32(%rbp), %r12
+[0,86]    .    .    .    .    D=eeeeeE-------R   movq	-40(%rbp), %rbx
+[0,87]    .    .    .    .    D===eE---------R   addq	$56, %rsp
diff --git a/libc/nexgen32e/mul6x6adx.S b/libc/nexgen32e/mul6x6adx.S
index b90906014..313658bec 100644
--- a/libc/nexgen32e/mul6x6adx.S
+++ b/libc/nexgen32e/mul6x6adx.S
@@ -18,37 +18,50 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/macros.internal.h"
 
+//	Computes 768-bit product of 384-bit and 384-bit numbers.
+//
+//		Instructions:       153
+//		Total Cycles:        73
+//		Total uOps:         261
+//		uOps Per Cycle:    3.58
+//		IPC:               2.10
+//		Block RThroughput: 43.5
+//
+//	@param	rdi receives 12 quadword result
+//	@param	rsi is left hand side which must have 6 quadwords
+//	@param	rdx is right hand side which must have 6 quadwords
+//	@note	words are host endian while array is little endian
+//	@mayalias
 Mul6x6Adx:
 	push	%rbp
 	mov	%rsp,%rbp
 	.profilable
-	push	%r15
-	push	%r14
-	push	%r13
-	push	%r12
-	push	%rbx
+	sub	$64,%rsp
+	mov	%r15,-8(%rbp)
+	mov	%r14,-16(%rbp)
+	mov	%r13,-24(%rbp)
+	mov	%r12,-32(%rbp)
+	mov	%rbx,-40(%rbp)
 	mov	%rdx,%rbx
-	sub	$24,%rsp
 	mov	(%rdx),%rdx
-	xor	%r8d,%r8d
 	mulx	(%rsi),%rcx,%rax
 	mulx	8(%rsi),%rdx,%r12
 	mov	%rcx,-48(%rbp)
-	adox	%rdx,%rax
+	add	%rdx,%rax
 	mov	(%rbx),%rdx
 	mulx	16(%rsi),%rdx,%r15
-	adox	%rdx,%r12
+	adc	%rdx,%r12
 	mov	(%rbx),%rdx
 	mulx	24(%rsi),%rdx,%r10
-	adox	%rdx,%r15
+	adc	%rdx,%r15
 	mov	(%rbx),%rdx
 	mulx	32(%rsi),%rdx,%r9
-	adox	%rdx,%r10
+	adc	%rdx,%r10
 	mov	(%rbx),%rdx
 	mulx	40(%rsi),%rdx,%rcx
-	adox	%rdx,%r9
+	adc	%rdx,%r9
 	mov	8(%rbx),%rdx
-	adox	%r8,%rcx
+	adc	$0,%rcx
 	mulx	(%rsi),%r13,%r11
 	xor	%r8d,%r8d
 	adox	%r13,%rax
@@ -171,12 +184,167 @@ Mul6x6Adx:
 	mov	%r8,64(%rdi)
 	mov	%r11,72(%rdi)
 	mov	%rdx,88(%rdi)
-	add	$24,%rsp
-	pop	%rbx
-	pop	%r12
-	pop	%r13
-	pop	%r14
-	pop	%r15
-	pop	%rbp
+	mov	-8(%rbp),%r15
+	mov	-16(%rbp),%r14
+	mov	-24(%rbp),%r13
+	mov	-32(%rbp),%r12
+	mov	-40(%rbp),%rbx
+	leave
 	ret
 	.endfn	Mul6x6Adx,globl
+
+	.end
+SIMULATION          0123456789          0123456789          0123456789          012
+Index     0123456789          0123456789          0123456789          0123456789
+[0,0]     DeER .    .    .    .    .    .    .    .    .    .    .    .    .    . .   movq	%r15, -8(%rbp)
+[0,1]     D=eER.    .    .    .    .    .    .    .    .    .    .    .    .    . .   movq	%r14, -16(%rbp)
+[0,2]     D==eER    .    .    .    .    .    .    .    .    .    .    .    .    . .   movq	%r13, -24(%rbp)
+[0,3]     D===eER   .    .    .    .    .    .    .    .    .    .    .    .    . .   movq	%r12, -32(%rbp)
+[0,4]     D====eER  .    .    .    .    .    .    .    .    .    .    .    .    . .   movq	%rbx, -40(%rbp)
+[0,5]     DeE----R  .    .    .    .    .    .    .    .    .    .    .    .    . .   movq	%rdx, %rbx
+[0,6]     .DeeeeeER .    .    .    .    .    .    .    .    .    .    .    .    . .   movq	(%rdx), %rdx
+[0,7]     .D=====eeeeeeeeeER  .    .    .    .    .    .    .    .    .    .    . .   mulxq	(%rsi), %rcx, %rax
+[0,8]     . D=====eeeeeeeeeER .    .    .    .    .    .    .    .    .    .    . .   mulxq	8(%rsi), %rdx, %r12
+[0,9]     . D=======eE------R .    .    .    .    .    .    .    .    .    .    . .   movq	%rcx, -48(%rbp)
+[0,10]    . D=============eER .    .    .    .    .    .    .    .    .    .    . .   addq	%rdx, %rax
+[0,11]    .  DeeeeeE--------R .    .    .    .    .    .    .    .    .    .    . .   movq	(%rbx), %rdx
+[0,12]    .  D=====eeeeeeeeeER.    .    .    .    .    .    .    .    .    .    . .   mulxq	16(%rsi), %rdx, %r15
+[0,13]    .  D=============eER.    .    .    .    .    .    .    .    .    .    . .   adcq	%rdx, %r12
+[0,14]    .   DeeeeeE--------R.    .    .    .    .    .    .    .    .    .    . .   movq	(%rbx), %rdx
+[0,15]    .   D=====eeeeeeeeeER    .    .    .    .    .    .    .    .    .    . .   mulxq	24(%rsi), %rdx, %r10
+[0,16]    .   D=============eER    .    .    .    .    .    .    .    .    .    . .   adcq	%rdx, %r15
+[0,17]    .    DeeeeeE--------R    .    .    .    .    .    .    .    .    .    . .   movq	(%rbx), %rdx
+[0,18]    .    D=====eeeeeeeeeER   .    .    .    .    .    .    .    .    .    . .   mulxq	32(%rsi), %rdx, %r9
+[0,19]    .    D=============eER   .    .    .    .    .    .    .    .    .    . .   adcq	%rdx, %r10
+[0,20]    .    .DeeeeeE--------R   .    .    .    .    .    .    .    .    .    . .   movq	(%rbx), %rdx
+[0,21]    .    .D=====eeeeeeeeeER  .    .    .    .    .    .    .    .    .    . .   mulxq	40(%rsi), %rdx, %rcx
+[0,22]    .    .D=============eER  .    .    .    .    .    .    .    .    .    . .   adcq	%rdx, %r9
+[0,23]    .    . DeeeeeE--------R  .    .    .    .    .    .    .    .    .    . .   movq	8(%rbx), %rdx
+[0,24]    .    . D=============eER .    .    .    .    .    .    .    .    .    . .   adcq	$0, %rcx
+[0,25]    .    . D=====eeeeeeeeeER .    .    .    .    .    .    .    .    .    . .   mulxq	(%rsi), %r13, %r11
+[0,26]    .    .  D--------------R .    .    .    .    .    .    .    .    .    . .   xorl	%r8d, %r8d
+[0,27]    .    .  D========eE----R .    .    .    .    .    .    .    .    .    . .   adoxq	%r13, %rax
+[0,28]    .    .  D=============eER.    .    .    .    .    .    .    .    .    . .   adcxq	%r11, %r12
+[0,29]    .    .  D=========eE----R.    .    .    .    .    .    .    .    .    . .   movq	%rax, -56(%rbp)
+[0,30]    .    .   D====eeeeeeeeeER.    .    .    .    .    .    .    .    .    . .   mulxq	8(%rsi), %r11, %rax
+[0,31]    .    .   D=============eER    .    .    .    .    .    .    .    .    . .   adoxq	%r11, %r12
+[0,32]    .    .   D==============eER   .    .    .    .    .    .    .    .    . .   adcxq	%rax, %r15
+[0,33]    .    .    D=============eER   .    .    .    .    .    .    .    .    . .   movq	%r12, %r14
+[0,34]    .    .    D====eeeeeeeeeE-R   .    .    .    .    .    .    .    .    . .   mulxq	16(%rsi), %r11, %rax
+[0,35]    .    .    D==============eER  .    .    .    .    .    .    .    .    . .   adoxq	%r11, %r15
+[0,36]    .    .    .D==============eER .    .    .    .    .    .    .    .    . .   adcxq	%rax, %r10
+[0,37]    .    .    .D====eeeeeeeeeE--R .    .    .    .    .    .    .    .    . .   mulxq	24(%rsi), %r11, %rax
+[0,38]    .    .    .D===============eER.    .    .    .    .    .    .    .    . .   adoxq	%r11, %r10
+[0,39]    .    .    . D===============eER    .    .    .    .    .    .    .    . .   adcxq	%rax, %r9
+[0,40]    .    .    . D====eeeeeeeeeE---R    .    .    .    .    .    .    .    . .   mulxq	32(%rsi), %r11, %rax
+[0,41]    .    .    . D================eER   .    .    .    .    .    .    .    . .   adoxq	%r11, %r9
+[0,42]    .    .    .  D================eER  .    .    .    .    .    .    .    . .   adcxq	%rax, %rcx
+[0,43]    .    .    .  D====eeeeeeeeeE----R  .    .    .    .    .    .    .    . .   mulxq	40(%rsi), %rdx, %rax
+[0,44]    .    .    .  D=================eER .    .    .    .    .    .    .    . .   adoxq	%rdx, %rcx
+[0,45]    .    .    .   D=================eER.    .    .    .    .    .    .    . .   adcxq	%r8, %rax
+[0,46]    .    .    .   DeeeeeE-------------R.    .    .    .    .    .    .    . .   movq	16(%rbx), %rdx
+[0,47]    .    .    .   D==================eER    .    .    .    .    .    .    . .   adoxq	%r8, %rax
+[0,48]    .    .    .    D====eeeeeeeeeE-----R    .    .    .    .    .    .    . .   mulxq	(%rsi), %r13, %r8
+[0,49]    .    .    .    D====E--------------R    .    .    .    .    .    .    . .   xorl	%r11d, %r11d
+[0,50]    .    .    .    D=========eE--------R    .    .    .    .    .    .    . .   adoxq	%r13, %r14
+[0,51]    .    .    .    .D=========eE-------R    .    .    .    .    .    .    . .   movq	%r14, -64(%rbp)
+[0,52]    .    .    .    .D============eE----R    .    .    .    .    .    .    . .   adcxq	%r8, %r15
+[0,53]    .    .    .    .D====eeeeeeeeeE----R    .    .    .    .    .    .    . .   mulxq	8(%rsi), %r12, %r8
+[0,54]    .    .    .    . D============eE---R    .    .    .    .    .    .    . .   adoxq	%r12, %r15
+[0,55]    .    .    .    . D=============eE--R    .    .    .    .    .    .    . .   adcxq	%r8, %r10
+[0,56]    .    .    .    . D====eeeeeeeeeE---R    .    .    .    .    .    .    . .   mulxq	16(%rsi), %r12, %r8
+[0,57]    .    .    .    .  D=============eE-R    .    .    .    .    .    .    . .   adoxq	%r12, %r10
+[0,58]    .    .    .    .  D==============eER    .    .    .    .    .    .    . .   adcxq	%r8, %r9
+[0,59]    .    .    .    .  D====eeeeeeeeeE--R    .    .    .    .    .    .    . .   mulxq	24(%rsi), %r12, %r8
+[0,60]    .    .    .    .   D==============eER   .    .    .    .    .    .    . .   adoxq	%r12, %r9
+[0,61]    .    .    .    .   D===============eER  .    .    .    .    .    .    . .   adcxq	%r8, %rcx
+[0,62]    .    .    .    .   D====eeeeeeeeeE---R  .    .    .    .    .    .    . .   mulxq	32(%rsi), %r12, %r8
+[0,63]    .    .    .    .    D===============eER .    .    .    .    .    .    . .   adoxq	%r12, %rcx
+[0,64]    .    .    .    .    D================eER.    .    .    .    .    .    . .   adcxq	%r8, %rax
+[0,65]    .    .    .    .    D====eeeeeeeeeE----R.    .    .    .    .    .    . .   mulxq	40(%rsi), %rdx, %r8
+[0,66]    .    .    .    .    .D================eER    .    .    .    .    .    . .   adoxq	%rdx, %rax
+[0,67]    .    .    .    .    .D=================eER   .    .    .    .    .    . .   adcxq	%r11, %r8
+[0,68]    .    .    .    .    .DeeeeeE-------------R   .    .    .    .    .    . .   movq	24(%rbx), %rdx
+[0,69]    .    .    .    .    .D==================eER  .    .    .    .    .    . .   adoxq	%r11, %r8
+[0,70]    .    .    .    .    . D====eeeeeeeeeE-----R  .    .    .    .    .    . .   mulxq	(%rsi), %r13, %r11
+[0,71]    .    .    .    .    . D====E--------------R  .    .    .    .    .    . .   xorl	%r12d, %r12d
+[0,72]    .    .    .    .    . D===========eE------R  .    .    .    .    .    . .   adoxq	%r13, %r15
+[0,73]    .    .    .    .    .  D============eE----R  .    .    .    .    .    . .   adcxq	%r11, %r10
+[0,74]    .    .    .    .    .  D====eeeeeeeeeE----R  .    .    .    .    .    . .   mulxq	8(%rsi), %r13, %r11
+[0,75]    .    .    .    .    .  D=============eE---R  .    .    .    .    .    . .   adoxq	%r13, %r10
+[0,76]    .    .    .    .    .   D=============eE--R  .    .    .    .    .    . .   adcxq	%r11, %r9
+[0,77]    .    .    .    .    .   D====eeeeeeeeeE---R  .    .    .    .    .    . .   mulxq	16(%rsi), %r13, %r11
+[0,78]    .    .    .    .    .   D==============eE-R  .    .    .    .    .    . .   adoxq	%r13, %r9
+[0,79]    .    .    .    .    .    D==============eER  .    .    .    .    .    . .   adcxq	%r11, %rcx
+[0,80]    .    .    .    .    .    D====eeeeeeeeeE--R  .    .    .    .    .    . .   mulxq	24(%rsi), %r13, %r11
+[0,81]    .    .    .    .    .    D===============eER .    .    .    .    .    . .   adoxq	%r13, %rcx
+[0,82]    .    .    .    .    .    .D===============eER.    .    .    .    .    . .   adcxq	%r11, %rax
+[0,83]    .    .    .    .    .    .D====eeeeeeeeeE---R.    .    .    .    .    . .   mulxq	32(%rsi), %r13, %r11
+[0,84]    .    .    .    .    .    .D================eER    .    .    .    .    . .   adoxq	%r13, %rax
+[0,85]    .    .    .    .    .    . D================eER   .    .    .    .    . .   adcxq	%r11, %r8
+[0,86]    .    .    .    .    .    . D====eeeeeeeeeE----R   .    .    .    .    . .   mulxq	40(%rsi), %rdx, %r11
+[0,87]    .    .    .    .    .    . D=================eER  .    .    .    .    . .   adoxq	%rdx, %r8
+[0,88]    .    .    .    .    .    .  DeeeeeE------------R  .    .    .    .    . .   movq	32(%rbx), %rdx
+[0,89]    .    .    .    .    .    .  D=================eER .    .    .    .    . .   adcxq	%r12, %r11
+[0,90]    .    .    .    .    .    .  D=====eeeeeeeeeE----R .    .    .    .    . .   mulxq	(%rsi), %r14, %r13
+[0,91]    .    .    .    .    .    .   D=================eER.    .    .    .    . .   adoxq	%r12, %r11
+[0,92]    .    .    .    .    .    .   D-------------------R.    .    .    .    . .   xorl	%r12d, %r12d
+[0,93]    .    .    .    .    .    .   D===========eE------R.    .    .    .    . .   adoxq	%r14, %r10
+[0,94]    .    .    .    .    .    .   D=============eE----R.    .    .    .    . .   adcxq	%r13, %r9
+[0,95]    .    .    .    .    .    .    D====eeeeeeeeeE----R.    .    .    .    . .   mulxq	8(%rsi), %r14, %r13
+[0,96]    .    .    .    .    .    .    D=============eE---R.    .    .    .    . .   adoxq	%r14, %r9
+[0,97]    .    .    .    .    .    .    D==============eE--R.    .    .    .    . .   adcxq	%r13, %rcx
+[0,98]    .    .    .    .    .    .    .D====eeeeeeeeeE---R.    .    .    .    . .   mulxq	16(%rsi), %r14, %r13
+[0,99]    .    .    .    .    .    .    .D==============eE-R.    .    .    .    . .   adoxq	%r14, %rcx
+[0,100]   .    .    .    .    .    .    .D===============eER.    .    .    .    . .   adcxq	%r13, %rax
+[0,101]   .    .    .    .    .    .    . D====eeeeeeeeeE--R.    .    .    .    . .   mulxq	24(%rsi), %r14, %r13
+[0,102]   .    .    .    .    .    .    . D===============eER    .    .    .    . .   adoxq	%r14, %rax
+[0,103]   .    .    .    .    .    .    . D================eER   .    .    .    . .   adcxq	%r13, %r8
+[0,104]   .    .    .    .    .    .    .  D====eeeeeeeeeE---R   .    .    .    . .   mulxq	32(%rsi), %r14, %r13
+[0,105]   .    .    .    .    .    .    .  D================eER  .    .    .    . .   adoxq	%r14, %r8
+[0,106]   .    .    .    .    .    .    .  D=================eER .    .    .    . .   adcxq	%r13, %r11
+[0,107]   .    .    .    .    .    .    .   D====eeeeeeeeeE----R .    .    .    . .   mulxq	40(%rsi), %rdx, %r13
+[0,108]   .    .    .    .    .    .    .   D=================eER.    .    .    . .   adoxq	%rdx, %r11
+[0,109]   .    .    .    .    .    .    .   D==================eER    .    .    . .   adcxq	%r12, %r13
+[0,110]   .    .    .    .    .    .    .    DeeeeeE-------------R    .    .    . .   movq	40(%rbx), %rdx
+[0,111]   .    .    .    .    .    .    .    D==================eER   .    .    . .   adoxq	%r12, %r13
+[0,112]   .    .    .    .    .    .    .    D=====eeeeeeeeeE-----R   .    .    . .   mulxq	(%rsi), %r14, %rbx
+[0,113]   .    .    .    .    .    .    .    .D-------------------R   .    .    . .   xorl	%r12d, %r12d
+[0,114]   .    .    .    .    .    .    .    .D===========eE------R   .    .    . .   adoxq	%r14, %r9
+[0,115]   .    .    .    .    .    .    .    .D=============eE----R   .    .    . .   adcxq	%rbx, %rcx
+[0,116]   .    .    .    .    .    .    .    . D====eeeeeeeeeE----R   .    .    . .   mulxq	8(%rsi), %r14, %rbx
+[0,117]   .    .    .    .    .    .    .    . D=============eE---R   .    .    . .   adoxq	%r14, %rcx
+[0,118]   .    .    .    .    .    .    .    . D==============eE--R   .    .    . .   adcxq	%rbx, %rax
+[0,119]   .    .    .    .    .    .    .    .  D====eeeeeeeeeE---R   .    .    . .   mulxq	16(%rsi), %r14, %rbx
+[0,120]   .    .    .    .    .    .    .    .  D==============eE-R   .    .    . .   adoxq	%r14, %rax
+[0,121]   .    .    .    .    .    .    .    .  D===============eER   .    .    . .   adcxq	%rbx, %r8
+[0,122]   .    .    .    .    .    .    .    .   D====eeeeeeeeeE--R   .    .    . .   mulxq	24(%rsi), %r14, %rbx
+[0,123]   .    .    .    .    .    .    .    .   D===============eER  .    .    . .   adoxq	%r14, %r8
+[0,124]   .    .    .    .    .    .    .    .   D================eER .    .    . .   adcxq	%rbx, %r11
+[0,125]   .    .    .    .    .    .    .    .    D====eeeeeeeeeE---R .    .    . .   mulxq	32(%rsi), %r14, %rbx
+[0,126]   .    .    .    .    .    .    .    .    .D====eeeeeeeeeE--R .    .    . .   mulxq	40(%rsi), %rsi, %rdx
+[0,127]   .    .    .    .    .    .    .    .    .D===============eER.    .    . .   adoxq	%r14, %r11
+[0,128]   .    .    .    .    .    .    .    .    .D================eER    .    . .   adcxq	%rbx, %r13
+[0,129]   .    .    .    .    .    .    .    .    . D================eER   .    . .   adoxq	%rsi, %r13
+[0,130]   .    .    .    .    .    .    .    .    . D=================eER  .    . .   adcxq	%r12, %rdx
+[0,131]   .    .    .    .    .    .    .    .    . D==================eER .    . .   adoxq	%r12, %rdx
+[0,132]   .    .    .    .    .    .    .    .    . DeeeeeE--------------R .    . .   movq	-48(%rbp), %rsi
+[0,133]   .    .    .    .    .    .    .    .    . D=eeeeeE-------------R .    . .   movq	-56(%rbp), %rbx
+[0,134]   .    .    .    .    .    .    .    .    . D===eE---------------R .    . .   movq	%r15, 24(%rdi)
+[0,135]   .    .    .    .    .    .    .    .    .  D=eeeeeE------------R .    . .   movq	-64(%rbp), %r14
+[0,136]   .    .    .    .    .    .    .    .    .  D================eE-R .    . .   movq	%r13, 80(%rdi)
+[0,137]   .    .    .    .    .    .    .    .    .  D=================eER .    . .   movq	%rbx, 8(%rdi)
+[0,138]   .    .    .    .    .    .    .    .    .  D==================eER.    . .   movq	%r14, 16(%rdi)
+[0,139]   .    .    .    .    .    .    .    .    .  D===================eER    . .   movq	%rsi, (%rdi)
+[0,140]   .    .    .    .    .    .    .    .    .  D====================eER   . .   movq	%r10, 32(%rdi)
+[0,141]   .    .    .    .    .    .    .    .    .   D====================eER  . .   movq	%r9, 40(%rdi)
+[0,142]   .    .    .    .    .    .    .    .    .   D=====================eER . .   movq	%rcx, 48(%rdi)
+[0,143]   .    .    .    .    .    .    .    .    .   D======================eER. .   movq	%rax, 56(%rdi)
+[0,144]   .    .    .    .    .    .    .    .    .   D=======================eER .   movq	%r8, 64(%rdi)
+[0,145]   .    .    .    .    .    .    .    .    .   D========================eER.   movq	%r11, 72(%rdi)
+[0,146]   .    .    .    .    .    .    .    .    .   D=========================eER   movq	%rdx, 88(%rdi)
+[0,147]   .    .    .    .    .    .    .    .    .    DeeeeeE--------------------R   movq	-8(%rbp), %r15
+[0,148]   .    .    .    .    .    .    .    .    .    D=eeeeeE-------------------R   movq	-16(%rbp), %r14
+[0,149]   .    .    .    .    .    .    .    .    .    D=eeeeeE-------------------R   movq	-24(%rbp), %r13
+[0,150]   .    .    .    .    .    .    .    .    .    D==eeeeeE------------------R   movq	-32(%rbp), %r12
+[0,151]   .    .    .    .    .    .    .    .    .    D==eeeeeE------------------R   movq	-40(%rbp), %rbx
diff --git a/libc/nexgen32e/mul8x8.S b/libc/nexgen32e/mul8x8.S
deleted file mode 100644
index 3ad62baf5..000000000
--- a/libc/nexgen32e/mul8x8.S
+++ /dev/null
@@ -1,483 +0,0 @@
-/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
-│vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.internal.h"
-
-/	Computes 1024-bit product of 512-bit and 512-bit numbers.
-/
-/		Instructions:      262
-/		Total Cycles:      114
-/		Total uOps:        469
-/		Dispatch Width:    6
-/		uOps Per Cycle:    4.11
-/		IPC:               2.30
-/		Block RThroughput: 78.2
-/
-/	@param	rdi receives 16 quadword result
-/	@param	rsi is left hand side which must have 8 quadwords
-/	@param	rdx is right hand side which must have 8 quadwords
-/	@note	words are host endian while array is little endian
-/	@mayalias
-Mul8x8Adx:
-	push	%rbp
-	mov	%rsp,%rbp
-	.profilable
-	push	%r15
-	push	%r14
-	push	%r13
-	push	%r12
-	mov	%rdx,%r12
-	push	%rbx
-	sub	$64,%rsp
-	mov	(%rdx),%rdx
-	xor	%r13d,%r13d
-	mulx	(%rsi),%rax,%rcx
-	mov	%rdi,-48(%rbp)
-	mov	%rax,-56(%rbp)
-	mulx	8(%rsi),%rdx,%rax
-	adox	%rdx,%rcx
-	mov	(%r12),%rdx
-	mulx	16(%rsi),%rdx,%rbx
-	adox	%rdx,%rax
-	mov	(%r12),%rdx
-	mulx	24(%rsi),%rdx,%r11
-	adox	%rdx,%rbx
-	mov	(%r12),%rdx
-	mulx	32(%rsi),%rdx,%r10
-	adox	%rdx,%r11
-	mov	(%r12),%rdx
-	mulx	40(%rsi),%rdx,%r9
-	adox	%rdx,%r10
-	mov	(%r12),%rdx
-	mulx	48(%rsi),%rdx,%r8
-	adox	%rdx,%r9
-	mov	(%r12),%rdx
-	mulx	56(%rsi),%rdx,%rdi
-	adox	%rdx,%r8
-	adox	%r13,%rdi
-	xor	%r13d,%r13d
-	mov	8(%r12),%rdx
-	mulx	(%rsi),%r15,%r14
-	adox	%r15,%rcx
-	adcx	%r14,%rax
-	mov	%rcx,-64(%rbp)
-	mulx	8(%rsi),%r14,%rcx
-	adox	%r14,%rax
-	adcx	%rcx,%rbx
-	mulx	16(%rsi),%r14,%rcx
-	adox	%r14,%rbx
-	adcx	%rcx,%r11
-	mulx	24(%rsi),%r14,%rcx
-	adox	%r14,%r11
-	adcx	%rcx,%r10
-	mulx	32(%rsi),%r14,%rcx
-	adox	%r14,%r10
-	adcx	%rcx,%r9
-	mulx	40(%rsi),%r14,%rcx
-	adox	%r14,%r9
-	adcx	%rcx,%r8
-	mulx	48(%rsi),%r14,%rcx
-	adox	%r14,%r8
-	adcx	%rcx,%rdi
-	mulx	56(%rsi),%rdx,%rcx
-	adox	%rdx,%rdi
-	adcx	%r13,%rcx
-	mov	16(%r12),%rdx
-	adox	%r13,%rcx
-	mulx	(%rsi),%r15,%r14
-	xor	%r13d,%r13d
-	adox	%r15,%rax
-	adcx	%r14,%rbx
-	mov	%rax,-72(%rbp)
-	mulx	8(%rsi),%r14,%rax
-	adox	%r14,%rbx
-	adcx	%rax,%r11
-	mulx	16(%rsi),%r14,%rax
-	adox	%r14,%r11
-	adcx	%rax,%r10
-	mulx	24(%rsi),%r14,%rax
-	adox	%r14,%r10
-	adcx	%rax,%r9
-	mulx	32(%rsi),%r14,%rax
-	adox	%r14,%r9
-	adcx	%rax,%r8
-	mulx	40(%rsi),%r14,%rax
-	adox	%r14,%r8
-	adcx	%rax,%rdi
-	mulx	48(%rsi),%r14,%rax
-	adox	%r14,%rdi
-	adcx	%rax,%rcx
-	mulx	56(%rsi),%rdx,%rax
-	adox	%rdx,%rcx
-	adcx	%r13,%rax
-	adox	%r13,%rax
-	xor	%r13d,%r13d
-	mov	24(%r12),%rdx
-	mulx	(%rsi),%r15,%r14
-	adox	%r15,%rbx
-	adcx	%r14,%r11
-	mov	%rbx,-80(%rbp)
-	mov	%r11,%r15
-	mulx	8(%rsi),%r14,%rbx
-	adox	%r14,%r15
-	adcx	%rbx,%r10
-	mulx	16(%rsi),%rbx,%r11
-	adox	%rbx,%r10
-	adcx	%r11,%r9
-	mulx	24(%rsi),%rbx,%r11
-	adox	%rbx,%r9
-	adcx	%r11,%r8
-	mulx	32(%rsi),%rbx,%r11
-	adox	%rbx,%r8
-	adcx	%r11,%rdi
-	mulx	40(%rsi),%rbx,%r11
-	adox	%rbx,%rdi
-	adcx	%r11,%rcx
-	mulx	48(%rsi),%rbx,%r11
-	adox	%rbx,%rcx
-	adcx	%r11,%rax
-	mulx	56(%rsi),%rdx,%r11
-	adox	%rdx,%rax
-	adcx	%r13,%r11
-	mov	32(%r12),%rdx
-	adox	%r13,%r11
-	xor	%ebx,%ebx
-	mulx	(%rsi),%r14,%r13
-	adox	%r14,%r15
-	adcx	%r13,%r10
-	mov	%r15,-88(%rbp)
-	mulx	8(%rsi),%r14,%r13
-	mov	%r10,%r15
-	adcx	%r13,%r9
-	adox	%r14,%r15
-	mulx	16(%rsi),%r13,%r10
-	adox	%r13,%r9
-	adcx	%r10,%r8
-	mulx	24(%rsi),%r13,%r10
-	adcx	%r10,%rdi
-	adox	%r13,%r8
-	mulx	32(%rsi),%r13,%r10
-	adox	%r13,%rdi
-	adcx	%r10,%rcx
-	mulx	40(%rsi),%r13,%r10
-	adox	%r13,%rcx
-	adcx	%r10,%rax
-	mulx	48(%rsi),%r13,%r10
-	adox	%r13,%rax
-	adcx	%r10,%r11
-	mulx	56(%rsi),%rdx,%r10
-	adox	%rdx,%r11
-	adcx	%rbx,%r10
-	mov	40(%r12),%rdx
-	adox	%rbx,%r10
-	mulx	(%rsi),%r14,%r13
-	xor	%ebx,%ebx
-	adox	%r14,%r15
-	mov	%r15,-96(%rbp)
-	adcx	%r13,%r9
-	mulx	8(%rsi),%r14,%r13
-	mov	%r9,%r15
-	adox	%r14,%r15
-	adcx	%r13,%r8
-	mulx	16(%rsi),%r13,%r9
-	adox	%r13,%r8
-	adcx	%r9,%rdi
-	mulx	24(%rsi),%r13,%r9
-	adox	%r13,%rdi
-	adcx	%r9,%rcx
-	mulx	32(%rsi),%r13,%r9
-	adox	%r13,%rcx
-	adcx	%r9,%rax
-	mulx	40(%rsi),%r13,%r9
-	adox	%r13,%rax
-	adcx	%r9,%r11
-	mulx	48(%rsi),%r13,%r9
-	adox	%r13,%r11
-	adcx	%r9,%r10
-	mulx	56(%rsi),%rdx,%r9
-	adox	%rdx,%r10
-	adcx	%rbx,%r9
-	adox	%rbx,%r9
-	xor	%ebx,%ebx
-	mov	48(%r12),%rdx
-	mulx	(%rsi),%r14,%r13
-	adox	%r14,%r15
-	adcx	%r13,%r8
-	mov	%r15,-104(%rbp)
-	mulx	8(%rsi),%r14,%r13
-	mov	%r8,%r15
-	adcx	%r13,%rdi
-	adox	%r14,%r15
-	mulx	16(%rsi),%r13,%r8
-	adox	%r13,%rdi
-	adcx	%r8,%rcx
-	mulx	24(%rsi),%r13,%r8
-	adox	%r13,%rcx
-	adcx	%r8,%rax
-	mulx	32(%rsi),%r13,%r8
-	adox	%r13,%rax
-	adcx	%r8,%r11
-	mulx	40(%rsi),%r13,%r8
-	adox	%r13,%r11
-	adcx	%r8,%r10
-	mulx	48(%rsi),%r13,%r8
-	adox	%r13,%r10
-	adcx	%r8,%r9
-	mulx	56(%rsi),%rdx,%r8
-	adox	%rdx,%r9
-	mov	56(%r12),%rdx
-	adcx	%rbx,%r8
-	mulx	(%rsi),%r13,%r12
-	adox	%rbx,%r8
-	xor	%ebx,%ebx
-	adox	%r13,%r15
-	adcx	%r12,%rdi
-	mulx	8(%rsi),%r13,%r12
-	adox	%r13,%rdi
-	adcx	%r12,%rcx
-	mulx	16(%rsi),%r13,%r12
-	adox	%r13,%rcx
-	adcx	%r12,%rax
-	mulx	24(%rsi),%r13,%r12
-	adox	%r13,%rax
-	adcx	%r12,%r11
-	mulx	32(%rsi),%r13,%r12
-	adox	%r13,%r11
-	adcx	%r12,%r10
-	mulx	40(%rsi),%r13,%r12
-	adox	%r13,%r10
-	adcx	%r12,%r9
-	mulx	48(%rsi),%r13,%r12
-	mulx	56(%rsi),%rsi,%rdx
-	adox	%r13,%r9
-	adcx	%r12,%r8
-	adox	%rsi,%r8
-	adcx	%rbx,%rdx
-	mov	-64(%rbp),%rsi
-	adox	%rbx,%rdx
-	mov	-48(%rbp),%rbx
-	mov	-56(%rbp),%r14
-	mov	%rsi,8(%rbx)
-	mov	-72(%rbp),%rsi
-	mov	%r14,(%rbx)
-	mov	%rsi,16(%rbx)
-	mov	-80(%rbp),%rsi
-	mov	%rsi,24(%rbx)
-	mov	-88(%rbp),%rsi
-	mov	%rsi,32(%rbx)
-	mov	-96(%rbp),%rsi
-	mov	%rsi,40(%rbx)
-	mov	-104(%rbp),%rsi
-	mov	%r15,56(%rbx)
-	mov	%rsi,48(%rbx)
-	mov	%rdi,64(%rbx)
-	mov	%rcx,72(%rbx)
-	mov	%rax,80(%rbx)
-	mov	%r11,88(%rbx)
-	mov	%r10,96(%rbx)
-	mov	%r9,104(%rbx)
-	mov	%r8,112(%rbx)
-	mov	%rdx,120(%rbx)
-	add	$64,%rsp
-	pop	%rbx
-	pop	%r12
-	pop	%r13
-	pop	%r14
-	pop	%r15
-	pop	%rbp
-	ret
-	.endfn	Mul8x8Adx,globl
-
-	.end
-Timeline view:      0123456789          0123456789          0123456789          0123456789
-Index     0123456789          0123456789          0123456789          0123456789
-[0,0]     DeeER.    .    .    .    .    .    .    .    .    .    .    .    .    .    .   .   pushq	%r15
-[0,1]     D==eeER   .    .    .    .    .    .    .    .    .    .    .    .    .    .   .   pushq	%r14
-[0,2]     .D===eeER .    .    .    .    .    .    .    .    .    .    .    .    .    .   .   pushq	%r13
-[0,3]     .D=====eeER    .    .    .    .    .    .    .    .    .    .    .    .    .   .   pushq	%r12
-[0,4]     . DeE-----R    .    .    .    .    .    .    .    .    .    .    .    .    .   .   movq	%rdx, %r12
-[0,5]     . D======eeER  .    .    .    .    .    .    .    .    .    .    .    .    .   .   pushq	%rbx
-[0,6]     . D========eER .    .    .    .    .    .    .    .    .    .    .    .    .   .   subq	$64, %rsp
-[0,7]     . DeeeeeE----R .    .    .    .    .    .    .    .    .    .    .    .    .   .   movq	(%rdx), %rdx
-[0,8]     .  D---------R .    .    .    .    .    .    .    .    .    .    .    .    .   .   xorl	%r13d, %r13d
-[0,9]     .  D====eeeeeeeeeER .    .    .    .    .    .    .    .    .    .    .    .   .   mulxq	(%rsi), %rax, %rcx
-[0,10]    .  D======eE------R .    .    .    .    .    .    .    .    .    .    .    .   .   movq	%rdi, -48(%rbp)
-[0,11]    .   D======eE-----R .    .    .    .    .    .    .    .    .    .    .    .   .   movq	%rax, -56(%rbp)
-[0,12]    .   D====eeeeeeeeeER.    .    .    .    .    .    .    .    .    .    .    .   .   mulxq	8(%rsi), %rdx, %rax
-[0,13]    .   D============eER.    .    .    .    .    .    .    .    .    .    .    .   .   adoxq	%rdx, %rcx
-[0,14]    .    DeeeeeE-------R.    .    .    .    .    .    .    .    .    .    .    .   .   movq	(%r12), %rdx
-[0,15]    .    D=====eeeeeeeeeER   .    .    .    .    .    .    .    .    .    .    .   .   mulxq	16(%rsi), %rdx, %rbx
-[0,16]    .    D============eE-R   .    .    .    .    .    .    .    .    .    .    .   .   adoxq	%rdx, %rax
-[0,17]    .    .DeeeeeE--------R   .    .    .    .    .    .    .    .    .    .    .   .   movq	(%r12), %rdx
-[0,18]    .    .D=====eeeeeeeeeER  .    .    .    .    .    .    .    .    .    .    .   .   mulxq	24(%rsi), %rdx, %r11
-[0,19]    .    .D=============eER  .    .    .    .    .    .    .    .    .    .    .   .   adoxq	%rdx, %rbx
-[0,20]    .    . DeeeeeE--------R  .    .    .    .    .    .    .    .    .    .    .   .   movq	(%r12), %rdx
-[0,21]    .    . D=====eeeeeeeeeER .    .    .    .    .    .    .    .    .    .    .   .   mulxq	32(%rsi), %rdx, %r10
-[0,22]    .    . D=============eER .    .    .    .    .    .    .    .    .    .    .   .   adoxq	%rdx, %r11
-[0,23]    .    .  DeeeeeE--------R .    .    .    .    .    .    .    .    .    .    .   .   movq	(%r12), %rdx
-[0,24]    .    .  D=====eeeeeeeeeER.    .    .    .    .    .    .    .    .    .    .   .   mulxq	40(%rsi), %rdx, %r9
-[0,25]    .    .  D=============eER.    .    .    .    .    .    .    .    .    .    .   .   adoxq	%rdx, %r10
-[0,26]    .    .   DeeeeeE--------R.    .    .    .    .    .    .    .    .    .    .   .   movq	(%r12), %rdx
-[0,27]    .    .   D=====eeeeeeeeeER    .    .    .    .    .    .    .    .    .    .   .   mulxq	48(%rsi), %rdx, %r8
-[0,28]    .    .   D=============eER    .    .    .    .    .    .    .    .    .    .   .   adoxq	%rdx, %r9
-[0,29]    .    .    DeeeeeE--------R    .    .    .    .    .    .    .    .    .    .   .   movq	(%r12), %rdx
-[0,30]    .    .    D=====eeeeeeeeeER   .    .    .    .    .    .    .    .    .    .   .   mulxq	56(%rsi), %rdx, %rdi
-[0,31]    .    .    D=============eER   .    .    .    .    .    .    .    .    .    .   .   adoxq	%rdx, %r8
-[0,32]    .    .    .D=============eER  .    .    .    .    .    .    .    .    .    .   .   adoxq	%r13, %rdi
-[0,33]    .    .    .D---------------R  .    .    .    .    .    .    .    .    .    .   .   xorl	%r13d, %r13d
-[0,34]    .    .    .DeeeeeE---------R  .    .    .    .    .    .    .    .    .    .   .   movq	8(%r12), %rdx
-[0,35]    .    .    . D====eeeeeeeeeER  .    .    .    .    .    .    .    .    .    .   .   mulxq	(%rsi), %r15, %r14
-[0,36]    .    .    . D=======eE-----R  .    .    .    .    .    .    .    .    .    .   .   adoxq	%r15, %rcx
-[0,37]    .    .    . D=============eER .    .    .    .    .    .    .    .    .    .   .   adcxq	%r14, %rax
-[0,38]    .    .    .  D=======eE-----R .    .    .    .    .    .    .    .    .    .   .   movq	%rcx, -64(%rbp)
-[0,39]    .    .    .  D====eeeeeeeeeER .    .    .    .    .    .    .    .    .    .   .   mulxq	8(%rsi), %r14, %rcx
-[0,40]    .    .    .  D=============eER.    .    .    .    .    .    .    .    .    .   .   adoxq	%r14, %rax
-[0,41]    .    .    .   D=============eER    .    .    .    .    .    .    .    .    .   .   adcxq	%rcx, %rbx
-[0,42]    .    .    .   D====eeeeeeeeeE-R    .    .    .    .    .    .    .    .    .   .   mulxq	16(%rsi), %r14, %rcx
-[0,43]    .    .    .   D==============eER   .    .    .    .    .    .    .    .    .   .   adoxq	%r14, %rbx
-[0,44]    .    .    .    D==============eER  .    .    .    .    .    .    .    .    .   .   adcxq	%rcx, %r11
-[0,45]    .    .    .    D====eeeeeeeeeE--R  .    .    .    .    .    .    .    .    .   .   mulxq	24(%rsi), %r14, %rcx
-[0,46]    .    .    .    D===============eER .    .    .    .    .    .    .    .    .   .   adoxq	%r14, %r11
-[0,47]    .    .    .    .D===============eER.    .    .    .    .    .    .    .    .   .   adcxq	%rcx, %r10
-[0,48]    .    .    .    .D====eeeeeeeeeE---R.    .    .    .    .    .    .    .    .   .   mulxq	32(%rsi), %r14, %rcx
-[0,49]    .    .    .    .D================eER    .    .    .    .    .    .    .    .   .   adoxq	%r14, %r10
-[0,50]    .    .    .    . D================eER   .    .    .    .    .    .    .    .   .   adcxq	%rcx, %r9
-[0,51]    .    .    .    . D====eeeeeeeeeE----R   .    .    .    .    .    .    .    .   .   mulxq	40(%rsi), %r14, %rcx
-[0,52]    .    .    .    . D=================eER  .    .    .    .    .    .    .    .   .   adoxq	%r14, %r9
-[0,53]    .    .    .    .  D=================eER .    .    .    .    .    .    .    .   .   adcxq	%rcx, %r8
-[0,54]    .    .    .    .  D====eeeeeeeeeE-----R .    .    .    .    .    .    .    .   .   mulxq	48(%rsi), %r14, %rcx
-[0,55]    .    .    .    .  D==================eER.    .    .    .    .    .    .    .   .   adoxq	%r14, %r8
-[0,56]    .    .    .    .   D==================eER    .    .    .    .    .    .    .   .   adcxq	%rcx, %rdi
-[0,57]    .    .    .    .   D====eeeeeeeeeE------R    .    .    .    .    .    .    .   .   mulxq	56(%rsi), %rdx, %rcx
-[0,58]    .    .    .    .   D===================eER   .    .    .    .    .    .    .   .   adoxq	%rdx, %rdi
-[0,59]    .    .    .    .    D===================eER  .    .    .    .    .    .    .   .   adcxq	%r13, %rcx
-[0,60]    .    .    .    .    DeeeeeE---------------R  .    .    .    .    .    .    .   .   movq	16(%r12), %rdx
-[0,61]    .    .    .    .    D====================eER .    .    .    .    .    .    .   .   adoxq	%r13, %rcx
-[0,62]    .    .    .    .    .D====eeeeeeeeeE-------R .    .    .    .    .    .    .   .   mulxq	(%rsi), %r15, %r14
-[0,63]    .    .    .    .    .D---------------------R .    .    .    .    .    .    .   .   xorl	%r13d, %r13d
-[0,64]    .    .    .    .    .D=======eE------------R .    .    .    .    .    .    .   .   adoxq	%r15, %rax
-[0,65]    .    .    .    .    . D============eE------R .    .    .    .    .    .    .   .   adcxq	%r14, %rbx
-[0,66]    .    .    .    .    . D=======eE-----------R .    .    .    .    .    .    .   .   movq	%rax, -72(%rbp)
-[0,67]    .    .    .    .    . D====eeeeeeeeeE------R .    .    .    .    .    .    .   .   mulxq	8(%rsi), %r14, %rax
-[0,68]    .    .    .    .    .  D============eE-----R .    .    .    .    .    .    .   .   adoxq	%r14, %rbx
-[0,69]    .    .    .    .    .  D=============eE----R .    .    .    .    .    .    .   .   adcxq	%rax, %r11
-[0,70]    .    .    .    .    .  D====eeeeeeeeeE-----R .    .    .    .    .    .    .   .   mulxq	16(%rsi), %r14, %rax
-[0,71]    .    .    .    .    .   D=============eE---R .    .    .    .    .    .    .   .   adoxq	%r14, %r11
-[0,72]    .    .    .    .    .   D==============eE--R .    .    .    .    .    .    .   .   adcxq	%rax, %r10
-[0,73]    .    .    .    .    .   D====eeeeeeeeeE----R .    .    .    .    .    .    .   .   mulxq	24(%rsi), %r14, %rax
-[0,74]    .    .    .    .    .    D==============eE-R .    .    .    .    .    .    .   .   adoxq	%r14, %r10
-[0,75]    .    .    .    .    .    D===============eER .    .    .    .    .    .    .   .   adcxq	%rax, %r9
-[0,76]    .    .    .    .    .    D====eeeeeeeeeE---R .    .    .    .    .    .    .   .   mulxq	32(%rsi), %r14, %rax
-[0,77]    .    .    .    .    .    .D===============eER.    .    .    .    .    .    .   .   adoxq	%r14, %r9
-[0,78]    .    .    .    .    .    .D================eER    .    .    .    .    .    .   .   adcxq	%rax, %r8
-[0,79]    .    .    .    .    .    .D====eeeeeeeeeE----R    .    .    .    .    .    .   .   mulxq	40(%rsi), %r14, %rax
-[0,80]    .    .    .    .    .    . D================eER   .    .    .    .    .    .   .   adoxq	%r14, %r8
-[0,81]    .    .    .    .    .    . D=================eER  .    .    .    .    .    .   .   adcxq	%rax, %rdi
-[0,82]    .    .    .    .    .    . D====eeeeeeeeeE-----R  .    .    .    .    .    .   .   mulxq	48(%rsi), %r14, %rax
-[0,83]    .    .    .    .    .    .  D=================eER .    .    .    .    .    .   .   adoxq	%r14, %rdi
-[0,84]    .    .    .    .    .    .  D==================eER.    .    .    .    .    .   .   adcxq	%rax, %rcx
-[0,85]    .    .    .    .    .    .  D====eeeeeeeeeE------R.    .    .    .    .    .   .   mulxq	56(%rsi), %rdx, %rax
-[0,86]    .    .    .    .    .    .   D==================eER    .    .    .    .    .   .   adoxq	%rdx, %rcx
-[0,87]    .    .    .    .    .    .   D===================eER   .    .    .    .    .   .   adcxq	%r13, %rax
-[0,88]    .    .    .    .    .    .   D====================eER  .    .    .    .    .   .   adoxq	%r13, %rax
-[0,89]    .    .    .    .    .    .   D----------------------R  .    .    .    .    .   .   xorl	%r13d, %r13d
-[0,90]    .    .    .    .    .    .   DeeeeeE----------------R  .    .    .    .    .   .   movq	24(%r12), %rdx
-[0,91]    .    .    .    .    .    .    D====eeeeeeeeeE-------R  .    .    .    .    .   .   mulxq	(%rsi), %r15, %r14
-[0,92]    .    .    .    .    .    .    D===========eE--------R  .    .    .    .    .   .   adoxq	%r15, %rbx
-[0,93]    .    .    .    .    .    .    D=============eE------R  .    .    .    .    .   .   adcxq	%r14, %r11
-[0,94]    .    .    .    .    .    .    .D===========eE-------R  .    .    .    .    .   .   movq	%rbx, -80(%rbp)
-[0,95]    .    .    .    .    .    .    .D=============eE-----R  .    .    .    .    .   .   movq	%r11, %r15
-[0,96]    .    .    .    .    .    .    .D====eeeeeeeeeE------R  .    .    .    .    .   .   mulxq	8(%rsi), %r14, %rbx
-[0,97]    .    .    .    .    .    .    . D=============eE----R  .    .    .    .    .   .   adoxq	%r14, %r15
-[0,98]    .    .    .    .    .    .    . D==============eE---R  .    .    .    .    .   .   adcxq	%rbx, %r10
-[0,99]    .    .    .    .    .    .    . D====eeeeeeeeeE-----R  .    .    .    .    .   .   mulxq	16(%rsi), %rbx, %r11
-[0,100]   .    .    .    .    .    .    .  D==============eE--R  .    .    .    .    .   .   adoxq	%rbx, %r10
-[0,101]   .    .    .    .    .    .    .  D===============eE-R  .    .    .    .    .   .   adcxq	%r11, %r9
-[0,102]   .    .    .    .    .    .    .  D====eeeeeeeeeE----R  .    .    .    .    .   .   mulxq	24(%rsi), %rbx, %r11
-[0,103]   .    .    .    .    .    .    .   D===============eER  .    .    .    .    .   .   adoxq	%rbx, %r9
-[0,104]   .    .    .    .    .    .    .   D================eER .    .    .    .    .   .   adcxq	%r11, %r8
-[0,105]   .    .    .    .    .    .    .   D====eeeeeeeeeE----R .    .    .    .    .   .   mulxq	32(%rsi), %rbx, %r11
-[0,106]   .    .    .    .    .    .    .    D================eER.    .    .    .    .   .   adoxq	%rbx, %r8
-[0,107]   .    .    .    .    .    .    .    D=================eER    .    .    .    .   .   adcxq	%r11, %rdi
-[0,108]   .    .    .    .    .    .    .    D====eeeeeeeeeE-----R    .    .    .    .   .   mulxq	40(%rsi), %rbx, %r11
-[0,109]   .    .    .    .    .    .    .    .D=================eER   .    .    .    .   .   adoxq	%rbx, %rdi
-[0,110]   .    .    .    .    .    .    .    .D==================eER  .    .    .    .   .   adcxq	%r11, %rcx
-[0,111]   .    .    .    .    .    .    .    .D====eeeeeeeeeE------R  .    .    .    .   .   mulxq	48(%rsi), %rbx, %r11
-[0,112]   .    .    .    .    .    .    .    . D==================eER .    .    .    .   .   adoxq	%rbx, %rcx
-[0,113]   .    .    .    .    .    .    .    . D===================eER.    .    .    .   .   adcxq	%r11, %rax
-[0,114]   .    .    .    .    .    .    .    . D====eeeeeeeeeE-------R.    .    .    .   .   mulxq	56(%rsi), %rdx, %r11
-[0,115]   .    .    .    .    .    .    .    .  D===================eER    .    .    .   .   adoxq	%rdx, %rax
-[0,116]   .    .    .    .    .    .    .    .  D====================eER   .    .    .   .   adcxq	%r13, %r11
-[0,117]   .    .    .    .    .    .    .    .  DeeeeeE----------------R   .    .    .   .   movq	32(%r12), %rdx
-[0,118]   .    .    .    .    .    .    .    .  D=====================eER  .    .    .   .   adoxq	%r13, %r11
-[0,119]   .    .    .    .    .    .    .    .  D=====E-----------------R  .    .    .   .   xorl	%ebx, %ebx
-[0,120]   .    .    .    .    .    .    .    .   D====eeeeeeeeeE--------R  .    .    .   .   mulxq	(%rsi), %r14, %r13
-[0,121]   .    .    .    .    .    .    .    .   D===========eE---------R  .    .    .   .   adoxq	%r14, %r15
-[0,122]   .    .    .    .    .    .    .    .   D=============eE-------R  .    .    .   .   adcxq	%r13, %r10
-[0,123]   .    .    .    .    .    .    .    .    D===========eE--------R  .    .    .   .   movq	%r15, -88(%rbp)
-[0,124]   .    .    .    .    .    .    .    .    D====eeeeeeeeeE-------R  .    .    .   .   mulxq	8(%rsi), %r14, %r13
-[0,125]   .    .    .    .    .    .    .    .    D=============eE------R  .    .    .   .   movq	%r10, %r15
-[0,126]   .    .    .    .    .    .    .    .    .D============eE------R  .    .    .   .   adcxq	%r13, %r9
-[0,127]   .    .    .    .    .    .    .    .    .D=============eE-----R  .    .    .   .   adoxq	%r14, %r15
-[0,128]   .    .    .    .    .    .    .    .    .D====eeeeeeeeeE------R  .    .    .   .   mulxq	16(%rsi), %r13, %r10
-[0,129]   .    .    .    .    .    .    .    .    . D=============eE----R  .    .    .   .   adoxq	%r13, %r9
-[0,130]   .    .    .    .    .    .    .    .    . D==============eE---R  .    .    .   .   adcxq	%r10, %r8
-[0,131]   .    .    .    .    .    .    .    .    . D====eeeeeeeeeE-----R  .    .    .   .   mulxq	24(%rsi), %r13, %r10
-[0,132]   .    .    .    .    .    .    .    .    .  D==============eE--R  .    .    .   .   adcxq	%r10, %rdi
-[0,133]   .    .    .    .    .    .    .    .    .  D===============eE-R  .    .    .   .   adoxq	%r13, %r8
-[0,134]   .    .    .    .    .    .    .    .    .  D====eeeeeeeeeE----R  .    .    .   .   mulxq	32(%rsi), %r13, %r10
-[0,135]   .    .    .    .    .    .    .    .    .   D===============eER  .    .    .   .   adoxq	%r13, %rdi
-[0,136]   .    .    .    .    .    .    .    .    .   D================eER .    .    .   .   adcxq	%r10, %rcx
-[0,137]   .    .    .    .    .    .    .    .    .   D====eeeeeeeeeE----R .    .    .   .   mulxq	40(%rsi), %r13, %r10
-[0,138]   .    .    .    .    .    .    .    .    .    D================eER.    .    .   .   adoxq	%r13, %rcx
-[0,139]   .    .    .    .    .    .    .    .    .    D=================eER    .    .   .   adcxq	%r10, %rax
-[0,140]   .    .    .    .    .    .    .    .    .    D====eeeeeeeeeE-----R    .    .   .   mulxq	48(%rsi), %r13, %r10
-[0,141]   .    .    .    .    .    .    .    .    .    .D=================eER   .    .   .   adoxq	%r13, %rax
-[0,142]   .    .    .    .    .    .    .    .    .    .D==================eER  .    .   .   adcxq	%r10, %r11
-[0,143]   .    .    .    .    .    .    .    .    .    .D====eeeeeeeeeE------R  .    .   .   mulxq	56(%rsi), %rdx, %r10
-[0,144]   .    .    .    .    .    .    .    .    .    . D==================eER .    .   .   adoxq	%rdx, %r11
-[0,145]   .    .    .    .    .    .    .    .    .    . D===================eER.    .   .   adcxq	%rbx, %r10
-[0,146]   .    .    .    .    .    .    .    .    .    . DeeeeeE---------------R.    .   .   movq	40(%r12), %rdx
-[0,147]   .    .    .    .    .    .    .    .    .    . D====================eER    .   .   adoxq	%rbx, %r10
-[0,148]   .    .    .    .    .    .    .    .    .    .  D====eeeeeeeeeE-------R    .   .   mulxq	(%rsi), %r14, %r13
-[0,149]   .    .    .    .    .    .    .    .    .    .  D---------------------R    .   .   xorl	%ebx, %ebx
-[0,150]   .    .    .    .    .    .    .    .    .    .  D============eE-------R    .   .   adoxq	%r14, %r15
-[0,151]   .    .    .    .    .    .    .    .    .    .   D============eE------R    .   .   movq	%r15, -96(%rbp)
-[0,152]   .    .    .    .    .    .    .    .    .    .   D============eE------R    .   .   adcxq	%r13, %r9
-[0,153]   .    .    .    .    .    .    .    .    .    .   D=====eeeeeeeeeE-----R    .   .   mulxq	8(%rsi), %r14, %r13
-[0,154]   .    .    .    .    .    .    .    .    .    .    D============eE-----R    .   .   movq	%r9, %r15
-[0,155]   .    .    .    .    .    .    .    .    .    .    D=============eE----R    .   .   adoxq	%r14, %r15
-[0,156]   .    .    .    .    .    .    .    .    .    .    D==============eE---R    .   .   adcxq	%r13, %r8
-[0,157]   .    .    .    .    .    .    .    .    .    .    .D====eeeeeeeeeE----R    .   .   mulxq	16(%rsi), %r13, %r9
-[0,158]   .    .    .    .    .    .    .    .    .    .    .D==============eE--R    .   .   adoxq	%r13, %r8
-[0,159]   .    .    .    .    .    .    .    .    .    .    .D===============eE-R    .   .   adcxq	%r9, %rdi
-[0,160]   .    .    .    .    .    .    .    .    .    .    . D====eeeeeeeeeE---R    .   .   mulxq	24(%rsi), %r13, %r9
-[0,161]   .    .    .    .    .    .    .    .    .    .    . D===============eER    .   .   adoxq	%r13, %rdi
-[0,162]   .    .    .    .    .    .    .    .    .    .    . D================eER   .   .   adcxq	%r9, %rcx
-[0,163]   .    .    .    .    .    .    .    .    .    .    .  D====eeeeeeeeeE---R   .   .   mulxq	32(%rsi), %r13, %r9
-[0,164]   .    .    .    .    .    .    .    .    .    .    .  D================eER  .   .   adoxq	%r13, %rcx
-[0,165]   .    .    .    .    .    .    .    .    .    .    .  D=================eER .   .   adcxq	%r9, %rax
-[0,166]   .    .    .    .    .    .    .    .    .    .    .   D====eeeeeeeeeE----R .   .   mulxq	40(%rsi), %r13, %r9
-[0,167]   .    .    .    .    .    .    .    .    .    .    .   D=================eER.   .   adoxq	%r13, %rax
-[0,168]   .    .    .    .    .    .    .    .    .    .    .   D==================eER   .   adcxq	%r9, %r11
-[0,169]   .    .    .    .    .    .    .    .    .    .    .    D====eeeeeeeeeE-----R   .   mulxq	48(%rsi), %r13, %r9
-[0,170]   .    .    .    .    .    .    .    .    .    .    .    D==================eER  .   adoxq	%r13, %r11
-[0,171]   .    .    .    .    .    .    .    .    .    .    .    D===================eER .   adcxq	%r9, %r10
-[0,172]   .    .    .    .    .    .    .    .    .    .    .    .D====eeeeeeeeeE------R .   mulxq	56(%rsi), %rdx, %r9
-[0,173]   .    .    .    .    .    .    .    .    .    .    .    .D===================eER.   adoxq	%rdx, %r10
-[0,174]   .    .    .    .    .    .    .    .    .    .    .    .D====================eER   adcxq	%rbx, %r9
diff --git a/libc/nexgen32e/mul8x8adx.S b/libc/nexgen32e/mul8x8adx.S
new file mode 100644
index 000000000..12d9f98df
--- /dev/null
+++ b/libc/nexgen32e/mul8x8adx.S
@@ -0,0 +1,495 @@
+/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
+│vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/macros.internal.h"
+
+//	Computes 1024-bit product of 512-bit and 512-bit numbers.
+//
+//		Instructions:       260
+//		Total Cycles:        98
+//		Total uOps:         452
+//		uOps Per Cycle:    4.61
+//		IPC:               2.65
+//		Block RThroughput: 75.3
+//
+//	@param	rdi receives 16 quadword result
+//	@param	rsi is left hand side which must have 8 quadwords
+//	@param	rdx is right hand side which must have 8 quadwords
+//	@note	words are host endian while array is little endian
+//	@mayalias
+Mul8x8Adx:
+	push	%rbp
+	mov	%rsp,%rbp
+	.profilable
+	sub	$104,%rsp
+	mov	%r15,-8(%rbp)
+	mov	%r14,-16(%rbp)
+	mov	%r13,-24(%rbp)
+	mov	%r12,-32(%rbp)
+	mov	%rbx,-40(%rbp)
+	mov	%rdx,%r12
+	mov	(%rdx),%rdx
+	mulx	(%rsi),%rax,%rcx
+	mov	%rdi,-48(%rbp)
+	mov	%rax,-56(%rbp)
+	mulx	8(%rsi),%rdx,%rax
+	add	%rdx,%rcx
+	mov	(%r12),%rdx
+	mulx	16(%rsi),%rdx,%rbx
+	adc	%rdx,%rax
+	mov	(%r12),%rdx
+	mulx	24(%rsi),%rdx,%r11
+	adc	%rdx,%rbx
+	mov	(%r12),%rdx
+	mulx	32(%rsi),%rdx,%r10
+	adc	%rdx,%r11
+	mov	(%r12),%rdx
+	mulx	40(%rsi),%rdx,%r9
+	adc	%rdx,%r10
+	mov	(%r12),%rdx
+	mulx	48(%rsi),%rdx,%r8
+	adc	%rdx,%r9
+	mov	(%r12),%rdx
+	mulx	56(%rsi),%rdx,%rdi
+	adc	%rdx,%r8
+	adc	$0,%rdi
+	xor	%r13d,%r13d
+	mov	8(%r12),%rdx
+	mulx	(%rsi),%r15,%r14
+	adox	%r15,%rcx
+	adcx	%r14,%rax
+	mov	%rcx,-64(%rbp)
+	mulx	8(%rsi),%r14,%rcx
+	adox	%r14,%rax
+	adcx	%rcx,%rbx
+	mulx	16(%rsi),%r14,%rcx
+	adox	%r14,%rbx
+	adcx	%rcx,%r11
+	mulx	24(%rsi),%r14,%rcx
+	adox	%r14,%r11
+	adcx	%rcx,%r10
+	mulx	32(%rsi),%r14,%rcx
+	adox	%r14,%r10
+	adcx	%rcx,%r9
+	mulx	40(%rsi),%r14,%rcx
+	adox	%r14,%r9
+	adcx	%rcx,%r8
+	mulx	48(%rsi),%r14,%rcx
+	adox	%r14,%r8
+	adcx	%rcx,%rdi
+	mulx	56(%rsi),%rdx,%rcx
+	adox	%rdx,%rdi
+	adcx	%r13,%rcx
+	mov	16(%r12),%rdx
+	adox	%r13,%rcx
+	mulx	(%rsi),%r15,%r14
+	xor	%r13d,%r13d
+	adox	%r15,%rax
+	adcx	%r14,%rbx
+	mov	%rax,-72(%rbp)
+	mulx	8(%rsi),%r14,%rax
+	adox	%r14,%rbx
+	adcx	%rax,%r11
+	mulx	16(%rsi),%r14,%rax
+	adox	%r14,%r11
+	adcx	%rax,%r10
+	mulx	24(%rsi),%r14,%rax
+	adox	%r14,%r10
+	adcx	%rax,%r9
+	mulx	32(%rsi),%r14,%rax
+	adox	%r14,%r9
+	adcx	%rax,%r8
+	mulx	40(%rsi),%r14,%rax
+	adox	%r14,%r8
+	adcx	%rax,%rdi
+	mulx	48(%rsi),%r14,%rax
+	adox	%r14,%rdi
+	adcx	%rax,%rcx
+	mulx	56(%rsi),%rdx,%rax
+	adox	%rdx,%rcx
+	adcx	%r13,%rax
+	adox	%r13,%rax
+	xor	%r13d,%r13d
+	mov	24(%r12),%rdx
+	mulx	(%rsi),%r15,%r14
+	adox	%r15,%rbx
+	adcx	%r14,%r11
+	mov	%rbx,-80(%rbp)
+	mov	%r11,%r15
+	mulx	8(%rsi),%r14,%rbx
+	adox	%r14,%r15
+	adcx	%rbx,%r10
+	mulx	16(%rsi),%rbx,%r11
+	adox	%rbx,%r10
+	adcx	%r11,%r9
+	mulx	24(%rsi),%rbx,%r11
+	adox	%rbx,%r9
+	adcx	%r11,%r8
+	mulx	32(%rsi),%rbx,%r11
+	adox	%rbx,%r8
+	adcx	%r11,%rdi
+	mulx	40(%rsi),%rbx,%r11
+	adox	%rbx,%rdi
+	adcx	%r11,%rcx
+	mulx	48(%rsi),%rbx,%r11
+	adox	%rbx,%rcx
+	adcx	%r11,%rax
+	mulx	56(%rsi),%rdx,%r11
+	adox	%rdx,%rax
+	adcx	%r13,%r11
+	mov	32(%r12),%rdx
+	adox	%r13,%r11
+	xor	%ebx,%ebx
+	mulx	(%rsi),%r14,%r13
+	adox	%r14,%r15
+	adcx	%r13,%r10
+	mov	%r15,-88(%rbp)
+	mulx	8(%rsi),%r14,%r13
+	mov	%r10,%r15
+	adcx	%r13,%r9
+	adox	%r14,%r15
+	mulx	16(%rsi),%r13,%r10
+	adox	%r13,%r9
+	adcx	%r10,%r8
+	mulx	24(%rsi),%r13,%r10
+	adcx	%r10,%rdi
+	adox	%r13,%r8
+	mulx	32(%rsi),%r13,%r10
+	adox	%r13,%rdi
+	adcx	%r10,%rcx
+	mulx	40(%rsi),%r13,%r10
+	adox	%r13,%rcx
+	adcx	%r10,%rax
+	mulx	48(%rsi),%r13,%r10
+	adox	%r13,%rax
+	adcx	%r10,%r11
+	mulx	56(%rsi),%rdx,%r10
+	adox	%rdx,%r11
+	adcx	%rbx,%r10
+	mov	40(%r12),%rdx
+	adox	%rbx,%r10
+	mulx	(%rsi),%r14,%r13
+	xor	%ebx,%ebx
+	adox	%r14,%r15
+	mov	%r15,-96(%rbp)
+	adcx	%r13,%r9
+	mulx	8(%rsi),%r14,%r13
+	mov	%r9,%r15
+	adox	%r14,%r15
+	adcx	%r13,%r8
+	mulx	16(%rsi),%r13,%r9
+	adox	%r13,%r8
+	adcx	%r9,%rdi
+	mulx	24(%rsi),%r13,%r9
+	adox	%r13,%rdi
+	adcx	%r9,%rcx
+	mulx	32(%rsi),%r13,%r9
+	adox	%r13,%rcx
+	adcx	%r9,%rax
+	mulx	40(%rsi),%r13,%r9
+	adox	%r13,%rax
+	adcx	%r9,%r11
+	mulx	48(%rsi),%r13,%r9
+	adox	%r13,%r11
+	adcx	%r9,%r10
+	mulx	56(%rsi),%rdx,%r9
+	adox	%rdx,%r10
+	adcx	%rbx,%r9
+	adox	%rbx,%r9
+	xor	%ebx,%ebx
+	mov	48(%r12),%rdx
+	mulx	(%rsi),%r14,%r13
+	adox	%r14,%r15
+	adcx	%r13,%r8
+	mov	%r15,-104(%rbp)
+	mulx	8(%rsi),%r14,%r13
+	mov	%r8,%r15
+	adcx	%r13,%rdi
+	adox	%r14,%r15
+	mulx	16(%rsi),%r13,%r8
+	adox	%r13,%rdi
+	adcx	%r8,%rcx
+	mulx	24(%rsi),%r13,%r8
+	adox	%r13,%rcx
+	adcx	%r8,%rax
+	mulx	32(%rsi),%r13,%r8
+	adox	%r13,%rax
+	adcx	%r8,%r11
+	mulx	40(%rsi),%r13,%r8
+	adox	%r13,%r11
+	adcx	%r8,%r10
+	mulx	48(%rsi),%r13,%r8
+	adox	%r13,%r10
+	adcx	%r8,%r9
+	mulx	56(%rsi),%rdx,%r8
+	adox	%rdx,%r9
+	mov	56(%r12),%rdx
+	adcx	%rbx,%r8
+	mulx	(%rsi),%r13,%r12
+	adox	%rbx,%r8
+	xor	%ebx,%ebx
+	adox	%r13,%r15
+	adcx	%r12,%rdi
+	mulx	8(%rsi),%r13,%r12
+	adox	%r13,%rdi
+	adcx	%r12,%rcx
+	mulx	16(%rsi),%r13,%r12
+	adox	%r13,%rcx
+	adcx	%r12,%rax
+	mulx	24(%rsi),%r13,%r12
+	adox	%r13,%rax
+	adcx	%r12,%r11
+	mulx	32(%rsi),%r13,%r12
+	adox	%r13,%r11
+	adcx	%r12,%r10
+	mulx	40(%rsi),%r13,%r12
+	adox	%r13,%r10
+	adcx	%r12,%r9
+	mulx	48(%rsi),%r13,%r12
+	mulx	56(%rsi),%rsi,%rdx
+	adox	%r13,%r9
+	adcx	%r12,%r8
+	adox	%rsi,%r8
+	adcx	%rbx,%rdx
+	mov	-64(%rbp),%rsi
+	adox	%rbx,%rdx
+	mov	-48(%rbp),%rbx
+	mov	-56(%rbp),%r14
+	mov	%rsi,8(%rbx)
+	mov	-72(%rbp),%rsi
+	mov	%r14,(%rbx)
+	mov	%rsi,16(%rbx)
+	mov	-80(%rbp),%rsi
+	mov	%rsi,24(%rbx)
+	mov	-88(%rbp),%rsi
+	mov	%rsi,32(%rbx)
+	mov	-96(%rbp),%rsi
+	mov	%rsi,40(%rbx)
+	mov	-104(%rbp),%rsi
+	mov	%r15,56(%rbx)
+	mov	%rsi,48(%rbx)
+	mov	%rdi,64(%rbx)
+	mov	%rcx,72(%rbx)
+	mov	%rax,80(%rbx)
+	mov	%r11,88(%rbx)
+	mov	%r10,96(%rbx)
+	mov	%r9,104(%rbx)
+	mov	%r8,112(%rbx)
+	mov	%rdx,120(%rbx)
+	mov	-8(%rbp),%r15
+	mov	-16(%rbp),%r14
+	mov	-24(%rbp),%r13
+	mov	-32(%rbp),%r12
+	mov	-40(%rbp),%rbx
+	leave
+	ret
+	.endfn	Mul8x8Adx,globl
+
+	.end
+TIMELINE VIEW       0123456789          0123456789          0123456789          0123456789
+Index     0123456789          0123456789          0123456789          0123456789
+[0,0]     DeER .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   .   subq	$104, %rsp
+[0,1]     DeER .    .    .    .    .    .    .    .    .    .    .    .    .    .    .   .   movq	%r15, -8(%rbp)
+[0,2]     D=eER.    .    .    .    .    .    .    .    .    .    .    .    .    .    .   .   movq	%r14, -16(%rbp)
+[0,3]     D==eER    .    .    .    .    .    .    .    .    .    .    .    .    .    .   .   movq	%r13, -24(%rbp)
+[0,4]     D===eER   .    .    .    .    .    .    .    .    .    .    .    .    .    .   .   movq	%r12, -32(%rbp)
+[0,5]     D====eER  .    .    .    .    .    .    .    .    .    .    .    .    .    .   .   movq	%rbx, -40(%rbp)
+[0,6]     .DeE---R  .    .    .    .    .    .    .    .    .    .    .    .    .    .   .   movq	%rdx, %r12
+[0,7]     .DeeeeeER .    .    .    .    .    .    .    .    .    .    .    .    .    .   .   movq	(%rdx), %rdx
+[0,8]     .D=====eeeeeeeeeER  .    .    .    .    .    .    .    .    .    .    .    .   .   mulxq	(%rsi), %rax, %rcx
+[0,9]     . D====eE--------R  .    .    .    .    .    .    .    .    .    .    .    .   .   movq	%rdi, -48(%rbp)
+[0,10]    . D=======eE-----R  .    .    .    .    .    .    .    .    .    .    .    .   .   movq	%rax, -56(%rbp)
+[0,11]    . D=====eeeeeeeeeER .    .    .    .    .    .    .    .    .    .    .    .   .   mulxq	8(%rsi), %rdx, %rax
+[0,12]    .  D============eER .    .    .    .    .    .    .    .    .    .    .    .   .   addq	%rdx, %rcx
+[0,13]    .  DeeeeeE--------R .    .    .    .    .    .    .    .    .    .    .    .   .   movq	(%r12), %rdx
+[0,14]    .  D=====eeeeeeeeeER.    .    .    .    .    .    .    .    .    .    .    .   .   mulxq	16(%rsi), %rdx, %rbx
+[0,15]    .   D============eER.    .    .    .    .    .    .    .    .    .    .    .   .   adcq	%rdx, %rax
+[0,16]    .   DeeeeeE--------R.    .    .    .    .    .    .    .    .    .    .    .   .   movq	(%r12), %rdx
+[0,17]    .   D=====eeeeeeeeeER    .    .    .    .    .    .    .    .    .    .    .   .   mulxq	24(%rsi), %rdx, %r11
+[0,18]    .    D============eER    .    .    .    .    .    .    .    .    .    .    .   .   adcq	%rdx, %rbx
+[0,19]    .    DeeeeeE--------R    .    .    .    .    .    .    .    .    .    .    .   .   movq	(%r12), %rdx
+[0,20]    .    D=====eeeeeeeeeER   .    .    .    .    .    .    .    .    .    .    .   .   mulxq	32(%rsi), %rdx, %r10
+[0,21]    .    .D============eER   .    .    .    .    .    .    .    .    .    .    .   .   adcq	%rdx, %r11
+[0,22]    .    .DeeeeeE--------R   .    .    .    .    .    .    .    .    .    .    .   .   movq	(%r12), %rdx
+[0,23]    .    .D=====eeeeeeeeeER  .    .    .    .    .    .    .    .    .    .    .   .   mulxq	40(%rsi), %rdx, %r9
+[0,24]    .    . D============eER  .    .    .    .    .    .    .    .    .    .    .   .   adcq	%rdx, %r10
+[0,25]    .    . DeeeeeE--------R  .    .    .    .    .    .    .    .    .    .    .   .   movq	(%r12), %rdx
+[0,26]    .    . D=====eeeeeeeeeER .    .    .    .    .    .    .    .    .    .    .   .   mulxq	48(%rsi), %rdx, %r8
+[0,27]    .    .  D============eER .    .    .    .    .    .    .    .    .    .    .   .   adcq	%rdx, %r9
+[0,28]    .    .  DeeeeeE--------R .    .    .    .    .    .    .    .    .    .    .   .   movq	(%r12), %rdx
+[0,29]    .    .  D=====eeeeeeeeeER.    .    .    .    .    .    .    .    .    .    .   .   mulxq	56(%rsi), %rdx, %rdi
+[0,30]    .    .   D============eER.    .    .    .    .    .    .    .    .    .    .   .   adcq	%rdx, %r8
+[0,31]    .    .   D=============eER    .    .    .    .    .    .    .    .    .    .   .   adcq	$0, %rdi
+[0,32]    .    .   D---------------R    .    .    .    .    .    .    .    .    .    .   .   xorl	%r13d, %r13d
+[0,33]    .    .   DeeeeeE---------R    .    .    .    .    .    .    .    .    .    .   .   movq	8(%r12), %rdx
+[0,34]    .    .    D====eeeeeeeeeER    .    .    .    .    .    .    .    .    .    .   .   mulxq	(%rsi), %r15, %r14
+[0,35]    .    .    D=======eE-----R    .    .    .    .    .    .    .    .    .    .   .   adoxq	%r15, %rcx
+[0,36]    .    .    D=============eER   .    .    .    .    .    .    .    .    .    .   .   adcxq	%r14, %rax
+[0,37]    .    .    .D=======eE-----R   .    .    .    .    .    .    .    .    .    .   .   movq	%rcx, -64(%rbp)
+[0,38]    .    .    .D====eeeeeeeeeER   .    .    .    .    .    .    .    .    .    .   .   mulxq	8(%rsi), %r14, %rcx
+[0,39]    .    .    .D=============eER  .    .    .    .    .    .    .    .    .    .   .   adoxq	%r14, %rax
+[0,40]    .    .    . D=============eER .    .    .    .    .    .    .    .    .    .   .   adcxq	%rcx, %rbx
+[0,41]    .    .    . D====eeeeeeeeeE-R .    .    .    .    .    .    .    .    .    .   .   mulxq	16(%rsi), %r14, %rcx
+[0,42]    .    .    . D==============eER.    .    .    .    .    .    .    .    .    .   .   adoxq	%r14, %rbx
+[0,43]    .    .    .  D==============eER    .    .    .    .    .    .    .    .    .   .   adcxq	%rcx, %r11
+[0,44]    .    .    .  D====eeeeeeeeeE--R    .    .    .    .    .    .    .    .    .   .   mulxq	24(%rsi), %r14, %rcx
+[0,45]    .    .    .  D===============eER   .    .    .    .    .    .    .    .    .   .   adoxq	%r14, %r11
+[0,46]    .    .    .   D===============eER  .    .    .    .    .    .    .    .    .   .   adcxq	%rcx, %r10
+[0,47]    .    .    .   D====eeeeeeeeeE---R  .    .    .    .    .    .    .    .    .   .   mulxq	32(%rsi), %r14, %rcx
+[0,48]    .    .    .   D================eER .    .    .    .    .    .    .    .    .   .   adoxq	%r14, %r10
+[0,49]    .    .    .    D================eER.    .    .    .    .    .    .    .    .   .   adcxq	%rcx, %r9
+[0,50]    .    .    .    D====eeeeeeeeeE----R.    .    .    .    .    .    .    .    .   .   mulxq	40(%rsi), %r14, %rcx
+[0,51]    .    .    .    D=================eER    .    .    .    .    .    .    .    .   .   adoxq	%r14, %r9
+[0,52]    .    .    .    .D=================eER   .    .    .    .    .    .    .    .   .   adcxq	%rcx, %r8
+[0,53]    .    .    .    .D====eeeeeeeeeE-----R   .    .    .    .    .    .    .    .   .   mulxq	48(%rsi), %r14, %rcx
+[0,54]    .    .    .    .D==================eER  .    .    .    .    .    .    .    .   .   adoxq	%r14, %r8
+[0,55]    .    .    .    . D==================eER .    .    .    .    .    .    .    .   .   adcxq	%rcx, %rdi
+[0,56]    .    .    .    . D====eeeeeeeeeE------R .    .    .    .    .    .    .    .   .   mulxq	56(%rsi), %rdx, %rcx
+[0,57]    .    .    .    . D===================eER.    .    .    .    .    .    .    .   .   adoxq	%rdx, %rdi
+[0,58]    .    .    .    .  D===================eER    .    .    .    .    .    .    .   .   adcxq	%r13, %rcx
+[0,59]    .    .    .    .  DeeeeeE---------------R    .    .    .    .    .    .    .   .   movq	16(%r12), %rdx
+[0,60]    .    .    .    .  D====================eER   .    .    .    .    .    .    .   .   adoxq	%r13, %rcx
+[0,61]    .    .    .    .   D====eeeeeeeeeE-------R   .    .    .    .    .    .    .   .   mulxq	(%rsi), %r15, %r14
+[0,62]    .    .    .    .   D---------------------R   .    .    .    .    .    .    .   .   xorl	%r13d, %r13d
+[0,63]    .    .    .    .   D=======eE------------R   .    .    .    .    .    .    .   .   adoxq	%r15, %rax
+[0,64]    .    .    .    .    D============eE------R   .    .    .    .    .    .    .   .   adcxq	%r14, %rbx
+[0,65]    .    .    .    .    D=======eE-----------R   .    .    .    .    .    .    .   .   movq	%rax, -72(%rbp)
+[0,66]    .    .    .    .    D====eeeeeeeeeE------R   .    .    .    .    .    .    .   .   mulxq	8(%rsi), %r14, %rax
+[0,67]    .    .    .    .    .D============eE-----R   .    .    .    .    .    .    .   .   adoxq	%r14, %rbx
+[0,68]    .    .    .    .    .D=============eE----R   .    .    .    .    .    .    .   .   adcxq	%rax, %r11
+[0,69]    .    .    .    .    .D====eeeeeeeeeE-----R   .    .    .    .    .    .    .   .   mulxq	16(%rsi), %r14, %rax
+[0,70]    .    .    .    .    . D=============eE---R   .    .    .    .    .    .    .   .   adoxq	%r14, %r11
+[0,71]    .    .    .    .    . D==============eE--R   .    .    .    .    .    .    .   .   adcxq	%rax, %r10
+[0,72]    .    .    .    .    . D====eeeeeeeeeE----R   .    .    .    .    .    .    .   .   mulxq	24(%rsi), %r14, %rax
+[0,73]    .    .    .    .    .  D==============eE-R   .    .    .    .    .    .    .   .   adoxq	%r14, %r10
+[0,74]    .    .    .    .    .  D===============eER   .    .    .    .    .    .    .   .   adcxq	%rax, %r9
+[0,75]    .    .    .    .    .  D====eeeeeeeeeE---R   .    .    .    .    .    .    .   .   mulxq	32(%rsi), %r14, %rax
+[0,76]    .    .    .    .    .   D===============eER  .    .    .    .    .    .    .   .   adoxq	%r14, %r9
+[0,77]    .    .    .    .    .   D================eER .    .    .    .    .    .    .   .   adcxq	%rax, %r8
+[0,78]    .    .    .    .    .   D====eeeeeeeeeE----R .    .    .    .    .    .    .   .   mulxq	40(%rsi), %r14, %rax
+[0,79]    .    .    .    .    .    D================eER.    .    .    .    .    .    .   .   adoxq	%r14, %r8
+[0,80]    .    .    .    .    .    D=================eER    .    .    .    .    .    .   .   adcxq	%rax, %rdi
+[0,81]    .    .    .    .    .    D====eeeeeeeeeE-----R    .    .    .    .    .    .   .   mulxq	48(%rsi), %r14, %rax
+[0,82]    .    .    .    .    .    .D=================eER   .    .    .    .    .    .   .   adoxq	%r14, %rdi
+[0,83]    .    .    .    .    .    .D==================eER  .    .    .    .    .    .   .   adcxq	%rax, %rcx
+[0,84]    .    .    .    .    .    .D====eeeeeeeeeE------R  .    .    .    .    .    .   .   mulxq	56(%rsi), %rdx, %rax
+[0,85]    .    .    .    .    .    . D==================eER .    .    .    .    .    .   .   adoxq	%rdx, %rcx
+[0,86]    .    .    .    .    .    . D===================eER.    .    .    .    .    .   .   adcxq	%r13, %rax
+[0,87]    .    .    .    .    .    . D====================eER    .    .    .    .    .   .   adoxq	%r13, %rax
+[0,88]    .    .    .    .    .    . D----------------------R    .    .    .    .    .   .   xorl	%r13d, %r13d
+[0,89]    .    .    .    .    .    . DeeeeeE----------------R    .    .    .    .    .   .   movq	24(%r12), %rdx
+[0,90]    .    .    .    .    .    .  D====eeeeeeeeeE-------R    .    .    .    .    .   .   mulxq	(%rsi), %r15, %r14
+[0,91]    .    .    .    .    .    .  D===========eE--------R    .    .    .    .    .   .   adoxq	%r15, %rbx
+[0,92]    .    .    .    .    .    .  D=============eE------R    .    .    .    .    .   .   adcxq	%r14, %r11
+[0,93]    .    .    .    .    .    .   D===========eE-------R    .    .    .    .    .   .   movq	%rbx, -80(%rbp)
+[0,94]    .    .    .    .    .    .   D=============eE-----R    .    .    .    .    .   .   movq	%r11, %r15
+[0,95]    .    .    .    .    .    .   D====eeeeeeeeeE------R    .    .    .    .    .   .   mulxq	8(%rsi), %r14, %rbx
+[0,96]    .    .    .    .    .    .    D=============eE----R    .    .    .    .    .   .   adoxq	%r14, %r15
+[0,97]    .    .    .    .    .    .    D==============eE---R    .    .    .    .    .   .   adcxq	%rbx, %r10
+[0,98]    .    .    .    .    .    .    D====eeeeeeeeeE-----R    .    .    .    .    .   .   mulxq	16(%rsi), %rbx, %r11
+[0,99]    .    .    .    .    .    .    .D==============eE--R    .    .    .    .    .   .   adoxq	%rbx, %r10
+[0,100]   .    .    .    .    .    .    .D===============eE-R    .    .    .    .    .   .   adcxq	%r11, %r9
+[0,101]   .    .    .    .    .    .    .D====eeeeeeeeeE----R    .    .    .    .    .   .   mulxq	24(%rsi), %rbx, %r11
+[0,102]   .    .    .    .    .    .    . D===============eER    .    .    .    .    .   .   adoxq	%rbx, %r9
+[0,103]   .    .    .    .    .    .    . D================eER   .    .    .    .    .   .   adcxq	%r11, %r8
+[0,104]   .    .    .    .    .    .    . D====eeeeeeeeeE----R   .    .    .    .    .   .   mulxq	32(%rsi), %rbx, %r11
+[0,105]   .    .    .    .    .    .    .  D================eER  .    .    .    .    .   .   adoxq	%rbx, %r8
+[0,106]   .    .    .    .    .    .    .  D=================eER .    .    .    .    .   .   adcxq	%r11, %rdi
+[0,107]   .    .    .    .    .    .    .  D====eeeeeeeeeE-----R .    .    .    .    .   .   mulxq	40(%rsi), %rbx, %r11
+[0,108]   .    .    .    .    .    .    .   D=================eER.    .    .    .    .   .   adoxq	%rbx, %rdi
+[0,109]   .    .    .    .    .    .    .   D==================eER    .    .    .    .   .   adcxq	%r11, %rcx
+[0,110]   .    .    .    .    .    .    .   D====eeeeeeeeeE------R    .    .    .    .   .   mulxq	48(%rsi), %rbx, %r11
+[0,111]   .    .    .    .    .    .    .    D==================eER   .    .    .    .   .   adoxq	%rbx, %rcx
+[0,112]   .    .    .    .    .    .    .    D===================eER  .    .    .    .   .   adcxq	%r11, %rax
+[0,113]   .    .    .    .    .    .    .    D====eeeeeeeeeE-------R  .    .    .    .   .   mulxq	56(%rsi), %rdx, %r11
+[0,114]   .    .    .    .    .    .    .    .D===================eER .    .    .    .   .   adoxq	%rdx, %rax
+[0,115]   .    .    .    .    .    .    .    .D====================eER.    .    .    .   .   adcxq	%r13, %r11
+[0,116]   .    .    .    .    .    .    .    .DeeeeeE----------------R.    .    .    .   .   movq	32(%r12), %rdx
+[0,117]   .    .    .    .    .    .    .    .D=====================eER    .    .    .   .   adoxq	%r13, %r11
+[0,118]   .    .    .    .    .    .    .    .D=====E-----------------R    .    .    .   .   xorl	%ebx, %ebx
+[0,119]   .    .    .    .    .    .    .    . D====eeeeeeeeeE--------R    .    .    .   .   mulxq	(%rsi), %r14, %r13
+[0,120]   .    .    .    .    .    .    .    . D===========eE---------R    .    .    .   .   adoxq	%r14, %r15
+[0,121]   .    .    .    .    .    .    .    . D=============eE-------R    .    .    .   .   adcxq	%r13, %r10
+[0,122]   .    .    .    .    .    .    .    .  D===========eE--------R    .    .    .   .   movq	%r15, -88(%rbp)
+[0,123]   .    .    .    .    .    .    .    .  D====eeeeeeeeeE-------R    .    .    .   .   mulxq	8(%rsi), %r14, %r13
+[0,124]   .    .    .    .    .    .    .    .  D=============eE------R    .    .    .   .   movq	%r10, %r15
+[0,125]   .    .    .    .    .    .    .    .   D============eE------R    .    .    .   .   adcxq	%r13, %r9
+[0,126]   .    .    .    .    .    .    .    .   D=============eE-----R    .    .    .   .   adoxq	%r14, %r15
+[0,127]   .    .    .    .    .    .    .    .   D====eeeeeeeeeE------R    .    .    .   .   mulxq	16(%rsi), %r13, %r10
+[0,128]   .    .    .    .    .    .    .    .    D=============eE----R    .    .    .   .   adoxq	%r13, %r9
+[0,129]   .    .    .    .    .    .    .    .    D==============eE---R    .    .    .   .   adcxq	%r10, %r8
+[0,130]   .    .    .    .    .    .    .    .    D====eeeeeeeeeE-----R    .    .    .   .   mulxq	24(%rsi), %r13, %r10
+[0,131]   .    .    .    .    .    .    .    .    .D==============eE--R    .    .    .   .   adcxq	%r10, %rdi
+[0,132]   .    .    .    .    .    .    .    .    .D===============eE-R    .    .    .   .   adoxq	%r13, %r8
+[0,133]   .    .    .    .    .    .    .    .    .D====eeeeeeeeeE----R    .    .    .   .   mulxq	32(%rsi), %r13, %r10
+[0,134]   .    .    .    .    .    .    .    .    . D===============eER    .    .    .   .   adoxq	%r13, %rdi
+[0,135]   .    .    .    .    .    .    .    .    . D================eER   .    .    .   .   adcxq	%r10, %rcx
+[0,136]   .    .    .    .    .    .    .    .    . D====eeeeeeeeeE----R   .    .    .   .   mulxq	40(%rsi), %r13, %r10
+[0,137]   .    .    .    .    .    .    .    .    .  D================eER  .    .    .   .   adoxq	%r13, %rcx
+[0,138]   .    .    .    .    .    .    .    .    .  D=================eER .    .    .   .   adcxq	%r10, %rax
+[0,139]   .    .    .    .    .    .    .    .    .  D====eeeeeeeeeE-----R .    .    .   .   mulxq	48(%rsi), %r13, %r10
+[0,140]   .    .    .    .    .    .    .    .    .   D=================eER.    .    .   .   adoxq	%r13, %rax
+[0,141]   .    .    .    .    .    .    .    .    .   D==================eER    .    .   .   adcxq	%r10, %r11
+[0,142]   .    .    .    .    .    .    .    .    .   D====eeeeeeeeeE------R    .    .   .   mulxq	56(%rsi), %rdx, %r10
+[0,143]   .    .    .    .    .    .    .    .    .    D==================eER   .    .   .   adoxq	%rdx, %r11
+[0,144]   .    .    .    .    .    .    .    .    .    D===================eER  .    .   .   adcxq	%rbx, %r10
+[0,145]   .    .    .    .    .    .    .    .    .    DeeeeeE---------------R  .    .   .   movq	40(%r12), %rdx
+[0,146]   .    .    .    .    .    .    .    .    .    D====================eER .    .   .   adoxq	%rbx, %r10
+[0,147]   .    .    .    .    .    .    .    .    .    .D====eeeeeeeeeE-------R .    .   .   mulxq	(%rsi), %r14, %r13
+[0,148]   .    .    .    .    .    .    .    .    .    .D---------------------R .    .   .   xorl	%ebx, %ebx
+[0,149]   .    .    .    .    .    .    .    .    .    .D============eE-------R .    .   .   adoxq	%r14, %r15
+[0,150]   .    .    .    .    .    .    .    .    .    . D============eE------R .    .   .   movq	%r15, -96(%rbp)
+[0,151]   .    .    .    .    .    .    .    .    .    . D============eE------R .    .   .   adcxq	%r13, %r9
+[0,152]   .    .    .    .    .    .    .    .    .    . D=====eeeeeeeeeE-----R .    .   .   mulxq	8(%rsi), %r14, %r13
+[0,153]   .    .    .    .    .    .    .    .    .    .  D============eE-----R .    .   .   movq	%r9, %r15
+[0,154]   .    .    .    .    .    .    .    .    .    .  D=============eE----R .    .   .   adoxq	%r14, %r15
+[0,155]   .    .    .    .    .    .    .    .    .    .  D==============eE---R .    .   .   adcxq	%r13, %r8
+[0,156]   .    .    .    .    .    .    .    .    .    .   D====eeeeeeeeeE----R .    .   .   mulxq	16(%rsi), %r13, %r9
+[0,157]   .    .    .    .    .    .    .    .    .    .   D==============eE--R .    .   .   adoxq	%r13, %r8
+[0,158]   .    .    .    .    .    .    .    .    .    .   D===============eE-R .    .   .   adcxq	%r9, %rdi
+[0,159]   .    .    .    .    .    .    .    .    .    .    D====eeeeeeeeeE---R .    .   .   mulxq	24(%rsi), %r13, %r9
+[0,160]   .    .    .    .    .    .    .    .    .    .    D===============eER .    .   .   adoxq	%r13, %rdi
+[0,161]   .    .    .    .    .    .    .    .    .    .    D================eER.    .   .   adcxq	%r9, %rcx
+[0,162]   .    .    .    .    .    .    .    .    .    .    .D====eeeeeeeeeE---R.    .   .   mulxq	32(%rsi), %r13, %r9
+[0,163]   .    .    .    .    .    .    .    .    .    .    .D================eER    .   .   adoxq	%r13, %rcx
+[0,164]   .    .    .    .    .    .    .    .    .    .    .D=================eER   .   .   adcxq	%r9, %rax
+[0,165]   .    .    .    .    .    .    .    .    .    .    . D====eeeeeeeeeE----R   .   .   mulxq	40(%rsi), %r13, %r9
+[0,166]   .    .    .    .    .    .    .    .    .    .    . D=================eER  .   .   adoxq	%r13, %rax
+[0,167]   .    .    .    .    .    .    .    .    .    .    . D==================eER .   .   adcxq	%r9, %r11
+[0,168]   .    .    .    .    .    .    .    .    .    .    .  D====eeeeeeeeeE-----R .   .   mulxq	48(%rsi), %r13, %r9
+[0,169]   .    .    .    .    .    .    .    .    .    .    .  D==================eER.   .   adoxq	%r13, %r11
+[0,170]   .    .    .    .    .    .    .    .    .    .    .  D===================eER   .   adcxq	%r9, %r10
+[0,171]   .    .    .    .    .    .    .    .    .    .    .   D====eeeeeeeeeE------R   .   mulxq	56(%rsi), %rdx, %r9
+[0,172]   .    .    .    .    .    .    .    .    .    .    .   D===================eER  .   adoxq	%rdx, %r10
+[0,173]   .    .    .    .    .    .    .    .    .    .    .   D====================eER .   adcxq	%rbx, %r9
+[0,174]   .    .    .    .    .    .    .    .    .    .    .    D====================eER.   adoxq	%rbx, %r9
+[0,175]   .    .    .    .    .    .    .    .    .    .    .    D----------------------R.   xorl	%ebx, %ebx
+[0,176]   .    .    .    .    .    .    .    .    .    .    .    DeeeeeE----------------R.   movq	48(%r12), %rdx
+[0,177]   .    .    .    .    .    .    .    .    .    .    .    .D=====eeeeeeeeeE------R.   mulxq	(%rsi), %r14, %r13
+[0,178]   .    .    .    .    .    .    .    .    .    .    .    .D==========eE---------R.   adoxq	%r14, %r15
+[0,179]   .    .    .    .    .    .    .    .    .    .    .    .D==============eE-----R.   adcxq	%r13, %r8
+[0,180]   .    .    .    .    .    .    .    .    .    .    .    . D==========eE--------R.   movq	%r15, -104(%rbp)
+[0,181]   .    .    .    .    .    .    .    .    .    .    .    . D=====eeeeeeeeeE-----R.   mulxq	8(%rsi), %r14, %r13
+[0,182]   .    .    .    .    .    .    .    .    .    .    .    . D==============eE----R.   movq	%r8, %r15
+[0,183]   .    .    .    .    .    .    .    .    .    .    .    .  D==============eE---R.   adcxq	%r13, %rdi
+[0,184]   .    .    .    .    .    .    .    .    .    .    .    .  D===============eE--R.   adoxq	%r14, %r15
+[0,185]   .    .    .    .    .    .    .    .    .    .    .    .  D=====eeeeeeeeeE----R.   mulxq	16(%rsi), %r13, %r8
+[0,186]   .    .    .    .    .    .    .    .    .    .    .    .   D===============eE-R.   adoxq	%r13, %rdi
+[0,187]   .    .    .    .    .    .    .    .    .    .    .    .   D================eER.   adcxq	%r8, %rcx
+[0,188]   .    .    .    .    .    .    .    .    .    .    .    .   D=====eeeeeeeeeE---R.   mulxq	24(%rsi), %r13, %r8
+[0,189]   .    .    .    .    .    .    .    .    .    .    .    .    D================eER   adoxq	%r13, %rcx
diff --git a/libc/nexgen32e/sub.S b/libc/nexgen32e/sub.S
deleted file mode 100644
index b065b90ff..000000000
--- a/libc/nexgen32e/sub.S
+++ /dev/null
@@ -1,41 +0,0 @@
-/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
-│vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/macros.internal.h"
-
-//	Computes C = A - B
-//
-//	Aliasing such as sbb(A,A,B) or sbb(B,A,B) is OK.
-//
-//	@param	rdi is C
-//	@param	rsi is A
-//	@param	rdx is B
-//	@param	rcx is number of subtracts
-//	@return	al  is carry
-sbb:	.leafprologue
-	test	%ecx,%ecx
-	jz	1f
-	xor	%r9d,%r9d
-0:	mov	(%rsi,%r9,8),%rax
-	sbb	(%rdx,%r9,8),%rax
-	mov	%rax,(%rdi,%r9,8)
-	inc	%r9d
-	loop	0b
-1:	setb	%al
-	.leafepilogue
-	.endfn	sbb,globl
diff --git a/test/net/https/mbedtls_test.c b/test/net/https/mbedtls_test.c
index 3cb664029..b8a52b6af 100644
--- a/test/net/https/mbedtls_test.c
+++ b/test/net/https/mbedtls_test.c
@@ -39,6 +39,7 @@
 #include "third_party/mbedtls/des.h"
 #include "third_party/mbedtls/dhm.h"
 #include "third_party/mbedtls/ecp.h"
+#include "third_party/mbedtls/ecp_internal.h"
 #include "third_party/mbedtls/entropy.h"
 #include "third_party/mbedtls/error.h"
 #include "third_party/mbedtls/gcm.h"
@@ -148,17 +149,17 @@ static void P256_MPI(mbedtls_mpi *N) {
 
 static void P256_JUSTINE(mbedtls_mpi *N) {
   memcpy(N->p, rng, 8 * 8);
-  ecp_mod_p256(N);
+  secp256r1(N->p);
 }
 
 static void P384_MPI(mbedtls_mpi *N) {
-  memcpy(N->p, rng, 8 * 8);
+  memcpy(N->p, rng, 12 * 8);
   ASSERT_EQ(0, mbedtls_mpi_mod_mpi(N, N, &grp.P));
 }
 
 static void P384_JUSTINE(mbedtls_mpi *N) {
-  memcpy(N->p, rng, 8 * 8);
-  ecp_mod_p384(N);
+  memcpy(N->p, rng, 12 * 8);
+  secp384r1(N->p);
 }
 
 BENCH(p256, bench) {
@@ -166,6 +167,7 @@ BENCH(p256, bench) {
   mbedtls_ecp_group_init(&grp);
   mbedtls_ecp_group_load(&grp, MBEDTLS_ECP_DP_SECP256R1);
   mbedtls_mpi x = {1, 8, gc(calloc(8, 8))};
+  rngset(x.p, 8 * 8, rand64, -1);
   EZBENCH2("P-256 modulus MbedTLS MPI lib", donothing, P256_MPI(&x));
   EZBENCH2("P-256 modulus Justine rewrite", donothing, P256_JUSTINE(&x));
   mbedtls_ecp_group_free(&grp);
@@ -176,10 +178,10 @@ BENCH(p384, bench) {
 #ifdef MBEDTLS_ECP_C
   mbedtls_ecp_group_init(&grp);
   mbedtls_ecp_group_load(&grp, MBEDTLS_ECP_DP_SECP384R1);
+  uint64_t y[12];
   mbedtls_mpi x = {1, 12, gc(calloc(12, 8))};
   EZBENCH2("P-384 modulus MbedTLS MPI lib", donothing, P384_MPI(&x));
   EZBENCH2("P-384 modulus Justine rewrite", donothing, P384_JUSTINE(&x));
-  rngset(x.p, 12 * 8, rand64, -1);
   mbedtls_ecp_group_free(&grp);
 #endif
 }
@@ -1112,3 +1114,66 @@ BENCH(cmpint, bench) {
   EZBENCH2("cmpint 3.1", donothing, mbedtls_mpi_cmp_int(&z, 0));
   EZBENCH2("cmpint 3.2", donothing, mbedtls_mpi_cmp_int(&z, 1));
 }
+
+// Reference multi-limb subtraction that F2() is checked against: runs the
+// SBB() macro over each of the n limbs of a and b, storing into d and
+// threading c through as both its carry-in and carry-out; returns final c.
+// NOTE(review): SBB() is defined outside this file; presumably it is one
+// subtract-with-borrow step -- confirm against its definition.
+mbedtls_mpi_uint F1(mbedtls_mpi_uint *d, const mbedtls_mpi_uint *a,
+                    const mbedtls_mpi_uint *b, size_t n) {
+  size_t i;
+  unsigned char cf;
+  mbedtls_mpi_uint c, x;
+  cf = c = i = 0;
+  for (; i < n; ++i) SBB(d[i], a[i], b[i], c, c);
+  return c;
+}
+
+// Computes d = a - b over n 8-byte limbs with an x86-64 sbb loop and
+// returns the borrow out of the last limb (0 or 1).
+//
+// The initial xor zeroes x and clears CF. Each iteration loads a[i],
+// subtracts b[i] with borrow, and stores d[i]. The index is advanced with
+// lea and the count decremented with dec because neither modifies CF, so
+// the borrow from one sbb carries into the next iteration.
+mbedtls_mpi_uint F2(mbedtls_mpi_uint *d, const mbedtls_mpi_uint *a,
+                    const mbedtls_mpi_uint *b, size_t n) {
+  size_t i;
+  unsigned char cf;
+  mbedtls_mpi_uint x;
+  // The loop below tests n only after decrementing it, so n == 0 would wrap
+  // around and run far past the arrays. Return 0 like F1() does instead.
+  if (!n) return 0;
+  cf = i = 0;
+  asm volatile("xor\t%1,%1\n\t"
+               ".align\t16\n1:\t"
+               "mov\t(%5,%3,8),%1\n\t"
+               "sbb\t(%6,%3,8),%1\n\t"
+               "mov\t%1,(%4,%3,8)\n\t"
+               "lea\t1(%3),%3\n\t"
+               "dec\t%2\n\t"
+               "jnz\t1b"
+               : "=@ccb"(cf), "=&r"(x), "+c"(n), "=r"(i)
+               : "r"(d), "r"(a), "r"(b), "3"(0)
+               : "cc", "memory");
+  return cf;
+}
+
+// Checks that F1() and F2() produce the same difference and the same
+// return value for 1000 pairs of 8-limb operands filled via rngset().
+TEST(wut, wut) {
+  uint64_t A[8];
+  uint64_t B[8];
+  uint64_t C[8];
+  uint64_t D[8];
+  int i;
+  for (i = 0; i < 1000; ++i) {
+    rngset(A, sizeof(A), rand64, -1);
+    rngset(B, sizeof(B), rand64, -1);
+    int x = F1(C, A, B, 8);
+    int y = F2(D, A, B, 8);
+    ASSERT_EQ(x, y);
+    ASSERT_EQ(0, memcmp(C, D, sizeof(C)));
+  }
+}
diff --git a/third_party/mbedtls/bignum.c b/third_party/mbedtls/bignum.c
index 6dc7879fa..b8b1df68a 100644
--- a/third_party/mbedtls/bignum.c
+++ b/third_party/mbedtls/bignum.c
@@ -26,7 +26,6 @@
 #include "libc/nexgen32e/nexgen32e.h"
 #include "libc/nexgen32e/x86feature.h"
 #include "libc/runtime/runtime.h"
-#include "libc/stdio/stdio.h"
 #include "third_party/mbedtls/bignum.h"
 #include "third_party/mbedtls/bignum_internal.h"
 #include "third_party/mbedtls/chk.h"
@@ -65,20 +64,10 @@ asm(".include \"libc/disclaimer.inc\"");
 
 #if defined(MBEDTLS_BIGNUM_C)
 
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-#define mpi_uint_bigendian_to_host(x) (x)
-#elif __SIZEOF_LONG__ == 8
-#define mpi_uint_bigendian_to_host(x) __builtin_bswap64(x)
-#elif __SIZEOF_LONG__ == 4
-#define mpi_uint_bigendian_to_host(x) __builtin_bswap32(x)
-#endif
-
-/* Get a specific byte, without range checks. */
-#define GET_BYTE(X, i) (((X)->p[(i) / ciL] >> (((i) % ciL) * 8)) & 0xff)
-
-static inline void mbedtls_mpi_zeroize(mbedtls_mpi_uint *v, size_t n)
+/* Implementation that should never be optimized out by the compiler */
+static void mbedtls_mpi_zeroize( mbedtls_mpi_uint *v, size_t n )
 {
-    mbedtls_platform_zeroize(v, ciL * n);
+    mbedtls_platform_zeroize( v, ciL * n );
 }
 
 /**
@@ -88,15 +77,18 @@ static inline void mbedtls_mpi_zeroize(mbedtls_mpi_uint *v, size_t n)
  *                 in which case this function is a no-op. If it is
  *                 not \c NULL, it must point to an initialized MPI.
  */
-void mbedtls_mpi_free(mbedtls_mpi *X)
+void mbedtls_mpi_free( mbedtls_mpi *X )
 {
-    if (!X) return;
-    if (X->p)
+    if( !X )
+        return;
+    if( X->p )
     {
-        mbedtls_mpi_zeroize(X->p, X->n);
-        mbedtls_free(X->p);
+        mbedtls_mpi_zeroize( X->p, X->n );
+        mbedtls_free( X->p );
     }
-    mbedtls_mpi_init(X);
+    X->s = 1;
+    X->n = 0;
+    X->p = NULL;
 }
 
 /**
@@ -216,28 +208,35 @@ int mbedtls_mpi_shrink(mbedtls_mpi *X, size_t nblimbs)
  * \return         #MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed.
  * \return         Another negative error code on other kinds of failure.
  */
-int mbedtls_mpi_copy(mbedtls_mpi *X, const mbedtls_mpi *Y)
+int mbedtls_mpi_copy( mbedtls_mpi *X, const mbedtls_mpi *Y )
 {
     int ret = 0;
     size_t i;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(Y);
-    if (X == Y)
-        return 0;
-    if (!Y->n)
+    MPI_VALIDATE_RET( X );
+    MPI_VALIDATE_RET( Y );
+    if( X == Y )
+        return( 0 ); /* copying an MPI onto itself is a no-op */
+    if( Y->n == 0 ) /* Y holds no limbs: X is released and becomes empty too */
     {
-        mbedtls_mpi_free(X);
-        return 0;
+        mbedtls_mpi_free( X );
+        return( 0 );
     }
-    i = MAX(1, mbedtls_mpi_limbs(Y));
+    for( i = Y->n - 1; i > 0; i-- ) /* scan down from the top limb; never goes below limb 0 */
+        if( Y->p[i] != 0 )
+            break;
+    i++; /* index of top non-zero limb -> number of limbs to copy (at least 1) */
     X->s = Y->s;
-    if (X->n < i)
-        MBEDTLS_MPI_CHK(mbedtls_mpi_grow(X, i));
+    if( X->n < i )
+    {
+        MBEDTLS_MPI_CHK( mbedtls_mpi_grow( X, i ) );
+    }
     else
-        mbedtls_mpi_zeroize(X->p + i, X->n - i);
-    memcpy(X->p, Y->p, i * ciL);
+    {
+        mbedtls_platform_zeroize( X->p + i, ( X->n - i ) * ciL ); /* clear X's limbs above the copied range */
+    }
+    memcpy( X->p, Y->p, i * ciL ); /* copy the i low limbs (ciL bytes each) */
 cleanup:
-    return ret;
+    return( ret );
 }
 
 /**
@@ -246,14 +245,14 @@ cleanup:
  * \param X        The first MPI. It must be initialized.
  * \param Y        The second MPI. It must be initialized.
  */
-void mbedtls_mpi_swap(mbedtls_mpi *X, mbedtls_mpi *Y)
+void mbedtls_mpi_swap( mbedtls_mpi *X, mbedtls_mpi *Y )
 {
     mbedtls_mpi T;
-    MPI_VALIDATE(X);
-    MPI_VALIDATE(Y);
-    memcpy(&T, X, sizeof(mbedtls_mpi));
-    memcpy(X, Y, sizeof(mbedtls_mpi));
-    memcpy(Y, &T, sizeof(mbedtls_mpi));
+    MPI_VALIDATE( X );
+    MPI_VALIDATE( Y );
+    memcpy( &T,  X, sizeof( mbedtls_mpi ) ); /* T = X: struct bytes only, limb buffers are not duplicated */
+    memcpy(  X,  Y, sizeof( mbedtls_mpi ) ); /* X = Y */
+    memcpy(  Y, &T, sizeof( mbedtls_mpi ) ); /* Y = old X */
 }
 
 /**
@@ -289,7 +288,8 @@ int mbedtls_mpi_safe_cond_assign(mbedtls_mpi *X,
     MPI_VALIDATE_RET(X);
     MPI_VALIDATE_RET(Y);
     /* make sure assign is 0 or 1 in a time-constant manner */
-    if (Y->n > X->n) MBEDTLS_MPI_CHK(mbedtls_mpi_grow(X, Y->n));
+    if (Y->n > X->n)
+        MBEDTLS_MPI_CHK( mbedtls_mpi_grow( X, Y->n ) );
     assign = (assign | (unsigned char)-assign) >> 7;
     X->s = Select(Y->s, X->s, -assign);
     for (i = 0; i < Y->n; i++)
@@ -297,7 +297,7 @@ int mbedtls_mpi_safe_cond_assign(mbedtls_mpi *X,
     for (i = Y->n; i < X->n; i++)
         X->p[i] &= CONCEAL("r", assign - 1);
 cleanup:
-    return ret;
+    return( ret );
 }
 
 /**
@@ -323,31 +323,30 @@ cleanup:
  * \return         Another negative error code on other kinds of failure.
  *
  */
-int mbedtls_mpi_safe_cond_swap(mbedtls_mpi *X,
-                               mbedtls_mpi *Y,
-                               unsigned char swap)
+int mbedtls_mpi_safe_cond_swap( mbedtls_mpi *X, mbedtls_mpi *Y, unsigned char swap )
 {
     int ret, s;
     size_t i;
     mbedtls_mpi_uint tmp;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(Y);
-    if (X == Y) return (0);
+    MPI_VALIDATE_RET( X );
+    MPI_VALIDATE_RET( Y );
+    if( X == Y )
+        return( 0 ); /* swapping an MPI with itself is a no-op */
     /* make sure swap is 0 or 1 in a time-constant manner */
     swap = (swap | (unsigned char)-swap) >> 7;
-    MBEDTLS_MPI_CHK(mbedtls_mpi_grow(X, Y->n));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_grow(Y, X->n));
+    MBEDTLS_MPI_CHK( mbedtls_mpi_grow( X, Y->n ) ); /* presumably leaves X->n >= Y->n -- confirm in mbedtls_mpi_grow */
+    MBEDTLS_MPI_CHK( mbedtls_mpi_grow( Y, X->n ) ); /* likewise for Y, so the loop below can index both up to X->n */
     s = X->s;
-    X->s = X->s * (1 - swap) + Y->s * swap;
-    Y->s = Y->s * (1 - swap) + s * swap;
-    for (i = 0; i < X->n; i++)
+    X->s = X->s * ( 1 - swap ) + Y->s * swap; /* arithmetic select: swap == 1 takes Y's sign, swap == 0 keeps X's */
+    Y->s = Y->s * ( 1 - swap ) +    s * swap; /* s is X's sign saved before it was overwritten */
+    for( i = 0; i < X->n; i++ ) /* same select per limb; runs over every limb whatever swap is */
     {
         tmp = X->p[i];
-        X->p[i] = X->p[i] * (1 - swap) + Y->p[i] * swap;
-        Y->p[i] = Y->p[i] * (1 - swap) + tmp * swap;
+        X->p[i] = X->p[i] * ( 1 - swap ) + Y->p[i] * swap;
+        Y->p[i] = Y->p[i] * ( 1 - swap ) +     tmp * swap;
     }
 cleanup:
-    return ret;
+    return( ret );
 }
 
 /**
@@ -360,16 +359,16 @@ cleanup:
  * \return         #MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed.
  * \return         Another negative error code on other kinds of failure.
  */
-int mbedtls_mpi_lset(mbedtls_mpi *X, mbedtls_mpi_sint z)
+int mbedtls_mpi_lset( mbedtls_mpi *X, mbedtls_mpi_sint z )
 {
     int ret = MBEDTLS_ERR_THIS_CORRUPTION;
-    MPI_VALIDATE_RET(X);
-    MBEDTLS_MPI_CHK(mbedtls_mpi_grow(X, 1));
-    mbedtls_mpi_zeroize(X->p, X->n);
-    X->p[0] = (z < 0) ? -z : z;
-    X->s = (z < 0) ? -1 : 1;
+    MPI_VALIDATE_RET( X );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_grow( X, 1 ) );
+    mbedtls_platform_zeroize( X->p, X->n * ciL ); /* clear every limb, not just p[0] */
+    X->p[0] = ( z < 0 ) ? -(mbedtls_mpi_uint) z : (mbedtls_mpi_uint) z; /* |z| via unsigned negate: signed -z overflows for the minimum sint */
+    X->s    = ( z < 0 ) ? -1 : 1; /* the sign is kept separately from the magnitude */
 cleanup:
-    return ret;
+    return( ret );
 }
 
 /**
@@ -382,13 +381,18 @@ cleanup:
  *                 of \c X is unset or set.
  * \return         A negative error code on failure.
  */
-int mbedtls_mpi_get_bit(const mbedtls_mpi *X, size_t pos)
+int mbedtls_mpi_get_bit( const mbedtls_mpi *X, size_t pos )
 {
-    MPI_VALIDATE_RET(X);
-    if (X->n * biL <= pos) return 0;
-    return ((X->p[pos / biL] >> (pos % biL)) & 0x01);
+    MPI_VALIDATE_RET( X );
+    if( X->n * biL <= pos )
+        return( 0 ); /* positions beyond the allocated limbs read as 0 */
+    return( ( X->p[pos / biL] >> ( pos % biL ) ) & 0x01 ); /* limb pos / biL, bit pos % biL within it */
 }
 
+/* Get a specific byte, without range checks. */
+#define GET_BYTE( X, i )                                \
+    ( ( ( X )->p[( i ) / ciL] >> ( ( ( i ) % ciL ) * 8 ) ) & 0xff )
+
 /**
  * \brief          Modify a specific bit in an MPI.
  *
@@ -404,23 +408,24 @@ int mbedtls_mpi_get_bit(const mbedtls_mpi *X, size_t pos)
  * \return         #MBEDTLS_ERR_MPI_ALLOC_FAILED if memory allocation failed.
  * \return         Another negative error code on other kinds of failure.
  */
-int mbedtls_mpi_set_bit(mbedtls_mpi *X, size_t pos, unsigned char val)
+int mbedtls_mpi_set_bit( mbedtls_mpi *X, size_t pos, unsigned char val )
 {
     int ret = 0;
     size_t off = pos / biL;
     size_t idx = pos % biL;
-    MPI_VALIDATE_RET(X);
-    if (val && val != 1)
-        return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
-    if (X->n * biL <= pos)
+    MPI_VALIDATE_RET( X );
+    if( val != 0 && val != 1 ) /* only the bit values 0 and 1 are accepted */
+        return( MBEDTLS_ERR_MPI_BAD_INPUT_DATA );
+    if( X->n * biL <= pos ) /* pos lies beyond the currently allocated limbs */
     {
-        if (!val) return 0;
-        MBEDTLS_MPI_CHK(mbedtls_mpi_grow(X, off + 1));
+        if( !val )
+            return( 0 ); /* clearing a bit that is not stored needs no work */
+        MBEDTLS_MPI_CHK( mbedtls_mpi_grow( X, off + 1 ) ); /* need limb index off to exist */
     }
-    X->p[off] &= ~((mbedtls_mpi_uint)0x01 << idx);
-    X->p[off] |= (mbedtls_mpi_uint)val << idx;
+    X->p[off] &= ~( (mbedtls_mpi_uint) 0x01 << idx ); /* clear the target bit ... */
+    X->p[off] |= (mbedtls_mpi_uint) val << idx; /* ... then OR in val */
 cleanup:
-    return ret;
+    return( ret );
 }
 
 /**
@@ -435,13 +440,13 @@ cleanup:
  * \return         The number of bits of value \c 0 before the least significant
  *                 bit of value \c 1 in \p X.
  */
-size_t mbedtls_mpi_lsb(const mbedtls_mpi *X)
+size_t mbedtls_mpi_lsb( const mbedtls_mpi *X )
 {
     size_t i, j, count = 0;
     MBEDTLS_INTERNAL_VALIDATE_RET(X, 0);
-    for (i = 0; i < X->n; i++)
+    for( i = 0; i < X->n; i++ )
     {
-        if (X->p[i])
+        if ( X->p[i] )
             return count + __builtin_ctzll(X->p[i]);
         else
             count += biL;
@@ -452,7 +457,7 @@ size_t mbedtls_mpi_lsb(const mbedtls_mpi *X)
 /*
  * Count leading zero bits in a given integer
  */
-static inline size_t mbedtls_clz(const mbedtls_mpi_uint x)
+static inline size_t mbedtls_clz( const mbedtls_mpi_uint x )
 {
     return x ? __builtin_clzll(x) : biL;
 }
@@ -490,23 +495,23 @@ size_t mbedtls_mpi_bitlen(const mbedtls_mpi *X)
  * \return         The least number of bytes capable of storing
  *                 the absolute value of \p X.
  */
-size_t mbedtls_mpi_size(const mbedtls_mpi *X)
+size_t mbedtls_mpi_size( const mbedtls_mpi *X )
 {
-    return (mbedtls_mpi_bitlen(X) + 7) >> 3;
+    return( ( mbedtls_mpi_bitlen( X ) + 7 ) >> 3 ); /* bit length rounded up to whole bytes */
 }
 
 /*
  * Convert an ASCII character to digit value
  */
-static int mpi_get_digit(mbedtls_mpi_uint *d, int radix, char c)
+static int mpi_get_digit( mbedtls_mpi_uint *d, int radix, char c )
 {
     *d = 255;
-    if (c >= 0x30 && c <= 0x39) *d = c - 0x30;
-    if (c >= 0x41 && c <= 0x46) *d = c - 0x37;
-    if (c >= 0x61 && c <= 0x66) *d = c - 0x57;
-    if (*d >= (mbedtls_mpi_uint)radix)
-        return MBEDTLS_ERR_MPI_INVALID_CHARACTER;
-    return 0;
+    if( c >= 0x30 && c <= 0x39 ) *d = c - 0x30; /* '0'..'9' -> 0..9 */
+    if( c >= 0x41 && c <= 0x46 ) *d = c - 0x37; /* 'A'..'F' -> 10..15 */
+    if( c >= 0x61 && c <= 0x66 ) *d = c - 0x57; /* 'a'..'f' -> 10..15 */
+    if( *d >= (mbedtls_mpi_uint) radix ) /* also rejects non-digits: *d is still 255 for them */
+        return( MBEDTLS_ERR_MPI_INVALID_CHARACTER );
+    return( 0 );
 }
 
 /**
@@ -519,87 +524,94 @@ static int mpi_get_digit(mbedtls_mpi_uint *d, int radix, char c)
  * \return         \c 0 if successful.
  * \return         A negative error code on failure.
  */
-int mbedtls_mpi_read_string(mbedtls_mpi *X, int radix, const char *s)
+int mbedtls_mpi_read_string( mbedtls_mpi *X, int radix, const char *s )
 {
     int ret = MBEDTLS_ERR_THIS_CORRUPTION;
     size_t i, j, slen, n;
     mbedtls_mpi_uint d;
     mbedtls_mpi T;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(s);
-    if (radix < 2 || radix > 16)
-        return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
-    mbedtls_mpi_init(&T);
-    slen = strlen(s);
-    if (radix == 16)
+    MPI_VALIDATE_RET( X );
+    MPI_VALIDATE_RET( s );
+    if( radix < 2 || radix > 16 )
+        return( MBEDTLS_ERR_MPI_BAD_INPUT_DATA );
+    mbedtls_mpi_init( &T );
+    slen = strlen( s );
+    if( radix == 16 ) /* hex path: each character maps onto exactly 4 bits of a limb */
     {
-        if (slen > MPI_SIZE_T_MAX >> 2)
-            return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
-        n = BITS_TO_LIMBS(slen << 2);
-        MBEDTLS_MPI_CHK(mbedtls_mpi_grow(X, n));
-        MBEDTLS_MPI_CHK(mbedtls_mpi_lset(X, 0));
-        for (i = slen, j = 0; i > 0; i--, j++)
+        if( slen > MPI_SIZE_T_MAX >> 2 ) /* guards the slen << 2 on the next line */
+            return( MBEDTLS_ERR_MPI_BAD_INPUT_DATA );
+        n = BITS_TO_LIMBS( slen << 2 ); /* 4 bits per input character */
+        MBEDTLS_MPI_CHK( mbedtls_mpi_grow( X, n ) );
+        MBEDTLS_MPI_CHK( mbedtls_mpi_lset( X, 0 ) );
+        for( i = slen, j = 0; i > 0; i--, j++ ) /* walk the string right to left; j counts nibbles from the low end */
         {
-            if (i == 1 && s[i - 1] == '-')
+            if( i == 1 && s[i - 1] == '-' ) /* a '-' is only accepted as the very first character */
             {
                 X->s = -1;
                 break;
             }
-            MBEDTLS_MPI_CHK(mpi_get_digit(&d, radix, s[i - 1]));
-            X->p[j / (2 * ciL)] |= d << ((j % (2 * ciL)) << 2);
+            MBEDTLS_MPI_CHK( mpi_get_digit( &d, radix, s[i - 1] ) );
+            X->p[j / ( 2 * ciL )] |= d << ( ( j % ( 2 * ciL ) ) << 2 ); /* 2 * ciL nibbles per limb */
         }
     }
     else
     {
-        MBEDTLS_MPI_CHK(mbedtls_mpi_lset(X, 0));
-        for (i = 0; i < slen; i++)
+        MBEDTLS_MPI_CHK( mbedtls_mpi_lset( X, 0 ) );
+        for( i = 0; i < slen; i++ ) /* other radixes: consume the digits left to right */
         {
-            if (!i && s[i] == '-')
+            if( i == 0 && s[i] == '-' ) /* a '-' is only accepted as the very first character */
             {
                 X->s = -1;
                 continue;
             }
-            MBEDTLS_MPI_CHK(mpi_get_digit(&d, radix, s[i]));
-            MBEDTLS_MPI_CHK(mbedtls_mpi_mul_int(&T, X, radix));
-            if (X->s == 1)
-                MBEDTLS_MPI_CHK(mbedtls_mpi_add_int(X, &T, d));
+            MBEDTLS_MPI_CHK( mpi_get_digit( &d, radix, s[i] ) );
+            MBEDTLS_MPI_CHK( mbedtls_mpi_mul_int( &T, X, radix ) );
+            if( X->s == 1 ) /* the sign chosen above decides whether d is added or subtracted */
+            {
+                MBEDTLS_MPI_CHK( mbedtls_mpi_add_int( X, &T, d ) );
+            }
             else
-                MBEDTLS_MPI_CHK(mbedtls_mpi_sub_int(X, &T, d));
+            {
+                MBEDTLS_MPI_CHK( mbedtls_mpi_sub_int( X, &T, d ) );
+            }
         }
     }
 cleanup:
-    mbedtls_mpi_free(&T);
-    return ret;
+    mbedtls_mpi_free( &T ); /* reached on success and on every MBEDTLS_MPI_CHK failure */
+    return( ret );
 }
 
 /*
  * Helper to write the digits high-order first.
  */
-static int mpi_write_hlp(mbedtls_mpi *X, int radix, char **p,
-                         const size_t buflen)
+static int mpi_write_hlp( mbedtls_mpi *X, int radix,
+                          char **p, const size_t buflen )
 {
     int ret = MBEDTLS_ERR_THIS_CORRUPTION;
     mbedtls_mpi_uint r;
     size_t length = 0;
     char *p_end = *p + buflen;
-    do {
-        if (length >= buflen)
-            return MBEDTLS_ERR_MPI_BUFFER_TOO_SMALL;
-        MBEDTLS_MPI_CHK(mbedtls_mpi_mod_int(&r, X, radix));
-        MBEDTLS_MPI_CHK(mbedtls_mpi_div_int(X, NULL, X, radix));
+    do /* digits come out least significant first, so they are stored backwards from p_end */
+    {
+        if( length >= buflen ) /* no room left for another digit */
+        {
+            return( MBEDTLS_ERR_MPI_BUFFER_TOO_SMALL );
+        }
+        MBEDTLS_MPI_CHK( mbedtls_mpi_mod_int( &r, X, radix ) );
+        MBEDTLS_MPI_CHK( mbedtls_mpi_div_int( X, NULL, X, radix ) ); /* X is consumed: the caller passes a scratch copy */
         /*
          * Write the residue in the current position, as an ASCII character.
          */
-        if (r < 0xA)
-            *(--p_end) = (char)('0' + r);
+        if( r < 0xA )
+            *(--p_end) = (char)( '0' + r );
         else
-            *(--p_end) = (char)('A' + (r - 0xA));
+            *(--p_end) = (char)( 'A' + ( r - 0xA ) ); /* digits 10..15 use upper-case letters */
         length++;
-    } while (!mbedtls_mpi_is_zero(X));
-    memmove(*p, p_end, length);
+    } while( mbedtls_mpi_cmp_int( X, 0 ) != 0 ); /* at least one digit is always written */
+    memmove( *p, p_end, length ); /* slide the digits from the buffer's tail to its start */
     *p += length;
 cleanup:
-    return ret;
+    return( ret );
 }
 
 /**
@@ -624,74 +636,75 @@ cleanup:
  *                 size of \p buf required for a successful call.
  * \return         Another negative error code on different kinds of failure.
  */
-int mbedtls_mpi_write_string(const mbedtls_mpi *X, int radix, char *buf,
-                             size_t buflen, size_t *olen)
+int mbedtls_mpi_write_string( const mbedtls_mpi *X, int radix,
+                              char *buf, size_t buflen, size_t *olen )
 {
     int ret = 0;
     size_t n;
     char *p;
     mbedtls_mpi T;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(olen);
-    MPI_VALIDATE_RET(!buflen || buf);
-    if (radix < 2 || radix > 16)
-        return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
-    n = mbedtls_mpi_bitlen(X); /* Number of bits necessary to present `n`. */
-    if (radix >= 4)
-        n >>= 1; /* Number of 4-adic digits necessary to present
-                  * `n`. If radix > 4, this might be a strict
-                  * overapproximation of the number of
-                  * radix-adic digits needed to present `n`. */
-    if (radix >= 16)
-        n >>= 1;    /* Number of hexadecimal digits necessary to
-                     * present `n`. */
-    n += 1;       /* Terminating null byte */
-    n += 1;       /* Compensate for the divisions above, which round down `n`
-                   * in case it's not even. */
-    n += 1;       /* Potential '-'-sign. */
-    n += (n & 1); /* Make n even to have enough space for hexadecimal writing,
-                   * which always uses an even number of hex-digits. */
-    if (buflen < n)
+    MPI_VALIDATE_RET( X    );
+    MPI_VALIDATE_RET( olen );
+    MPI_VALIDATE_RET( buflen == 0 || buf );
+    if( radix < 2 || radix > 16 )
+        return( MBEDTLS_ERR_MPI_BAD_INPUT_DATA );
+    n = mbedtls_mpi_bitlen( X ); /* Number of bits necessary to present `X`. */
+    if( radix >=  4 ) n >>= 1;   /* Number of 4-adic digits necessary to present
+                                  * `X`. If radix > 4, this might be a strict
+                                  * overapproximation of the number of
+                                  * radix-adic digits needed to present `X`. */
+    if( radix >= 16 ) n >>= 1;   /* Number of hexadecimal digits necessary to
+                                  * present `X`. */
+    n += 1; /* Terminating null byte */
+    n += 1; /* Compensate for the divisions above, which round down `n`
+             * in case it's not even. */
+    n += 1; /* Potential '-'-sign. */
+    n += ( n & 1 ); /* Make n even to have enough space for hexadecimal writing,
+                     * which always uses an even number of hex-digits. */
+    if( buflen < n ) /* too small: report the required size through *olen */
     {
         *olen = n;
-        return MBEDTLS_ERR_MPI_BUFFER_TOO_SMALL;
+        return( MBEDTLS_ERR_MPI_BUFFER_TOO_SMALL );
     }
     p = buf;
-    mbedtls_mpi_init(&T);
-    if (X->s == -1)
+    mbedtls_mpi_init( &T );
+    if( X->s == -1 ) /* emit the sign first; it uses up one byte of the buffer */
     {
         *p++ = '-';
         buflen--;
     }
-    if (radix == 16)
+    if( radix == 16 ) /* hex path: print the limbs byte by byte, most significant first */
     {
         int c;
         size_t i, j, k;
-        for (i = X->n, k = 0; i > 0; i--)
+        for( i = X->n, k = 0; i > 0; i-- ) /* k turns 1 once the first byte has been written */
         {
-            for (j = ciL; j > 0; j--)
+            for( j = ciL; j > 0; j-- )
             {
-                c = (X->p[i - 1] >> ((j - 1) << 3)) & 0xFF;
-                if (!c && !k && (i + j) != 2) continue;
-                *(p++) = "0123456789ABCDEF"[c / 16];
-                *(p++) = "0123456789ABCDEF"[c % 16];
+                c = ( X->p[i - 1] >> ( ( j - 1 ) << 3) ) & 0xFF; /* byte j-1 of limb i-1 */
+                if( c == 0 && k == 0 && ( i + j ) != 2 ) /* skip leading zero bytes, except the lowest (i == 1, j == 1) */
+                    continue;
+                *(p++) = "0123456789ABCDEF" [c / 16]; /* high nibble */
+                *(p++) = "0123456789ABCDEF" [c % 16]; /* low nibble */
                 k = 1;
             }
         }
     }
     else
     {
-        MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&T, X));
-        if (T.s == -1) T.s = 1;
-        MBEDTLS_MPI_CHK(mpi_write_hlp(&T, radix, &p, buflen));
+        MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &T, X ) ); /* mpi_write_hlp() modifies its operand, so work on a copy */
+        if( T.s == -1 )
+            T.s = 1; /* the '-' has already been written above */
+        MBEDTLS_MPI_CHK( mpi_write_hlp( &T, radix, &p, buflen ) );
     }
     *p++ = '\0';
     *olen = p - buf;
 cleanup:
-    mbedtls_mpi_free(&T);
-    return ret;
+    mbedtls_mpi_free( &T );
+    return( ret );
 }
 
+#if defined(MBEDTLS_FS_IO)
 /**
  * \brief          Read an MPI from a line in an opened file.
  *
@@ -713,7 +726,7 @@ cleanup:
  *                 is too small.
  * \return         Another negative error code on failure.
  */
-int mbedtls_mpi_read_file(mbedtls_mpi *X, int radix, FILE *fin)
+int mbedtls_mpi_read_file( mbedtls_mpi *X, int radix, FILE *fin )
 {
     mbedtls_mpi_uint d;
     size_t slen;
@@ -722,32 +735,24 @@ int mbedtls_mpi_read_file(mbedtls_mpi *X, int radix, FILE *fin)
      * Buffer should have space for (short) label and decimal formatted MPI,
      * newline characters and '\0'
      */
-    char s[MBEDTLS_MPI_RW_BUFFER_SIZE];
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(fin);
-    if (radix < 2 || radix > 16)
-        return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
-    mbedtls_platform_zeroize(s, sizeof(s));
-    if (!fgets(s, sizeof(s) - 1, fin))
-        return MBEDTLS_ERR_MPI_FILE_IO_ERROR;
-    slen = strlen(s);
-    if (slen == sizeof(s) - 2)
-        return MBEDTLS_ERR_MPI_BUFFER_TOO_SMALL;
-    if (slen > 0 && s[slen - 1] == '\n')
-    {
-        slen--;
-        s[slen] = '\0';
-    }
-    if (slen > 0 && s[slen - 1] == '\r')
-    {
-        slen--;
-        s[slen] = '\0';
-    }
+    char s[ MBEDTLS_MPI_RW_BUFFER_SIZE ];
+    MPI_VALIDATE_RET( X   );
+    MPI_VALIDATE_RET( fin );
+    if( radix < 2 || radix > 16 )
+        return( MBEDTLS_ERR_MPI_BAD_INPUT_DATA );
+    mbedtls_platform_zeroize( s, sizeof( s ) );
+    if( fgets( s, sizeof( s ) - 1, fin ) == NULL )
+        return( MBEDTLS_ERR_MPI_FILE_IO_ERROR );
+    slen = strlen( s );
+    if( slen == sizeof( s ) - 2 )
+        return( MBEDTLS_ERR_MPI_BUFFER_TOO_SMALL );
+    if( slen > 0 && s[slen - 1] == '\n' ) { slen--; s[slen] = '\0'; }
+    if( slen > 0 && s[slen - 1] == '\r' ) { slen--; s[slen] = '\0'; }
     p = s + slen;
-    while (p-- > s)
-        if (mpi_get_digit(&d, radix, *p))
+    while( p-- > s )
+        if( mpi_get_digit( &d, radix, *p ) != 0 )
             break;
-    return mbedtls_mpi_read_string(X, radix, p + 1);
+    return( mbedtls_mpi_read_string( X, radix, p + 1 ) );
 }
 
 /**
@@ -765,8 +770,7 @@ int mbedtls_mpi_read_file(mbedtls_mpi *X, int radix, FILE *fin)
  * \return         \c 0 if successful.
  * \return         A negative error code on failure.
  */
-int mbedtls_mpi_write_file(const char *p, const mbedtls_mpi *X, int radix,
-                           FILE *fout)
+int mbedtls_mpi_write_file( const char *p, const mbedtls_mpi *X, int radix, FILE *fout )
 {
     int ret = MBEDTLS_ERR_THIS_CORRUPTION;
     size_t n, slen, plen;
@@ -774,35 +778,43 @@ int mbedtls_mpi_write_file(const char *p, const mbedtls_mpi *X, int radix,
      * Buffer should have space for (short) label and decimal formatted MPI,
      * newline characters and '\0'
      */
-    char s[MBEDTLS_MPI_RW_BUFFER_SIZE];
-    MPI_VALIDATE_RET(X);
-    if (radix < 2 || radix > 16)
-        return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
-    mbedtls_platform_zeroize(s, sizeof(s));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_write_string(X, radix, s, sizeof(s) - 2, &n));
-    if (!p) p = "";
-    plen = strlen(p);
-    slen = strlen(s);
+    char s[ MBEDTLS_MPI_RW_BUFFER_SIZE ];
+    MPI_VALIDATE_RET( X );
+    if( radix < 2 || radix > 16 )
+        return( MBEDTLS_ERR_MPI_BAD_INPUT_DATA );
+    mbedtls_platform_zeroize( s, sizeof( s ) );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_write_string( X, radix, s, sizeof( s ) - 2, &n ) );
+    if( p == NULL ) p = "";
+    plen = strlen( p );
+    slen = strlen( s );
     s[slen++] = '\r';
     s[slen++] = '\n';
-    if (fout)
+    if( fout )
     {
-        if (fwrite(p, 1, plen, fout) != plen || fwrite(s, 1, slen, fout) != slen)
-            return MBEDTLS_ERR_MPI_FILE_IO_ERROR;
+        if( fwrite( p, 1, plen, fout ) != plen ||
+            fwrite( s, 1, slen, fout ) != slen )
+            return( MBEDTLS_ERR_MPI_FILE_IO_ERROR );
     }
     else
-    {
-        mbedtls_printf("%s%s", p, s);
-    }
+        mbedtls_printf( "%s%s", p, s );
 cleanup:
-    return ret;
+    return( ret );
 }
+#endif /* MBEDTLS_FS_IO */
 
-static void mpi_bigendian_to_host(mbedtls_mpi_uint *const p, size_t limbs)
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define mpi_uint_bigendian_to_host(x) (x)
+#elif __SIZEOF_LONG__ == 8
+#define mpi_uint_bigendian_to_host(x) __builtin_bswap64(x)
+#elif __SIZEOF_LONG__ == 4
+#define mpi_uint_bigendian_to_host(x) __builtin_bswap32(x)
+#endif
+
+static void mpi_bigendian_to_host( mbedtls_mpi_uint * const p, size_t limbs )
 {
     mbedtls_mpi_uint *cur_limb_left;
     mbedtls_mpi_uint *cur_limb_right;
-    if (!limbs)
+    if( !limbs )
         return;
     /*
      * Traverse limbs and
@@ -813,14 +825,15 @@ static void mpi_bigendian_to_host(mbedtls_mpi_uint *const p, size_t limbs)
      * than the right index (it's not a problem if limbs is odd and the
      * indices coincide in the last iteration).
      */
-    for (cur_limb_left = p, cur_limb_right = p + (limbs - 1);
-         cur_limb_left <= cur_limb_right; cur_limb_left++, cur_limb_right--)
+    for( cur_limb_left = p, cur_limb_right = p + ( limbs - 1 );
+         cur_limb_left <= cur_limb_right;
+         cur_limb_left++, cur_limb_right-- )
     {
         mbedtls_mpi_uint tmp;
         /* Note that if cur_limb_left == cur_limb_right,
          * this code effectively swaps the bytes only once. */
-        tmp = mpi_uint_bigendian_to_host(*cur_limb_left);
-        *cur_limb_left = mpi_uint_bigendian_to_host(*cur_limb_right);
+        tmp             = mpi_uint_bigendian_to_host( *cur_limb_left  );
+        *cur_limb_left  = mpi_uint_bigendian_to_host( *cur_limb_right );
         *cur_limb_right = tmp;
     }
 }
@@ -923,13 +936,13 @@ int mbedtls_mpi_read_binary(mbedtls_mpi *X, const unsigned char *p, size_t n)
  *                 large enough to hold the value of \p X.
  * \return         Another negative error code on different kinds of failure.
  */
-int mbedtls_mpi_write_binary_le(const mbedtls_mpi *X, unsigned char *buf,
-                                size_t buflen)
+int mbedtls_mpi_write_binary_le( const mbedtls_mpi *X,
+                                 unsigned char *buf, size_t buflen )
 {
     size_t stored_bytes = X->n * ciL;
     size_t bytes_to_copy;
     size_t i;
-    if (stored_bytes < buflen)
+    if( stored_bytes < buflen )
     {
         bytes_to_copy = stored_bytes;
     }
@@ -938,19 +951,20 @@ int mbedtls_mpi_write_binary_le(const mbedtls_mpi *X, unsigned char *buf,
         bytes_to_copy = buflen;
         /* The output buffer is smaller than the allocated size of X.
          * However X may fit if its leading bytes are zero. */
-        for (i = bytes_to_copy; i < stored_bytes; i++)
+        for( i = bytes_to_copy; i < stored_bytes; i++ )
         {
-            if (GET_BYTE(X, i))
-                return MBEDTLS_ERR_MPI_BUFFER_TOO_SMALL;
+            if( GET_BYTE( X, i ) != 0 )
+                return( MBEDTLS_ERR_MPI_BUFFER_TOO_SMALL );
         }
     }
-    for (i = 0; i < bytes_to_copy; i++) buf[i] = GET_BYTE(X, i);
-    if (stored_bytes < buflen)
+    for( i = 0; i < bytes_to_copy; i++ )
+        buf[i] = GET_BYTE( X, i );
+    if( stored_bytes < buflen )
     {
         /* Write trailing 0 bytes */
-        mbedtls_platform_zeroize(buf + stored_bytes, buflen - stored_bytes);
+        mbedtls_platform_zeroize( buf + stored_bytes, buflen - stored_bytes );
     }
-    return 0;
+    return( 0 );
 }
 
 /**
@@ -968,17 +982,17 @@ int mbedtls_mpi_write_binary_le(const mbedtls_mpi *X, unsigned char *buf,
  *                 large enough to hold the value of \p X.
  * \return         Another negative error code on different kinds of failure.
  */
-int mbedtls_mpi_write_binary(const mbedtls_mpi *X, unsigned char *buf,
-                             size_t buflen)
+int mbedtls_mpi_write_binary( const mbedtls_mpi *X,
+                              unsigned char *buf, size_t buflen )
 {
     size_t stored_bytes;
     size_t bytes_to_copy;
     unsigned char *p;
     size_t i;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(!buflen || buf);
+    MPI_VALIDATE_RET( X );
+    MPI_VALIDATE_RET( buflen == 0 || buf );
     stored_bytes = X->n * ciL;
-    if (stored_bytes < buflen)
+    if( stored_bytes < buflen )
     {
         /* There is enough space in the output buffer. Write initial
          * null bytes and record the position at which to start
@@ -987,7 +1001,7 @@ int mbedtls_mpi_write_binary(const mbedtls_mpi *X, unsigned char *buf,
          * number. */
         bytes_to_copy = stored_bytes;
         p = buf + buflen - stored_bytes;
-        mbedtls_platform_zeroize(buf, buflen - stored_bytes);
+        mbedtls_platform_zeroize( buf, buflen - stored_bytes );
     }
     else
     {
@@ -995,14 +1009,352 @@ int mbedtls_mpi_write_binary(const mbedtls_mpi *X, unsigned char *buf,
          * However X may fit if its leading bytes are zero. */
         bytes_to_copy = buflen;
         p = buf;
-        for (i = bytes_to_copy; i < stored_bytes; i++)
+        for( i = bytes_to_copy; i < stored_bytes; i++ )
         {
-            if (GET_BYTE(X, i))
-                return MBEDTLS_ERR_MPI_BUFFER_TOO_SMALL;
+            if( GET_BYTE( X, i ) != 0 )
+                return( MBEDTLS_ERR_MPI_BUFFER_TOO_SMALL );
         }
     }
-    for (i = 0; i < bytes_to_copy; i++) p[bytes_to_copy - i - 1] = GET_BYTE(X, i);
-    return 0;
+    for( i = 0; i < bytes_to_copy; i++ )
+        p[bytes_to_copy - i - 1] = GET_BYTE( X, i );
+    return( 0 );
+}
+
+/**
+ * \brief          Compare the absolute values of two MPIs.
+ *
+ * \param X        The left-hand MPI. This must point to an initialized MPI.
+ * \param Y        The right-hand MPI. This must point to an initialized MPI.
+ *
+ * \return         \c 1 if `|X|` is greater than `|Y|`.
+ * \return         \c -1 if `|X|` is less than `|Y|`.
+ * \return         \c 0 if `|X|` is equal to `|Y|`.
+ */
+int mbedtls_mpi_cmp_abs( const mbedtls_mpi *X, const mbedtls_mpi *Y )
+{
+    size_t i, j;
+    MPI_VALIDATE_RET( X );
+    MPI_VALIDATE_RET( Y );
+    i = mbedtls_mpi_limbs(X); /* presumably the significant limb count - confirm */
+    j = mbedtls_mpi_limbs(Y);
+    if( !i && !j )
+        return( 0 );
+    if( i > j ) return(  1 );
+    if( j > i ) return( -1 );
+    for( ; i > 0; i-- ) /* equal limb counts: compare from the top limb down */
+    {
+        if( X->p[i - 1] > Y->p[i - 1] ) return(  1 );
+        if( X->p[i - 1] < Y->p[i - 1] ) return( -1 );
+    }
+    return( 0 );
+}
+
+/**
+ * \brief          Compare two MPIs.
+ *
+ * \param X        The left-hand MPI. This must point to an initialized MPI.
+ * \param Y        The right-hand MPI. This must point to an initialized MPI.
+ *
+ * \return         \c 1 if \p X is greater than \p Y.
+ * \return         \c -1 if \p X is less than \p Y.
+ * \return         \c 0 if \p X is equal to \p Y.
+ */
+int mbedtls_mpi_cmp_mpi( const mbedtls_mpi *X, const mbedtls_mpi *Y )
+{
+    size_t i, j;
+    MPI_VALIDATE_RET( X );
+    MPI_VALIDATE_RET( Y );
+    i = mbedtls_mpi_limbs(X); /* presumably the significant limb count - confirm */
+    j = mbedtls_mpi_limbs(Y);
+    if( !i && !j )
+        return( 0 );
+    if( i > j ) return(  X->s ); /* X has more limbs: the result follows X's sign */
+    if( j > i ) return( -Y->s ); /* Y has more limbs: the result opposes Y's sign */
+    if( X->s > 0 && Y->s < 0 ) return(  1 );
+    if( Y->s > 0 && X->s < 0 ) return( -1 );
+    for( ; i > 0; i-- ) /* mixed signs returned above: compare limbs, top down */
+    {
+        if( X->p[i - 1] > Y->p[i - 1] ) return(  X->s );
+        if( X->p[i - 1] < Y->p[i - 1] ) return( -X->s );
+    }
+    return( 0 );
+}
+
+/**
+ * Decide if an integer is less than the other, without branches.
+ *
+ * \param x         First integer.
+ * \param y         Second integer.
+ *
+ * \return          1 if \p x is less than \p y, 0 otherwise
+ */
+static unsigned ct_lt_mpi_uint( const mbedtls_mpi_uint x,
+                                const mbedtls_mpi_uint y )
+{
+    mbedtls_mpi_uint ret;
+    mbedtls_mpi_uint cond;
+    /*
+     * The MSB of cond is set iff the MSBs of the operands differ (only that bit is used).
+     */
+    cond = ( x ^ y );
+    /*
+     * If the MSB are the same then the difference x-y will be negative (and
+     * have its MSB set to 1 during conversion to unsigned) if and only if x<y.
+     */
+    ret = ( x - y ) & ~cond;
+    /*
+     * If the MSB are different, then the operand with the MSB of 1 is the
+     * bigger. (That is if y has MSB of 1, then x<y is true and it is false if
+     * the MSB of y is 0.)
+     */
+    ret |= y & cond;
+
+    ret = ret >> ( biL - 1 ); /* move the MSB down to bit 0 (biL: presumably bits per limb) */
+    return (unsigned) ret;
+}
+
+/**
+ * \brief          Check if an MPI is less than the other in constant time.
+ *
+ * \param X        The left-hand MPI. This must point to an initialized MPI
+ *                 with the same allocated length as Y.
+ * \param Y        The right-hand MPI. This must point to an initialized MPI
+ *                 with the same allocated length as X.
+ * \param ret      The result of the comparison:
+ *                 \c 1 if \p X is less than \p Y.
+ *                 \c 0 if \p X is greater than or equal to \p Y.
+ *
+ * \return         0 on success.
+ * \return         MBEDTLS_ERR_MPI_BAD_INPUT_DATA if the allocated length of
+ *                 the two input MPIs is not the same.
+ */
+int mbedtls_mpi_lt_mpi_ct( const mbedtls_mpi *X, const mbedtls_mpi *Y,
+        unsigned *ret )
+{
+    size_t i;
+    /* The value of any of these variables is either 0 or 1 at all times. */
+    unsigned cond, done, X_is_negative, Y_is_negative;
+    MPI_VALIDATE_RET( X );
+    MPI_VALIDATE_RET( Y );
+    MPI_VALIDATE_RET( ret );
+    if( X->n != Y->n )
+        return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
+    /*
+     * Set N_is_negative to 1 if N < 0, 0 if N >= 0, by extracting bit 1 of
+     * N->s: we know that N->s == 1 if N >= 0 and N->s == -1 if N < 0.
+     */
+    X_is_negative = ( X->s & 2 ) >> 1;
+    Y_is_negative = ( Y->s & 2 ) >> 1;
+    /*
+     * If the signs are different, then the positive operand is the bigger.
+     * That is if X is negative (X_is_negative == 1), then X < Y is true and it
+     * is false if X is positive (X_is_negative == 0).
+     */
+    cond = ( X_is_negative ^ Y_is_negative );
+    *ret = cond & X_is_negative;
+    /*
+     * This is a constant-time function. We might have the result, but we still
+     * need to go through the loop. Record if we have the result already.
+     */
+    done = cond;
+    for( i = X->n; i > 0; i-- )
+    {
+        /*
+         * If Y->p[i - 1] < X->p[i - 1] then X < Y is true if and only if both
+         * X and Y are negative.
+         *
+         * Again even if we can make a decision, we just mark the result and
+         * the fact that we are done and continue looping.
+         */
+        cond = ct_lt_mpi_uint( Y->p[i - 1], X->p[i - 1] );
+        *ret |= cond & ( 1 - done ) & X_is_negative;
+        done |= cond;
+        /*
+         * If X->p[i - 1] < Y->p[i - 1] then X < Y is true if and only if both
+         * X and Y are positive.
+         *
+         * Again even if we can make a decision, we just mark the result and
+         * the fact that we are done and continue looping.
+         */
+        cond = ct_lt_mpi_uint( X->p[i - 1], Y->p[i - 1] );
+        *ret |= cond & ( 1 - done ) & ( 1 - X_is_negative );
+        done |= cond;
+    }
+    return( 0 );
+}
+
+/**
+ * \brief          Compare an MPI with an integer.
+ *
+ * \param X        The left-hand MPI. This must point to an initialized MPI.
+ * \param z        The integer value to compare \p X to.
+ *
+ * \return         \c 1 if \p X is greater than \p z.
+ * \return         \c -1 if \p X is less than \p z.
+ * \return         \c 0 if \p X is equal to \p z.
+ */
+int mbedtls_mpi_cmp_int( const mbedtls_mpi *X, mbedtls_mpi_sint z )
+{
+    mbedtls_mpi Y;
+    mbedtls_mpi_uint p[1];
+    MPI_VALIDATE_RET( X );
+    *p  = ( z < 0 ) ? 0 - (mbedtls_mpi_uint) z : (mbedtls_mpi_uint) z; /* |z|; a plain -z overflows at the minimum z */
+    Y.s = ( z < 0 ) ? -1 : 1;
+    Y.n = 1;
+    Y.p = p;
+    return( mbedtls_mpi_cmp_mpi( X, &Y ) );
+}
+
+/**
+ * \brief          Perform an unsigned addition of MPIs: X = |A| + |B|
+ *
+ * \param X        The destination MPI. This must point to an initialized MPI.
+ * \param A        The first summand. This must point to an initialized MPI.
+ * \param B        The second summand. This must point to an initialized MPI.
+ *
+ * \return         \c 0 if successful.
+ * \return         #MBEDTLS_ERR_MPI_ALLOC_FAILED if a memory allocation failed.
+ * \return         Another negative error code on different kinds of failure.
+ */
+int mbedtls_mpi_add_abs( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi *B )
+{
+    int ret = MBEDTLS_ERR_THIS_CORRUPTION;
+    size_t i, j;
+    mbedtls_mpi_uint *o, *p, c, tmp;
+    MPI_VALIDATE_RET( X );
+    MPI_VALIDATE_RET( A );
+    MPI_VALIDATE_RET( B );
+    if( X == B )
+    {
+        const mbedtls_mpi *T = A; A = X; B = T; /* swap operands so that X aliases A */
+    }
+    if( X != A )
+        MBEDTLS_MPI_CHK( mbedtls_mpi_copy( X, A ) );
+    /*
+     * X should always be positive as a result of unsigned additions.
+     */
+    X->s = 1;
+    for( j = B->n; j > 0; j-- ) /* j = index of B's highest nonzero limb, plus one */
+        if( B->p[j - 1] != 0 )
+            break;
+    MBEDTLS_MPI_CHK( mbedtls_mpi_grow( X, j ) );
+    o = B->p; p = X->p; c = 0;
+    /*
+     * tmp is used because it might happen that p == o
+     */
+    for( i = 0; i < j; i++, o++, p++ )
+    {
+        tmp= *o;
+        *p +=  c; c  = ( *p <  c ); /* add the carry in; wraparound yields a new carry */
+        *p += tmp; c += ( *p < tmp ); /* add B's limb; wraparound yields a carry */
+    }
+    while( c != 0 ) /* ripple the leftover carry into higher limbs, growing X if needed */
+    {
+        if( i >= X->n )
+        {
+            MBEDTLS_MPI_CHK( mbedtls_mpi_grow( X, i + 1 ) );
+            p = X->p + i; /* X->p is re-read after the grow call */
+        }
+        *p += c; c = ( *p < c ); i++; p++;
+    }
+cleanup:
+    return( ret );
+}
+
+/**
+ * Helper for mbedtls_mpi subtraction.
+ *
+ * Calculate d = a - b where d, a, and b have the same size.
+ * This function operates modulo (2^biL)^n and returns the carry
+ * (1 if there was a wraparound, i.e. if `a < b`, and 0 otherwise).
+ *
+ * \param[out] d        Result of subtraction.
+ * \param[in] a         Left operand.
+ * \param[in] b         Right operand.
+ * \param n             Number of limbs of \p a and \p b.
+ * \return              1 if `a < b` (the subtraction wrapped around).
+ *                      0 if `a >= b`.
+ */
+forceinline mbedtls_mpi_uint mpi_sub_hlp(mbedtls_mpi_uint *d,
+                                         const mbedtls_mpi_uint *a,
+                                         const mbedtls_mpi_uint *b,
+                                         size_t n)
+{
+    size_t i;
+    unsigned char cf;
+    mbedtls_mpi_uint c, x;
+    cf = c = i = 0;
+#ifdef __x86_64__
+    if (!n) return 0; /* the asm loop tests n only after its body, so n == 0 must not enter */
+    asm volatile("xor\t%1,%1\n\t"              /* x = 0; xor also clears CF */
+                 ".align\t16\n1:\t"
+                 "mov\t(%5,%3,8),%1\n\t"       /* x = a[i] (index scaled by 8 bytes) */
+                 "sbb\t(%6,%3,8),%1\n\t"       /* x -= b[i] + CF */
+                 "mov\t%1,(%4,%3,8)\n\t"       /* d[i] = x */
+                 "lea\t1(%3),%3\n\t"           /* ++i; lea does not touch the flags */
+                 "dec\t%2\n\t"                 /* --n; dec leaves CF intact for the next sbb */
+                 "jnz\t1b"                     /* repeat until n reaches 0 */
+                 : "=@ccb"(cf), "=&r"(x), "+&c"(n), "=&r"(i) /* cf = CF after the last sbb */
+                 : "r"(d), "r"(a), "r"(b), "3"(0) /* "3"(0): i starts at 0 */
+                 : "cc", "memory");
+    return cf;
+#else
+    for (; i < n; ++i)
+        SBB(d[i], a[i], b[i], c, c);
+    return c;
+#endif
+}
+
+/**
+ * \brief          Perform an unsigned subtraction of MPIs: X = |A| - |B|
+ *
+ * \param X        The destination MPI. This must point to an initialized MPI.
+ * \param A        The minuend. This must point to an initialized MPI.
+ * \param B        The subtrahend. This must point to an initialized MPI.
+ *
+ * \return         \c 0 if successful.
+ * \return         #MBEDTLS_ERR_MPI_NEGATIVE_VALUE if \p B is greater than \p A.
+ * \return         Another negative error code on different kinds of failure.
+ */
+int mbedtls_mpi_sub_abs( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi *B )
+{
+    size_t n, m; int r; /* r holds an int status code, so it must not be a size_t */
+    MPI_VALIDATE_RET( X );
+    MPI_VALIDATE_RET( A );
+    MPI_VALIDATE_RET( B );
+    if( X != A && !B->n )
+        return mbedtls_mpi_copy( X, A ); /* B has no limbs, so there is nothing to subtract */
+    for( n = B->n; n > 0; n-- )
+        if( B->p[n - 1] != 0 )
+            break;
+    if( n > A->n )
+        return MBEDTLS_ERR_MPI_NEGATIVE_VALUE; /* B has a nonzero limb above A's top limb */
+    if (X != A)
+    {
+        if (X->n < A->n) {
+            if ((r = mbedtls_mpi_grow(X, A->n))) return r;
+        } else if (X->n > A->n) {
+            mbedtls_mpi_zeroize(X->p + A->n, X->n - A->n);
+        }
+        if ((m = A->n - n))
+            memcpy(X->p + n, A->p + n, m * ciL);
+    }
+    /*
+     * X should always be positive as a result of unsigned subtractions.
+     */
+    X->s = 1;
+    if( mpi_sub_hlp( X->p, A->p, B->p, n ) ){
+        /* Propagate the carry to the first nonzero limb of X. */
+        for( ; n < A->n && A->p[n] == 0; n++ )
+            /* --X->p[n]; */
+            X->p[n] = A->p[n] - 1;
+        /* Every remaining limb of A was zero, so |A| < |B|. Test against
+         * A->n, not X->n: X may own more limbs than A when X != A. */
+        if( n == A->n )
+            return MBEDTLS_ERR_MPI_NEGATIVE_VALUE;
+        --X->p[n];
+    }
+    return( 0 );
 }
 
 static int mpi_cmp_abs(const mbedtls_mpi *X,
@@ -1026,310 +1378,17 @@ static int mpi_cmp_abs(const mbedtls_mpi *X,
     return 0;
 }
 
-/**
- * \brief          Compare the absolute values of two MPIs.
- *
- * \param X        The left-hand MPI. This must point to an initialized MPI.
- * \param Y        The right-hand MPI. This must point to an initialized MPI.
- *
- * \return         \c 1 if `|X|` is greater than `|Y|`.
- * \return         \c -1 if `|X|` is lesser than `|Y|`.
- * \return         \c 0 if `|X|` is equal to `|Y|`.
- */
-int mbedtls_mpi_cmp_abs(const mbedtls_mpi *X, const mbedtls_mpi *Y)
+static int mpi_sub_abs( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi *B, size_t n )
 {
-    size_t i, j;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(Y);
-    return mpi_cmp_abs(X, Y, &i, &j);
-}
-
-static int mpi_cmp_mpi(const mbedtls_mpi *X, const mbedtls_mpi *Y,
-                       size_t *Xn, size_t *Yn) {
-    size_t i, j;
-    i = mbedtls_mpi_limbs(X);
-    j = mbedtls_mpi_limbs(Y);
-    *Xn = i;
-    *Yn = j;
-    if (!i && !j) return 0;
-    if (i > j) return X->s;
-    if (j > i) return -Y->s;
-    if (X->s > 0 && Y->s < 0) return 1;
-    if (Y->s > 0 && X->s < 0) return -1;
-    for (; i > 0; i--) {
-        if (X->p[i - 1] > Y->p[i - 1]) return X->s;
-        if (X->p[i - 1] < Y->p[i - 1]) return -X->s;
-    }
-    return 0;
-}
-
-/**
- * \brief          Compare two MPIs.
- *
- * \param X        The left-hand MPI. This must point to an initialized MPI.
- * \param Y        The right-hand MPI. This must point to an initialized MPI.
- *
- * \return         \c 1 if \p X is greater than \p Y.
- * \return         \c -1 if \p X is lesser than \p Y.
- * \return         \c 0 if \p X is equal to \p Y.
- */
-int mbedtls_mpi_cmp_mpi(const mbedtls_mpi *X, const mbedtls_mpi *Y) {
-    size_t i, j;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(Y);
-    return mpi_cmp_mpi(X, Y, &i, &j);
-}
-
-/**
- * Decide if an integer is less than the other, without branches.
- *
- * \param x         First integer.
- * \param y         Second integer.
- *
- * \return          1 if \p x is less than \p y, 0 otherwise
- */
-static unsigned ct_lt_mpi_uint(const mbedtls_mpi_uint x,
-                               const mbedtls_mpi_uint y) {
-    mbedtls_mpi_uint ret;
-    mbedtls_mpi_uint cond;
-    /*
-     * Check if the most significant bits (MSB) of the operands are different.
-     */
-    cond = (x ^ y);
-    /*
-     * If the MSB are the same then the difference x-y will be negative (and
-     * have its MSB set to 1 during conversion to unsigned) if and only if x<y.
-     */
-    ret = (x - y) & ~cond;
-    /*
-     * If the MSB are different, then the operand with the MSB of 1 is the
-     * bigger. (That is if y has MSB of 1, then x<y is true and it is false if
-     * the MSB of y is 0.)
-     */
-    ret |= y & cond;
-    ret = ret >> (biL - 1);
-    return (unsigned)ret;
-}
-
-/**
- * \brief          Check if an MPI is less than the other in constant time.
- *
- * \param X        The left-hand MPI. This must point to an initialized MPI
- *                 with the same allocated length as Y.
- * \param Y        The right-hand MPI. This must point to an initialized MPI
- *                 with the same allocated length as X.
- * \param ret      The result of the comparison:
- *                 \c 1 if \p X is less than \p Y.
- *                 \c 0 if \p X is greater than or equal to \p Y.
- *
- * \return         0 on success.
- * \return         MBEDTLS_ERR_MPI_BAD_INPUT_DATA if the allocated length of
- *                 the two input MPIs is not the same.
- */
-int mbedtls_mpi_lt_mpi_ct(const mbedtls_mpi *X, const mbedtls_mpi *Y,
-                          unsigned *ret)
-{
-    size_t i;
-    /* The value of any of these variables is either 0 or 1 at all times. */
-    unsigned cond, done, X_is_negative, Y_is_negative;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(Y);
-    MPI_VALIDATE_RET(ret);
-    if (X->n != Y->n)
-        return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
-    /*
-     * Set sign_N to 1 if N >= 0, 0 if N < 0.
-     * We know that N->s == 1 if N >= 0 and N->s == -1 if N < 0.
-     */
-    X_is_negative = (X->s & 2) >> 1;
-    Y_is_negative = (Y->s & 2) >> 1;
-    /*
-     * If the signs are different, then the positive operand is the bigger.
-     * That is if X is negative (X_is_negative == 1), then X < Y is true and it
-     * is false if X is positive (X_is_negative == 0).
-     */
-    cond = (X_is_negative ^ Y_is_negative);
-    *ret = cond & X_is_negative;
-    /*
-     * This is a constant-time function. We might have the result, but we still
-     * need to go through the loop. Record if we have the result already.
-     */
-    done = cond;
-    for (i = X->n; i > 0; i--)
-    {
-        /*
-         * If Y->p[i - 1] < X->p[i - 1] then X < Y is true if and only if both
-         * X and Y are negative.
-         *
-         * Again even if we can make a decision, we just mark the result and
-         * the fact that we are done and continue looping.
-         */
-        cond = ct_lt_mpi_uint(Y->p[i - 1], X->p[i - 1]);
-        *ret |= cond & (1 - done) & X_is_negative;
-        done |= cond;
-        /*
-         * If X->p[i - 1] < Y->p[i - 1] then X < Y is true if and only if both
-         * X and Y are positive.
-         *
-         * Again even if we can make a decision, we just mark the result and
-         * the fact that we are done and continue looping.
-         */
-        cond = ct_lt_mpi_uint(X->p[i - 1], Y->p[i - 1]);
-        *ret |= cond & (1 - done) & (1 - X_is_negative);
-        done |= cond;
-    }
-    return 0;
-}
-
-/**
- * \brief          Compare an MPI with an integer.
- *
- * \param X        The left-hand MPI. This must point to an initialized MPI.
- * \param z        The integer value to compare \p X to.
- *
- * \return         \c 1 if \p X is greater than \p z.
- * \return         \c -1 if \p X is lesser than \p z.
- * \return         \c 0 if \p X is equal to \p z.
- */
-int mbedtls_mpi_cmp_int(const mbedtls_mpi *X, mbedtls_mpi_sint z)
-{
-    mbedtls_mpi Y;
-    mbedtls_mpi_uint p[1];
-    MPI_VALIDATE_RET(X);
-    *p = (z < 0) ? -z : z;
-    Y.s = (z < 0) ? -1 : 1;
-    Y.n = 1;
-    Y.p = p;
-    return mbedtls_mpi_cmp_mpi(X, &Y);
-}
-
-forceinline mbedtls_mpi_uint mpi_add_hlp(mbedtls_mpi_uint *d,
-                                         const mbedtls_mpi_uint *b,
-                                         size_t n)
-{
-    size_t i;
-    unsigned char cf;
-    mbedtls_mpi_uint c, t, *e;
-    e = d + n;
-    c = i = 0;
-#ifdef __x86_64__
-    for (; d + 4 <= e; d += 4, b += 4, c = cf)
-    {
-        asm("add\t%5,%1\n\t"
-            "adc\t%6,%2\n\t"
-            "adc\t%7,%3\n\t"
-            "adc\t%8,%4"
-            : "=@ccc"(cf), "+m"(d[0]), "+m"(d[1]), "+m"(d[2]), "+m"(d[3])
-            : "r"(b[0] + c), "r"(b[1]), "r"(b[2]), "r"(b[3])
-            : "cc");
-    }
-#endif
-    for (; d < e; ++d, ++b)
-        ADC(*d, *d, *b, c, c);
-    return c;
-}
-
-/**
- * Helper for mbedtls_mpi subtraction.
- *
- * Calculate d = a - b where d, a, and b have the same size.
- * This function operates modulo (2^ciL)^n and returns the carry
- * (1 if there was a wraparound, i.e. if `a < b`, and 0 otherwise).
- *
- * \param[out] d        Result of subtraction.
- * \param[in] a         Left operand.
- * \param[in] b         Right operand.
- * \param n             Number of limbs of \p a and \p b.
- * \return              1 if `d < s`.
- *                      0 if `d >= s`.
- */
-forceinline mbedtls_mpi_uint mpi_sub_hlp(mbedtls_mpi_uint *d,
-                                         const mbedtls_mpi_uint *a,
-                                         const mbedtls_mpi_uint *b, 
-                                         size_t n)
-{
-    size_t i;
-    unsigned char cf;
-    uint64_t q, r, s, t;
-    mbedtls_mpi_uint c, z, x, y;
-    cf = c = i = 0;
-#ifdef __x86_64__
-    for (; i + 4 <= n; i += 4, c = cf)
-    {
-        q = a[i + 0];
-        r = a[i + 1];
-        s = a[i + 2];
-        t = a[i + 3];
-        asm volatile("sub\t%5,%1\n\t"
-                     "sbb\t1*8(%6),%2\n\t"
-                     "sbb\t2*8(%6),%3\n\t"
-                     "sbb\t3*8(%6),%4"
-                     : "=@ccc"(cf), "+r"(q), "+r"(r), "+r"(s), "+r"(t)
-                     : "r"(b[i] + c), "r"(b + i)
-                     : "memory", "cc");
-        d[i + 0] = q;
-        d[i + 1] = r;
-        d[i + 2] = s;
-        d[i + 3] = t;
-    }
-#endif
-    for (; i < n; ++i)
-        SBB(d[i], a[i], b[i], c, c);
-    return c;
-}
-
-/**
- * \brief          Perform an unsigned addition of MPIs: X = |A| + |B|
- *
- * \param X        The destination MPI. This must point to an initialized MPI.
- * \param A        The first summand. This must point to an initialized MPI.
- * \param B        The second summand. This must point to an initialized MPI.
- *
- * \return         \c 0 if successful.
- * \return         #MBEDTLS_ERR_MPI_ALLOC_FAILED if a memory allocation failed.
- * \return         Another negative error code on different kinds of failure.
- */
-int mbedtls_mpi_add_abs(mbedtls_mpi *X, const mbedtls_mpi *A,
-                        const mbedtls_mpi *B)
-{
-    int ret = MBEDTLS_ERR_THIS_CORRUPTION;
-    size_t i, j;
-    unsigned char cf;
-    const mbedtls_mpi *T;
-    mbedtls_mpi_uint c, tmp;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(A);
-    MPI_VALIDATE_RET(B);
-    if (X == B) T = A, A = X, B = T;
-    if (X != A) MBEDTLS_MPI_CHK(mbedtls_mpi_copy(X, A));
-    X->s = 1; /* always positive b/c unsigned addition */
-    j = mbedtls_mpi_limbs(B);
-    MBEDTLS_MPI_CHK(mbedtls_mpi_grow(X, j));
-    c = mpi_add_hlp(X->p, B->p, j);
-    for (; c; ++j)
-    {
-        if (j >= X->n)
-            MBEDTLS_MPI_CHK(mbedtls_mpi_grow(X, j + 1));
-        X->p[j] += c;
-        c = X->p[j] < c;
-    }
-cleanup:
-    return ret;
-}
-
-static int mpi_sub_abs(mbedtls_mpi *X, const mbedtls_mpi *A,
-                       const mbedtls_mpi *B, size_t Bn)
-{
-    int ret;
-    size_t n, m;
-    unsigned char cf;
-    n = Bn;
-    if (n > A->n)
+    size_t m, r;
+    if( X != A && !B->n )
+        return mbedtls_mpi_copy( X, A ); /* wut */
+    if( n > A->n )
         return MBEDTLS_ERR_MPI_NEGATIVE_VALUE; /* B >= (2^ciL)^n > A */
     if (X != A)
     {
         if (X->n < A->n) {
-            if ((ret = mbedtls_mpi_grow(X, A->n))) return ret;
+            if ((r = mbedtls_mpi_grow(X, A->n))) return r;
         } else if (X->n > A->n) {
             mbedtls_mpi_zeroize(X->p + A->n, X->n - A->n);
         }
@@ -1340,43 +1399,18 @@ static int mpi_sub_abs(mbedtls_mpi *X, const mbedtls_mpi *A,
      * X should always be positive as a result of unsigned subtractions.
      */
     X->s = 1;
-    cf = mpi_sub_hlp(X->p, A->p, B->p, n);
-    if (cf)
-    {
+    if( mpi_sub_hlp( X->p, A->p, B->p, n ) ){
         /* Propagate the carry to the first nonzero limb of X. */
-        for (; n < A->n && !A->p[n]; n++) { /* --X->p[n]; */
+        for( ; n < A->n && A->p[n] == 0; n++ )
+            /* --X->p[n]; */
             X->p[n] = A->p[n] - 1;
-        }
         /* If we ran out of space for the carry, it means that the result
          * is negative. */
-        if (n == X->n)
+        if( n == X->n )
             return MBEDTLS_ERR_MPI_NEGATIVE_VALUE;
         --X->p[n];
     }
-    return 0;
-}
-
-/**
- * \brief          Perform an unsigned subtraction of MPIs: X = |A| - |B|
- *
- * \param X        The destination MPI. This must point to an initialized MPI.
- * \param A        The minuend. This must point to an initialized MPI.
- * \param B        The subtrahend. This must point to an initialized MPI.
- *
- * \return         \c 0 if successful.
- * \return         #MBEDTLS_ERR_MPI_NEGATIVE_VALUE if \p B is greater than \p A.
- * \return         Another negative error code on different kinds of failure.
- */
-int mbedtls_mpi_sub_abs(mbedtls_mpi *X, const mbedtls_mpi *A,
-                        const mbedtls_mpi *B)
-{
-    size_t n, m;
-    unsigned char cf;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(A);
-    MPI_VALIDATE_RET(B);
-    if (X != A && !B->n) return mbedtls_mpi_copy(X, A); /* wut */
-    return mpi_sub_abs(X, A, B, mbedtls_mpi_limbs(B));
+    return( 0 );
 }
 
 /**
@@ -1390,35 +1424,34 @@ int mbedtls_mpi_sub_abs(mbedtls_mpi *X, const mbedtls_mpi *A,
  * \return         #MBEDTLS_ERR_MPI_ALLOC_FAILED if a memory allocation failed.
  * \return         Another negative error code on different kinds of failure.
  */
-int mbedtls_mpi_add_mpi(mbedtls_mpi *X, const mbedtls_mpi *A,
-                        const mbedtls_mpi *B)
+int mbedtls_mpi_add_mpi( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi *B )
 {
     int ret, s;
     size_t i, j;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(A);
-    MPI_VALIDATE_RET(B);
+    MPI_VALIDATE_RET( X );
+    MPI_VALIDATE_RET( A );
+    MPI_VALIDATE_RET( B );
     s = A->s;
-    if (A->s * B->s < 0)
+    if( A->s * B->s < 0 ) /* signs differ: subtract the smaller magnitude from the larger */
     {
-        if (mpi_cmp_abs(A, B, &i, &j) >= 0)
+        if( mpi_cmp_abs( A, B, &i, &j ) >= 0 ) /* i, j: presumably limb counts of A and B - confirm */
         {
-            MBEDTLS_MPI_CHK(mpi_sub_abs(X, A, B, j));
-            X->s = s;
+            MBEDTLS_MPI_CHK( mpi_sub_abs( X, A, B, j ) );
+            X->s =  s;
         }
         else
         {
-            MBEDTLS_MPI_CHK(mpi_sub_abs(X, B, A, i));
+            MBEDTLS_MPI_CHK( mpi_sub_abs( X, B, A, i ) );
             X->s = -s;
         }
     }
     else
     {
-        MBEDTLS_MPI_CHK(mbedtls_mpi_add_abs(X, A, B));
+        MBEDTLS_MPI_CHK( mbedtls_mpi_add_abs( X, A, B ) ); /* same sign: add the magnitudes */
         X->s = s;
     }
 cleanup:
-    return ret;
+    return( ret );
 }
 
 /**
@@ -1432,60 +1465,58 @@ cleanup:
  * \return         #MBEDTLS_ERR_MPI_ALLOC_FAILED if a memory allocation failed.
  * \return         Another negative error code on different kinds of failure.
  */
-int mbedtls_mpi_sub_mpi(mbedtls_mpi *X, const mbedtls_mpi *A,
-                        const mbedtls_mpi *B)
+int mbedtls_mpi_sub_mpi( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi *B )
 {
     int ret, s;
     size_t i, j;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(A);
-    MPI_VALIDATE_RET(B);
+    MPI_VALIDATE_RET( X );
+    MPI_VALIDATE_RET( A );
+    MPI_VALIDATE_RET( B );
     s = A->s;
-    if (A->s * B->s > 0)
+    if( A->s * B->s > 0 ) /* same sign: subtract the smaller magnitude from the larger */
     {
-        if (mpi_cmp_abs(A, B, &i, &j) >= 0)
+        if( mpi_cmp_abs( A, B, &i, &j ) >= 0 ) /* i, j: presumably limb counts of A and B - confirm */
         {
-            MBEDTLS_MPI_CHK(mpi_sub_abs(X, A, B, j));
-            X->s = s;
+            MBEDTLS_MPI_CHK( mpi_sub_abs( X, A, B, j ) );
+            X->s =  s;
         }
         else
         {
-            MBEDTLS_MPI_CHK(mpi_sub_abs(X, B, A, i));
+            MBEDTLS_MPI_CHK( mpi_sub_abs( X, B, A, i ) );
             X->s = -s;
         }
     }
     else
     {
-        MBEDTLS_MPI_CHK(mbedtls_mpi_add_abs(X, A, B));
+        MBEDTLS_MPI_CHK( mbedtls_mpi_add_abs( X, A, B ) ); /* signs differ: add the magnitudes */
         X->s = s;
     }
 cleanup:
-    return ret;
+    return( ret );
 }
 
 /**
- * \brief          Performs signed addition of MPI and integer: X = A + b
+ * \brief          Perform a signed addition of an MPI and an integer: X = A + b
  *
  * \param X        The destination MPI. This must point to an initialized MPI.
  * \param A        The first summand. This must point to an initialized MPI.
  * \param b        The second summand.
  *
  * \return         \c 0 if successful.
- * \return         #MBEDTLS_ERR_MPI_ALLOC_FAILED if a allocation failed.
- * \return         Another negative error code on different kinds of
- * failure.
+ * \return         #MBEDTLS_ERR_MPI_ALLOC_FAILED if a memory allocation failed.
+ * \return         Another negative error code on different kinds of failure.
  */
-int mbedtls_mpi_add_int(mbedtls_mpi *X, const mbedtls_mpi *A,
-                        mbedtls_mpi_sint b) {
+int mbedtls_mpi_add_int( mbedtls_mpi *X, const mbedtls_mpi *A, mbedtls_mpi_sint b )
+{
     mbedtls_mpi _B;
     mbedtls_mpi_uint p[1];
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(A);
-    p[0] = (b < 0) ? -b : b;
-    _B.s = (b < 0) ? -1 : 1;
+    MPI_VALIDATE_RET( X );
+    MPI_VALIDATE_RET( A );
+    p[0] = ( b < 0 ) ? 0 - (mbedtls_mpi_uint) b : (mbedtls_mpi_uint) b; /* |b|; a plain -b overflows at the minimum b */
+    _B.s = ( b < 0 ) ? -1 : 1;
     _B.n = 1;
     _B.p = p;
-    return mbedtls_mpi_add_mpi(X, A, &_B);
+    return( mbedtls_mpi_add_mpi( X, A, &_B ) );
 }
 
 /**
@@ -1500,50 +1531,69 @@ int mbedtls_mpi_add_int(mbedtls_mpi *X, const mbedtls_mpi *A,
  * \return         #MBEDTLS_ERR_MPI_ALLOC_FAILED if a memory allocation failed.
  * \return         Another negative error code on different kinds of failure.
  */
-int mbedtls_mpi_sub_int(mbedtls_mpi *X, const mbedtls_mpi *A,
-                        mbedtls_mpi_sint b) {
+int mbedtls_mpi_sub_int( mbedtls_mpi *X, const mbedtls_mpi *A, mbedtls_mpi_sint b )
+{
     mbedtls_mpi _B;
     mbedtls_mpi_uint p[1];
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(A);
-    p[0] = (b < 0) ? -b : b;
-    _B.s = (b < 0) ? -1 : 1;
+    MPI_VALIDATE_RET( X );
+    MPI_VALIDATE_RET( A );
+    p[0] = ( b < 0 ) ? 0 - (mbedtls_mpi_uint) b : (mbedtls_mpi_uint) b; /* |b|; a plain -b overflows at the minimum b */
+    _B.s = ( b < 0 ) ? -1 : 1;
     _B.n = 1;
     _B.p = p;
-    return mbedtls_mpi_sub_mpi(X, A, &_B);
+    return( mbedtls_mpi_sub_mpi( X, A, &_B ) );
 }
 
 /*
  * Unsigned integer divide - double mbedtls_mpi_uint dividend, u1/u0, and
  * mbedtls_mpi_uint divisor, d
  */
-static inline mbedtls_mpi_uint mbedtls_int_div_int(mbedtls_mpi_uint u1,
-                                                   mbedtls_mpi_uint u0,
-                                                   mbedtls_mpi_uint d,
-                                                   mbedtls_mpi_uint *r)
+static mbedtls_mpi_uint mbedtls_int_div_int( mbedtls_mpi_uint u1,
+                                             mbedtls_mpi_uint u0,
+                                             mbedtls_mpi_uint d,
+                                             mbedtls_mpi_uint *r )
 {
-  if (d && u1 < d)
-  {
 #ifdef __x86_64__
-    mbedtls_mpi_uint quo, rem;
-    asm("div\t%2" : "=a"(quo), "=d"(rem) : "r"(d), "0"(u0), "1"(u1) : "cc");
-    if (r) *r = rem;
-    return quo;
-#elif defined(MBEDTLS_HAVE_UDBL)
-    mbedtls_t_udbl dividend, quotient;
-    dividend = (mbedtls_t_udbl)u1 << biL;
-    dividend |= (mbedtls_t_udbl)u0;
-    quotient = dividend / d;
-    if (quotient > ((mbedtls_t_udbl)1 << biL) - 1)
-      quotient = ((mbedtls_t_udbl)1 << biL) - 1;
-    if (r) *r = (mbedtls_mpi_uint)(dividend - (quotient * d));
-    return (mbedtls_mpi_uint)quotient;
+    if (d && u1 < d)
+    {
+        mbedtls_mpi_uint quo, rem;
+        asm("div\t%2" : "=a"(quo), "=d"(rem) : "r"(d), "0"(u0), "1"(u1) : "cc");
+        if (r) *r = rem;
+        return quo;
+    }
+    else
+    {
+        if (r) *r = ~0;
+        return ~0;
+    }
 #else
-    size_t s;
-    mbedtls_mpi_uint radix = (mbedtls_mpi_uint)1 << biH;
-    mbedtls_mpi_uint uint_halfword_mask = ((mbedtls_mpi_uint)1 << biH) - 1;
+#if defined(MBEDTLS_HAVE_UDBL)
+    mbedtls_t_udbl dividend, quotient;
+#else
+    const mbedtls_mpi_uint radix = (mbedtls_mpi_uint) 1 << biH;
+    const mbedtls_mpi_uint uint_halfword_mask = ( (mbedtls_mpi_uint) 1 << biH ) - 1;
     mbedtls_mpi_uint d0, d1, q0, q1, rAX, r0, quotient;
     mbedtls_mpi_uint u0_msw, u0_lsw;
+    size_t s;
+#endif
+    /*
+     * Check for overflow
+     */
+    if( 0 == d || u1 >= d )
+    {
+        if (r) *r = ~0;
+        return ( ~0 );
+    }
+#if defined(MBEDTLS_HAVE_UDBL)
+    dividend  = (mbedtls_t_udbl) u1 << biL;
+    dividend |= (mbedtls_t_udbl) u0;
+    quotient = dividend / d;
+    if( quotient > ( (mbedtls_t_udbl) 1 << biL ) - 1 )
+        quotient = ( (mbedtls_t_udbl) 1 << biL ) - 1;
+    if( r )
+        *r = (mbedtls_mpi_uint)( dividend - (quotient * d ) );
+    return (mbedtls_mpi_uint) quotient;
+#else
     /*
      * Algorithm D, Section 4.3.1 - The Art of Computer Programming
      *   Vol. 2 - Seminumerical Algorithms, Knuth
@@ -1551,11 +1601,11 @@ static inline mbedtls_mpi_uint mbedtls_int_div_int(mbedtls_mpi_uint u1,
     /*
      * Normalize the divisor, d, and dividend, u0, u1
      */
-    s = mbedtls_clz(d);
+    s = mbedtls_clz( d );
     d = d << s;
     u1 = u1 << s;
-    u1 |= (u0 >> (biL - s)) & (-(mbedtls_mpi_sint)s >> (biL - 1));
-    u0 = u0 << s;
+    u1 |= ( u0 >> ( biL - s ) ) & ( -(mbedtls_mpi_sint)s >> ( biL - 1 ) );
+    u0 =  u0 << s;
     d1 = d >> biH;
     d0 = d & uint_halfword_mask;
     u0_msw = u0 >> biH;
@@ -1565,33 +1615,27 @@ static inline mbedtls_mpi_uint mbedtls_int_div_int(mbedtls_mpi_uint u1,
      */
     q1 = u1 / d1;
     r0 = u1 - d1 * q1;
-    while (q1 >= radix || (q1 * d0 > radix * r0 + u0_msw))
+    while( q1 >= radix || ( q1 * d0 > radix * r0 + u0_msw ) )
     {
-      q1 -= 1;
-      r0 += d1;
-      if (r0 >= radix)
-          break;
+        q1 -= 1;
+        r0 += d1;
+        if ( r0 >= radix ) break;
     }
-    rAX = (u1 * radix) + (u0_msw - q1 * d);
+    rAX = ( u1 * radix ) + ( u0_msw - q1 * d );
     q0 = rAX / d1;
     r0 = rAX - q0 * d1;
-    while (q0 >= radix || (q0 * d0 > radix * r0 + u0_lsw))
+    while( q0 >= radix || ( q0 * d0 > radix * r0 + u0_lsw ) )
     {
-      q0 -= 1;
-      r0 += d1;
-      if (r0 >= radix)
-          break;
+        q0 -= 1;
+        r0 += d1;
+        if ( r0 >= radix ) break;
     }
-    if (r) *r = (rAX * radix + u0_lsw - q0 * d) >> s;
+    if (r)
+        *r = ( rAX * radix + u0_lsw - q0 * d ) >> s;
     quotient = q1 * radix + q0;
     return quotient;
 #endif
-  }
-  else
-  {
-    if (r) *r = ~0;
-    return ~0;
-  }
+#endif
 }
 
 static inline void Multiply2x1(uint64_t a[3], uint64_t b) {
@@ -1686,10 +1730,10 @@ int mbedtls_mpi_div_mpi(mbedtls_mpi *Q, mbedtls_mpi *R, const mbedtls_mpi *A,
     n = X.n - 1;
     t = Y.n - 1;
     MBEDTLS_MPI_CHK(mbedtls_mpi_shift_l(&Y, biL * (n - t)));
-    while (mpi_cmp_abs(&X, &Y, &Xn, &Yn) >= 0)
+    while (mbedtls_mpi_cmp_abs(&X, &Y) >= 0)
     {
         Z.p[n - t]++;
-        MBEDTLS_MPI_CHK(mpi_sub_abs(&X, &X, &Y, Yn));
+        MBEDTLS_MPI_CHK(mbedtls_mpi_sub_abs(&X, &X, &Y));
     }
     mbedtls_mpi_shift_r(&Y, biL * (n - t));
     for (i = n; i > t; i--)
@@ -1758,17 +1802,18 @@ cleanup:
  * \return         #MBEDTLS_ERR_MPI_DIVISION_BY_ZERO if \p b equals zero.
  * \return         Another negative error code on different kinds of failure.
  */
-int mbedtls_mpi_div_int(mbedtls_mpi *Q, mbedtls_mpi *R, const mbedtls_mpi *A,
-                        mbedtls_mpi_sint b)
+int mbedtls_mpi_div_int( mbedtls_mpi *Q, mbedtls_mpi *R,
+                         const mbedtls_mpi *A,
+                         mbedtls_mpi_sint b )
 {
     mbedtls_mpi _B;
     mbedtls_mpi_uint p[1];
-    MPI_VALIDATE_RET(A);
-    p[0] = (b < 0) ? -b : b;
-    _B.s = (b < 0) ? -1 : 1;
+    MPI_VALIDATE_RET( A );
+    p[0] = ( b < 0 ) ? ( 0 - (mbedtls_mpi_uint) b ) : (mbedtls_mpi_uint) b; /* |b|, negated as unsigned: signed -b overflows for the most negative b */
+    _B.s = ( b < 0 ) ? -1 : 1; /* the sign of b is carried separately from its magnitude */
     _B.n = 1;
     _B.p = p;
-    return mbedtls_mpi_div_mpi(Q, R, A, &_B);
+    return( mbedtls_mpi_div_mpi( Q, R, A, &_B ) ); /* divide by a one-limb MPI that wraps b on the stack */
 }
 
 /**
@@ -1786,22 +1831,23 @@ int mbedtls_mpi_div_int(mbedtls_mpi *Q, mbedtls_mpi *R, const mbedtls_mpi *A,
  * \return         #MBEDTLS_ERR_MPI_DIVISION_BY_ZERO if \p B equals zero.
  * \return         #MBEDTLS_ERR_MPI_NEGATIVE_VALUE if \p B is negative.
  * \return         Another negative error code on different kinds of failure.
+ *
  */
-int mbedtls_mpi_mod_mpi(mbedtls_mpi *R, const mbedtls_mpi *A,
-                        const mbedtls_mpi *B)
+int mbedtls_mpi_mod_mpi( mbedtls_mpi *R, const mbedtls_mpi *A, const mbedtls_mpi *B )
 {
-    size_t i, j;
     int ret = MBEDTLS_ERR_THIS_CORRUPTION;
-    MPI_VALIDATE_RET(R);
-    MPI_VALIDATE_RET(A);
-    MPI_VALIDATE_RET(B);
-    if (B->s < 0) return MBEDTLS_ERR_MPI_NEGATIVE_VALUE;
-    MBEDTLS_MPI_CHK(mbedtls_mpi_div_mpi(NULL, R, A, B));
-    while (R->s < 0) MBEDTLS_MPI_CHK(mbedtls_mpi_add_mpi(R, R, B));
-    while (mbedtls_mpi_cmp_mpi(R, B) >= 0)
-        MBEDTLS_MPI_CHK(mbedtls_mpi_sub_mpi(R, R, B));
+    MPI_VALIDATE_RET( R );
+    MPI_VALIDATE_RET( A );
+    MPI_VALIDATE_RET( B );
+    if( mbedtls_mpi_cmp_int( B, 0 ) < 0 ) /* a negative modulus is rejected up front */
+        return( MBEDTLS_ERR_MPI_NEGATIVE_VALUE );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_div_mpi( NULL, R, A, B ) ); /* only the remainder is wanted: quotient pointer is NULL */
+    while( mbedtls_mpi_cmp_int( R, 0 ) < 0 ) /* add B until R is no longer negative */
+      MBEDTLS_MPI_CHK( mbedtls_mpi_add_mpi( R, R, B ) );
+    while( mbedtls_mpi_cmp_mpi( R, B ) >= 0 ) /* subtract B until R < B */
+      MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mpi( R, R, B ) );
 cleanup:
-    return ret;
+    return( ret ); /* presumably set by MBEDTLS_MPI_CHK (macro not shown in this hunk) */
 }
 
 /**
@@ -1820,63 +1866,64 @@ cleanup:
  * \return         #MBEDTLS_ERR_MPI_NEGATIVE_VALUE if \p b is negative.
  * \return         Another negative error code on different kinds of failure.
  */
-int mbedtls_mpi_mod_int( mbedtls_mpi_uint *r, const mbedtls_mpi *A,
-                         mbedtls_mpi_sint b )
+int mbedtls_mpi_mod_int( mbedtls_mpi_uint *r, const mbedtls_mpi *A, mbedtls_mpi_sint b )
 {
     size_t i;
     mbedtls_mpi_uint x, y, z;
-    MPI_VALIDATE_RET(r);
-    MPI_VALIDATE_RET(A);
-    if (!b)
-        return MBEDTLS_ERR_MPI_DIVISION_BY_ZERO;
-    if (b < 0)
-        return MBEDTLS_ERR_MPI_NEGATIVE_VALUE;
+    MPI_VALIDATE_RET( r );
+    MPI_VALIDATE_RET( A );
+    if( b == 0 )
+        return( MBEDTLS_ERR_MPI_DIVISION_BY_ZERO );
+    if( b < 0 )
+        return( MBEDTLS_ERR_MPI_NEGATIVE_VALUE );
     /*
      * handle trivial cases
      */
-    if (b == 1)
+    if( b == 1 || A->n == 0 ) /* anything mod 1 is 0; an A with no limbs is 0 and has no p[0] for the b == 2 path to read */
     {
         *r = 0;
-        return 0;
+        return( 0 );
     }
-    if (b == 2)
+    if( b == 2 ) /* parity of the lowest limb; A->n >= 1 is guaranteed by the check above */
     {
         *r = A->p[0] & 1;
-        return 0;
+        return( 0 );
     }
     /*
      * general case
      */
-    for (i = A->n, y = 0; i > 0; i--)
+    for( i = A->n, y = 0; i > 0; i-- ) /* walk limbs from most to least significant; y is the running remainder */
     {
-        x = A->p[i - 1];
-        y = (y << biH) | (x >> biH);
-        z = y / b;
+        x  = A->p[i - 1];
+        y  = ( y << biH ) | ( x >> biH ); /* append the upper biH bits of the limb; NOTE(review): y << biH can drop bits once b >= 2^biH - confirm callers keep b small */
+        z  = y / b;
         y -= z * b;
         x <<= biH;
-        y = (y << biH) | (x >> biH);
-        z = y / b;
+        y  = ( y << biH ) | ( x >> biH ); /* x was shifted left by biH above, so this appends the remaining lower bits */
+        z  = y / b;
         y -= z * b;
     }
     /*
      * If A is negative, then the current y represents a negative value.
      * Flipping it to the positive side.
      */
-    if (A->s < 0 && y) y = b - y;
+    if( A->s < 0 && y != 0 )
+        y = b - y;
     *r = y;
-    return 0;
+    return( 0 );
 }
 
 /*
  * Fast Montgomery initialization (thanks to Tom St Denis)
  */
-static void mpi_montg_init(mbedtls_mpi_uint *mm, const mbedtls_mpi *N)
+static void mpi_montg_init( mbedtls_mpi_uint *mm, const mbedtls_mpi *N ) /* writes the per-modulus Montgomery constant to *mm; only N->p[0] is read */
 {
     mbedtls_mpi_uint x, m0 = N->p[0];
     unsigned int i;
-    x = m0;
-    x += ((m0 + 2) & 4) << 1;
-    for (i = biL; i >= 8; i /= 2) x *= 2 - m0 * x;
+    x  = m0;
+    x += ( ( m0 + 2 ) & 4 ) << 1; /* seed: for odd m0 this makes m0 * x == 1 modulo 16 */
+    for( i = biL; i >= 8; i /= 2 ) /* i only counts the rounds (biL, biL/2, ... down to 8) */
+        x *= ( 2 - ( m0 * x ) ); /* Newton step: doubles the number of low bits in which m0 * x == 1 */
     *mm = -x;
 }
 
@@ -1903,42 +1950,40 @@ static void mpi_montg_init(mbedtls_mpi_uint *mm, const mbedtls_mpi *N)
  *                      Note that unlike the usual convention in the library
  *                      for `const mbedtls_mpi*`, the content of T can change.
  */
-static void mpi_montmul(mbedtls_mpi *A, const mbedtls_mpi *B,
-                        const mbedtls_mpi *N, mbedtls_mpi_uint mm,
-                        const mbedtls_mpi *T)
+static void mpi_montmul( mbedtls_mpi *A, const mbedtls_mpi *B, const mbedtls_mpi *N, mbedtls_mpi_uint mm,
+                         const mbedtls_mpi *T )
 {
     size_t i, n, m;
     mbedtls_mpi_uint u0, u1, *d, *Ap, *Bp, *Np;
-    mbedtls_mpi_zeroize(T->p, T->n);
+    mbedtls_platform_zeroize( T->p, T->n * ciL );
     d = T->p;
     n = N->n;
-    m = (B->n < n) ? B->n : n;
+    m = ( B->n < n ) ? B->n : n;
     Ap = A->p;
     Bp = B->p;
     Np = N->p;
-    for (i = 0; i < n; i++)
+    for( i = 0; i < n; i++ )
     {
         /*
          * T = (T + u0*B + u1*N) / 2^biL
          */
         u0 = Ap[i];
-        u1 = (d[0] + u0 * Bp[0]) * mm;
-        mbedtls_mpi_mul_hlp(m, Bp, d, u0);
-        mbedtls_mpi_mul_hlp(n, Np, d, u1);
-        *d++ = u0;
-        d[n + 1] = 0;
+        u1 = ( d[0] + u0 * Bp[0] ) * mm;
+        mbedtls_mpi_mul_hlp( m, Bp, d, u0 );
+        mbedtls_mpi_mul_hlp( n, Np, d, u1 );
+        *d++ = u0; d[n + 1] = 0;
     }
     /* At this point, d is either the desired result or the desired result
      * plus N. We now potentially subtract N, avoiding leaking whether the
      * subtraction is performed through side channels. */
     /* Copy the n least significant limbs of d to A, so that
      * A = d if d < N (recall that N has n limbs). */
-    memcpy(Ap, d, n * ciL);
+    memcpy( Ap, d, n * ciL );
     /* If d >= N then we want to set A to d - N. To prevent timing attacks,
      * do the calculation without using conditional tests. */
     /* Set d to d0 + (2^biL)^n - N where d0 is the current value of d. */
     d[n] += 1;
-    d[n] -= mpi_sub_hlp(d, d, Np, n);
+    d[n] -= mpi_sub_hlp( d, d, Np, n );
     /* If d0 < N then d < (2^biL)^n
      * so d[n] == 0 and we want to keep A as it is.
      * If d0 >= N then d >= (2^biL)^n, and d <= (2^biL)^n + N < 2 * (2^biL)^n
@@ -1955,14 +2000,14 @@ static void mpi_montmul(mbedtls_mpi *A, const mbedtls_mpi *B,
  *
  * See mpi_montmul() regarding constraints and guarantees on the parameters.
  */
-static void mpi_montred(mbedtls_mpi *A, const mbedtls_mpi *N,
-                        mbedtls_mpi_uint mm, const mbedtls_mpi *T)
+static void mpi_montred( mbedtls_mpi *A, const mbedtls_mpi *N,
+                         mbedtls_mpi_uint mm, const mbedtls_mpi *T )
 {
-    mbedtls_mpi U;
     mbedtls_mpi_uint z = 1;
-    U.n = U.s = (int)z;
+    mbedtls_mpi U; /* throwaway MPI whose limb storage is the local z */
+    U.n = U.s = (int) z; /* one limb, sign +1 */
     U.p = &z;
-    mpi_montmul(A, &U, N, mm, T);
+    mpi_montmul( A, &U, N, mm, T ); /* Montgomery-multiply A by the constant 1 */
 }
 
 /**
@@ -1991,127 +2036,129 @@ static void mpi_montred(mbedtls_mpi *A, const mbedtls_mpi *N,
  * \return         Another negative error code on different kinds of failures.
  *
  */
-int mbedtls_mpi_exp_mod(mbedtls_mpi *X, const mbedtls_mpi *A,
-                        const mbedtls_mpi *E, const mbedtls_mpi *N,
-                        mbedtls_mpi *_RR)
+int mbedtls_mpi_exp_mod( mbedtls_mpi *X, const mbedtls_mpi *A,
+                         const mbedtls_mpi *E, const mbedtls_mpi *N,
+                         mbedtls_mpi *_RR )
 {
     int ret = MBEDTLS_ERR_THIS_CORRUPTION;
     size_t wbits, wsize, one = 1;
     size_t i, j, nblimbs;
     size_t bufsize, nbits;
     mbedtls_mpi_uint ei, mm, state;
-    mbedtls_mpi RR, T, W[1 << MBEDTLS_MPI_WINDOW_SIZE], Apos;
+    mbedtls_mpi RR, T, W[ 1 << MBEDTLS_MPI_WINDOW_SIZE ], Apos;
     int neg;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(A);
-    MPI_VALIDATE_RET(E);
-    MPI_VALIDATE_RET(N);
-    if (mbedtls_mpi_cmp_int(N, 0) <= 0 || !(N->p[0] & 1))
-        return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
-    if (E->s < 0)
-        return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
-    if (mbedtls_mpi_bitlen(E) > MBEDTLS_MPI_MAX_BITS ||
-        mbedtls_mpi_bitlen(N) > MBEDTLS_MPI_MAX_BITS)
-        return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
+    MPI_VALIDATE_RET( X );
+    MPI_VALIDATE_RET( A );
+    MPI_VALIDATE_RET( E );
+    MPI_VALIDATE_RET( N );
+    if( mbedtls_mpi_cmp_int( N, 0 ) <= 0 || ( N->p[0] & 1 ) == 0 )
+        return( MBEDTLS_ERR_MPI_BAD_INPUT_DATA );
+    if( mbedtls_mpi_cmp_int( E, 0 ) < 0 )
+        return( MBEDTLS_ERR_MPI_BAD_INPUT_DATA );
+    if( mbedtls_mpi_bitlen( E ) > MBEDTLS_MPI_MAX_BITS ||
+        mbedtls_mpi_bitlen( N ) > MBEDTLS_MPI_MAX_BITS )
+        return ( MBEDTLS_ERR_MPI_BAD_INPUT_DATA );
     /*
      * Init temps and window size
      */
-    mpi_montg_init(&mm, N);
-    mbedtls_mpi_init(&RR);
-    mbedtls_mpi_init(&T);
-    mbedtls_mpi_init(&Apos);
-    mbedtls_platform_zeroize(W, sizeof(W));
-    i = mbedtls_mpi_bitlen(E);
-    wsize = (i > 671) ? 6 : (i > 239) ? 5 : (i > 79) ? 4 : (i > 23) ? 3 : 1;
-#if (MBEDTLS_MPI_WINDOW_SIZE < 6)
-    if (wsize > MBEDTLS_MPI_WINDOW_SIZE) wsize = MBEDTLS_MPI_WINDOW_SIZE;
+    mpi_montg_init( &mm, N );
+    mbedtls_mpi_init( &RR ); mbedtls_mpi_init( &T );
+    mbedtls_mpi_init( &Apos );
+    mbedtls_platform_zeroize( W, sizeof( W ) );
+    i = mbedtls_mpi_bitlen( E );
+    wsize = ( i > 671 ) ? 6 : ( i > 239 ) ? 5 :
+            ( i >  79 ) ? 4 : ( i >  23 ) ? 3 : 1;
+#if( MBEDTLS_MPI_WINDOW_SIZE < 6 )
+    if( wsize > MBEDTLS_MPI_WINDOW_SIZE )
+        wsize = MBEDTLS_MPI_WINDOW_SIZE;
 #endif
     j = N->n + 1;
-    MBEDTLS_MPI_CHK(mbedtls_mpi_grow(X, j));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_grow(&W[1], j));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_grow(&T, j * 2));
+    MBEDTLS_MPI_CHK( mbedtls_mpi_grow( X, j ) );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_grow( &W[1],  j ) );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_grow( &T, j * 2 ) );
     /*
      * Compensate for negative A (and correct at the end)
      */
-    neg = (A->s == -1);
-    if (neg)
+    neg = ( A->s == -1 );
+    if( neg )
     {
-        MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&Apos, A));
+        MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &Apos, A ) );
         Apos.s = 1;
         A = &Apos;
     }
     /*
      * If 1st call, pre-compute R^2 mod N
      */
-    if (!_RR || !_RR->p)
+    if( _RR == NULL || _RR->p == NULL )
     {
-        MBEDTLS_MPI_CHK(mbedtls_mpi_lset(&RR, 1));
-        MBEDTLS_MPI_CHK(mbedtls_mpi_shift_l(&RR, N->n * 2 * biL));
-        MBEDTLS_MPI_CHK(mbedtls_mpi_mod_mpi(&RR, &RR, N));
-        if (_RR) memcpy(_RR, &RR, sizeof(mbedtls_mpi));
+        MBEDTLS_MPI_CHK( mbedtls_mpi_lset( &RR, 1 ) );
+        MBEDTLS_MPI_CHK( mbedtls_mpi_shift_l( &RR, N->n * 2 * biL ) );
+        MBEDTLS_MPI_CHK( mbedtls_mpi_mod_mpi( &RR, &RR, N ) );
+        if( _RR )
+            memcpy( _RR, &RR, sizeof( mbedtls_mpi ) );
     }
     else
-    {
-        memcpy(&RR, _RR, sizeof(mbedtls_mpi));
-    }
+        memcpy( &RR, _RR, sizeof( mbedtls_mpi ) );
     /*
      * W[1] = A * R^2 * R^-1 mod N = A * R mod N
      */
-    if (mbedtls_mpi_cmp_mpi(A, N) >= 0)
-        MBEDTLS_MPI_CHK(mbedtls_mpi_mod_mpi(&W[1], A, N));
+    if( mbedtls_mpi_cmp_mpi( A, N ) >= 0 )
+        MBEDTLS_MPI_CHK( mbedtls_mpi_mod_mpi( &W[1], A, N ) );
     else
-        MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&W[1], A));
-    mpi_montmul(&W[1], &RR, N, mm, &T);
+        MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &W[1], A ) );
+    mpi_montmul( &W[1], &RR, N, mm, &T );
     /*
      * X = R^2 * R^-1 mod N = R mod N
      */
-    MBEDTLS_MPI_CHK(mbedtls_mpi_copy(X, &RR));
-    mpi_montred(X, N, mm, &T);
-    if (wsize > 1)
+    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( X, &RR ) );
+    mpi_montred( X, N, mm, &T );
+    if( wsize > 1 )
     {
         /*
          * W[1 << (wsize - 1)] = W[1] ^ (wsize - 1)
          */
-        j = one << (wsize - 1);
-        MBEDTLS_MPI_CHK(mbedtls_mpi_grow(&W[j], N->n + 1));
-        MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&W[j], &W[1]));
-        for (i = 0; i < wsize - 1; i++)
-            mpi_montmul(&W[j], &W[j], N, mm, &T);
+        j =  one << ( wsize - 1 );
+        MBEDTLS_MPI_CHK( mbedtls_mpi_grow( &W[j], N->n + 1 ) );
+        MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &W[j], &W[1]    ) );
+        for( i = 0; i < wsize - 1; i++ )
+            mpi_montmul( &W[j], &W[j], N, mm, &T );
         /*
          * W[i] = W[i - 1] * W[1]
          */
-        for (i = j + 1; i < (one << wsize); i++)
+        for( i = j + 1; i < ( one << wsize ); i++ )
         {
-            MBEDTLS_MPI_CHK(mbedtls_mpi_grow(&W[i], N->n + 1));
-            MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&W[i], &W[i - 1]));
-            mpi_montmul(&W[i], &W[1], N, mm, &T);
+            MBEDTLS_MPI_CHK( mbedtls_mpi_grow( &W[i], N->n + 1 ) );
+            MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &W[i], &W[i - 1] ) );
+            mpi_montmul( &W[i], &W[1], N, mm, &T );
         }
     }
     nblimbs = E->n;
     bufsize = 0;
-    nbits = 0;
-    wbits = 0;
-    state = 0;
-    while (1)
+    nbits   = 0;
+    wbits   = 0;
+    state   = 0;
+    while( 1 )
     {
-        if (!bufsize)
+        if( bufsize == 0 )
         {
-            if (!nblimbs) break;
+            if( nblimbs == 0 )
+                break;
             nblimbs--;
-            bufsize = sizeof(mbedtls_mpi_uint) << 3;
+            bufsize = sizeof( mbedtls_mpi_uint ) << 3;
         }
         bufsize--;
         ei = (E->p[nblimbs] >> bufsize) & 1;
         /*
          * skip leading 0s
          */
-        if (ei == 0 && state == 0) continue;
-        if (ei == 0 && state == 1)
+        if( ei == 0 && state == 0 )
+            continue;
+        if( ei == 0 && state == 1 )
         {
             /*
              * out of window, square X
              */
-            mpi_montmul(X, X, N, mm, &T);
+            mpi_montmul( X, X, N, mm, &T );
             continue;
         }
         /*
@@ -2119,18 +2166,18 @@ int mbedtls_mpi_exp_mod(mbedtls_mpi *X, const mbedtls_mpi *A,
          */
         state = 2;
         nbits++;
-        wbits |= (ei << (wsize - nbits));
-        if (nbits == wsize)
+        wbits |= ( ei << ( wsize - nbits ) );
+        if( nbits == wsize )
         {
             /*
              * X = X^wsize R^-1 mod N
              */
-            for (i = 0; i < wsize; i++)
-                mpi_montmul(X, X, N, mm, &T);
+            for( i = 0; i < wsize; i++ )
+                mpi_montmul( X, X, N, mm, &T );
             /*
              * X = X * W[wbits] R^-1 mod N
              */
-            mpi_montmul(X, &W[wbits], N, mm, &T);
+            mpi_montmul( X, &W[wbits], N, mm, &T );
             state--;
             nbits = 0;
             wbits = 0;
@@ -2139,47 +2186,29 @@ int mbedtls_mpi_exp_mod(mbedtls_mpi *X, const mbedtls_mpi *A,
     /*
      * process the remaining bits
      */
-    for (i = 0; i < nbits; i++)
+    for( i = 0; i < nbits; i++ )
     {
-        mpi_montmul(X, X, N, mm, &T);
+        mpi_montmul( X, X, N, mm, &T );
         wbits <<= 1;
-        if ((wbits & (one << wsize)))
-            mpi_montmul(X, &W[1], N, mm, &T);
+        if( ( wbits & ( one << wsize ) ) != 0 )
+            mpi_montmul( X, &W[1], N, mm, &T );
     }
     /*
      * X = A^E * R * R^-1 mod N = A^E mod N
      */
-    mpi_montred(X, N, mm, &T);
-    if (neg && E->n && (E->p[0] & 1))
+    mpi_montred( X, N, mm, &T );
+    if( neg && E->n != 0 && ( E->p[0] & 1 ) != 0 )
     {
         X->s = -1;
-        MBEDTLS_MPI_CHK(mbedtls_mpi_add_mpi(X, N, X));
+        MBEDTLS_MPI_CHK( mbedtls_mpi_add_mpi( X, N, X ) );
     }
 cleanup:
-    for (i = (one << (wsize - 1)); i < (one << wsize); i++)
-        mbedtls_mpi_free(&W[i]);
-    mbedtls_mpi_free(&W[1]);
-    mbedtls_mpi_free(&T);
-    mbedtls_mpi_free(&Apos);
-    if (!_RR || !_RR->p)
-        mbedtls_mpi_free(&RR);
-    return ret;
-}
-
-static inline int Compare(const mbedtls_mpi *X,
-                          const mbedtls_mpi *Y,
-                          size_t i,
-                          size_t j)
-{
-    if (!i && !j) return 0;
-    if (i > j) return 1;
-    if (j > i) return -1;
-    for (; i > 0; i--)
-    {
-        if (X->p[i - 1] > Y->p[i - 1]) return 1;
-        if (X->p[i - 1] < Y->p[i - 1]) return -1;
-    }
-    return 0;
+    for( i = ( one << ( wsize - 1 ) ); i < ( one << wsize ); i++ )
+        mbedtls_mpi_free( &W[i] );
+    mbedtls_mpi_free( &W[1] ); mbedtls_mpi_free( &T ); mbedtls_mpi_free( &Apos );
+    if( _RR == NULL || _RR->p == NULL )
+        mbedtls_mpi_free( &RR );
+    return( ret );
 }
 
 /**
@@ -2193,53 +2222,53 @@ static inline int Compare(const mbedtls_mpi *X,
  * \return         #MBEDTLS_ERR_MPI_ALLOC_FAILED if a memory allocation failed.
  * \return         Another negative error code on different kinds of failure.
  */
-int mbedtls_mpi_gcd(mbedtls_mpi *G, const mbedtls_mpi *A,
-                    const mbedtls_mpi *B)
+int mbedtls_mpi_gcd( mbedtls_mpi *G, const mbedtls_mpi *A, const mbedtls_mpi *B )
 {
     int ret = MBEDTLS_ERR_THIS_CORRUPTION;
     mbedtls_mpi TA, TB;
     size_t lz, lzt, i, j;
-    MPI_VALIDATE_RET(G);
-    MPI_VALIDATE_RET(A);
-    MPI_VALIDATE_RET(B);
-    mbedtls_mpi_init(&TA);
-    mbedtls_mpi_init(&TB);
-    MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&TA, A));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&TB, B));
-    lz = mbedtls_mpi_lsb(&TA);
-    lzt = mbedtls_mpi_lsb(&TB);
-    if (lzt < lz) lz = lzt;
-    mbedtls_mpi_shift_r(&TA, lz);
-    mbedtls_mpi_shift_r(&TB, lz);
+    MPI_VALIDATE_RET( G );
+    MPI_VALIDATE_RET( A );
+    MPI_VALIDATE_RET( B );
+    mbedtls_mpi_init( &TA ); mbedtls_mpi_init( &TB );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &TA, A ) ); /* work on private copies; A and B are never modified */
+    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &TB, B ) );
+    lz = mbedtls_mpi_lsb( &TA ); /* presumably the index of the lowest set bit (helper not shown) */
+    lzt = mbedtls_mpi_lsb( &TB );
+    if( lzt < lz ) /* keep the smaller of the two counts */
+        lz = lzt;
+    MBEDTLS_MPI_CHK( mbedtls_mpi_shift_r( &TA, lz ) ); /* drop the shared low bits; they are shifted back into the result at the end */
+    MBEDTLS_MPI_CHK( mbedtls_mpi_shift_r( &TB, lz ) );
     TA.s = TB.s = 1;
-    i = mbedtls_mpi_bitlen(&TA);
-    j = mbedtls_mpi_bitlen(&TB);
-    while (!mbedtls_mpi_is_zero(&TA))
+    while( !mbedtls_mpi_is_zero( &TA ) ) /* NOTE(review): with B == 0 this loop appears to shrink TA to 0 and yield G == 0 rather than |A| - confirm */
     {
-        mbedtls_mpi_shift_r(&TA, mbedtls_mpi_lsb(&TA));
-        mbedtls_mpi_shift_r(&TB, mbedtls_mpi_lsb(&TB));
-        if (mpi_cmp_abs(&TA, &TB, &i, &j) >= 0)
+        MBEDTLS_MPI_CHK( mbedtls_mpi_shift_r( &TA, mbedtls_mpi_lsb( &TA ) ) ); /* strip low zero bits from each operand */
+        MBEDTLS_MPI_CHK( mbedtls_mpi_shift_r( &TB, mbedtls_mpi_lsb( &TB ) ) );
+        if( mpi_cmp_abs( &TA, &TB, &i, &j ) >= 0 ) /* i and j are not initialized here; presumably mpi_cmp_abs fills them in - TODO confirm */
         {
-            MBEDTLS_MPI_CHK(mpi_sub_abs(&TA, &TA, &TB, j));
-            mbedtls_mpi_shift_r(&TA, 1);
+            MBEDTLS_MPI_CHK( mpi_sub_abs( &TA, &TA, &TB, j ) ); /* TA -= TB */
+            ShiftRight( TA.p, TA.n, 1 ); /* halve TA directly on its limb array (was mbedtls_mpi_shift_r) */
         }
         else
         {
-            MBEDTLS_MPI_CHK(mpi_sub_abs(&TB, &TB, &TA, i));
-            mbedtls_mpi_shift_r(&TB, 1);
+            MBEDTLS_MPI_CHK( mpi_sub_abs( &TB, &TB, &TA, i ) ); /* TB -= TA */
+            ShiftRight( TB.p, TB.n, 1 ); /* halve TB directly on its limb array (was mbedtls_mpi_shift_r) */
         }
     }
-    MBEDTLS_MPI_CHK(mbedtls_mpi_shift_l(&TB, lz));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_copy(G, &TB));
+    MBEDTLS_MPI_CHK( mbedtls_mpi_shift_l( &TB, lz ) ); /* put back the lz bits removed before the loop */
+    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( G, &TB ) );
 cleanup:
-    mbedtls_mpi_free(&TA);
-    mbedtls_mpi_free(&TB);
-    return ret;
+    mbedtls_mpi_free( &TA ); mbedtls_mpi_free( &TB ); /* temporaries are released on success and on error alike */
+    return( ret );
 }
 
 /**
  * \brief          Fill an MPI with a number of random bytes.
  *
+ * Use a temporary bytes representation to make sure the result is the
+ * same regardless of the platform endianness (useful when f_rng is
+ * actually deterministic, eg for tests).
+ *
  * \param X        The destination MPI. This must point to an initialized MPI.
  * \param size     The number of random bytes to generate.
  * \param f_rng    The RNG function to use. This must not be \c NULL.
@@ -2254,23 +2283,23 @@ cleanup:
  *                 as a big-endian representation of an MPI; this can
  *                 be relevant in applications like deterministic ECDSA.
  */
-int mbedtls_mpi_fill_random(mbedtls_mpi *X, size_t size,
-                            int (*f_rng)(void *, unsigned char *, size_t),
-                            void *p_rng)
+int mbedtls_mpi_fill_random( mbedtls_mpi *X, size_t size,
+                             int (*f_rng)(void *, unsigned char *, size_t),
+                             void *p_rng )
 {
     int ret = MBEDTLS_ERR_THIS_CORRUPTION;
-    size_t const limbs = CHARS_TO_LIMBS(size);
-    size_t const overhead = (limbs * ciL) - size;
+    size_t const limbs = CHARS_TO_LIMBS( size ); /* presumably the limb count needed for size bytes (macro not shown) */
+    size_t const overhead = ( limbs * ciL ) - size; /* bytes of the limb buffer that the random data does not cover */
     unsigned char *Xp;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(f_rng);
-    MBEDTLS_MPI_CHK(mbedtls_mpi_resize(X, limbs));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_lset(X, 0));
-    Xp = (unsigned char *)X->p;
-    MBEDTLS_MPI_CHK(f_rng(p_rng, Xp + overhead, size));
-    mpi_bigendian_to_host(X->p, limbs);
+    MPI_VALIDATE_RET( X     );
+    MPI_VALIDATE_RET( f_rng );
+    MBEDTLS_MPI_CHK(mbedtls_mpi_resize( X, limbs ));
+    MBEDTLS_MPI_CHK( mbedtls_mpi_lset( X, 0 ) ); /* set X to 0 before the random bytes are written in */
+    Xp = (unsigned char*) X->p; /* byte view of the limb array */
+    MBEDTLS_MPI_CHK( f_rng( p_rng, Xp + overhead, size ) ); /* random bytes fill the tail of the buffer, after the overhead bytes */
+    mpi_bigendian_to_host( X->p, limbs ); /* presumably turns that big-endian byte image into host-order limbs (helper not shown) */
 cleanup:
-    return ret;
+    return( ret );
 }
 
 /**
@@ -2289,136 +2318,108 @@ cleanup:
  * \return         #MBEDTLS_ERR_MPI_NOT_ACCEPTABLE if \p has no modular inverse
  *                 with respect to \p N.
  */
-int mbedtls_mpi_inv_mod(mbedtls_mpi *X, const mbedtls_mpi *A,
-                        const mbedtls_mpi *N)
+int mbedtls_mpi_inv_mod( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi *N )
 {
     int ret = MBEDTLS_ERR_THIS_CORRUPTION;
     mbedtls_mpi G, TA, TU, U1, U2, TB, TV, V1, V2;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(A);
-    MPI_VALIDATE_RET(N);
-    if (mbedtls_mpi_cmp_int(N, 1) <= 0)
-        return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
-    mbedtls_mpi_init(&TA);
-    mbedtls_mpi_init(&TU);
-    mbedtls_mpi_init(&U1);
-    mbedtls_mpi_init(&U2);
-    mbedtls_mpi_init(&G);
-    mbedtls_mpi_init(&TB);
-    mbedtls_mpi_init(&TV);
-    mbedtls_mpi_init(&V1);
-    mbedtls_mpi_init(&V2);
-    MBEDTLS_MPI_CHK(mbedtls_mpi_gcd(&G, A, N));
-    if (!mbedtls_mpi_is_one(&G))
+    MPI_VALIDATE_RET( X );
+    MPI_VALIDATE_RET( A );
+    MPI_VALIDATE_RET( N );
+    if( mbedtls_mpi_cmp_int( N, 1 ) <= 0 ) /* the modulus must be greater than 1 */
+        return( MBEDTLS_ERR_MPI_BAD_INPUT_DATA );
+    mbedtls_mpi_init( &TA ); mbedtls_mpi_init( &TU ); mbedtls_mpi_init( &U1 ); mbedtls_mpi_init( &U2 );
+    mbedtls_mpi_init( &G ); mbedtls_mpi_init( &TB ); mbedtls_mpi_init( &TV );
+    mbedtls_mpi_init( &V1 ); mbedtls_mpi_init( &V2 );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_gcd( &G, A, N ) );
+    if( mbedtls_mpi_cmp_int( &G, 1 ) != 0 ) /* gcd(A, N) != 1: A has no inverse modulo N */
     {
         ret = MBEDTLS_ERR_MPI_NOT_ACCEPTABLE;
         goto cleanup;
     }
-    MBEDTLS_MPI_CHK(mbedtls_mpi_mod_mpi(&TA, A, N));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&TU, &TA));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&TB, N));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&TV, N));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_lset(&U1, 1));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_lset(&U2, 0));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_lset(&V1, 0));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_lset(&V2, 1));
+    MBEDTLS_MPI_CHK( mbedtls_mpi_mod_mpi( &TA, A, N ) ); /* TA = A mod N */
+    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &TU, &TA ) ); /* TU starts as TA and is reduced by the loop */
+    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &TB, N ) ); /* TB = N */
+    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &TV, N ) ); /* TV starts as N and is reduced by the loop */
+    MBEDTLS_MPI_CHK( mbedtls_mpi_lset( &U1, 1 ) ); /* (U1, U2) = (1, 0), so TU == U1*TA + U2*TB at the start */
+    MBEDTLS_MPI_CHK( mbedtls_mpi_lset( &U2, 0 ) );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_lset( &V1, 0 ) ); /* (V1, V2) = (0, 1), so TV == V1*TA + V2*TB at the start */
+    MBEDTLS_MPI_CHK( mbedtls_mpi_lset( &V2, 1 ) );
     do
     {
-        while (!(TU.p[0] & 1))
+        while( ( TU.p[0] & 1 ) == 0 ) /* while TU is even */
         {
-            mbedtls_mpi_shift_r(&TU, 1);
-            if ((U1.p[0] & 1) || (U2.p[0] & 1))
+            ShiftRight( TU.p, TU.n, 1 ); /* halve TU on its limb array (was mbedtls_mpi_shift_r) */
+            if( ( U1.p[0] & 1 ) != 0 || ( U2.p[0] & 1 ) != 0 ) /* a coefficient is odd: adjust both by (TB, -TA) before halving */
             {
-                MBEDTLS_MPI_CHK(mbedtls_mpi_add_mpi(&U1, &U1, &TB));
-                MBEDTLS_MPI_CHK(mbedtls_mpi_sub_mpi(&U2, &U2, &TA));
+                MBEDTLS_MPI_CHK( mbedtls_mpi_add_mpi( &U1, &U1, &TB ) );
+                MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mpi( &U2, &U2, &TA ) );
             }
-            mbedtls_mpi_shift_r(&U1, 1);
-            mbedtls_mpi_shift_r(&U2, 1);
+            ShiftRight( U1.p, U1.n, 1 ); /* halve the magnitudes of U1 and U2; the sign fields are untouched */
+            ShiftRight( U2.p, U2.n, 1 );
         }
-        while (!(TV.p[0] & 1))
+        while( ( TV.p[0] & 1 ) == 0 ) /* same treatment for TV with (V1, V2) */
         {
-            mbedtls_mpi_shift_r(&TV, 1);
-            if ((V1.p[0] & 1) || (V2.p[0] & 1))
+            ShiftRight( TV.p, TV.n, 1 );
+            if( ( V1.p[0] & 1 ) != 0 || ( V2.p[0] & 1 ) != 0 )
             {
-                MBEDTLS_MPI_CHK(mbedtls_mpi_add_mpi(&V1, &V1, &TB));
-                MBEDTLS_MPI_CHK(mbedtls_mpi_sub_mpi(&V2, &V2, &TA));
+                MBEDTLS_MPI_CHK( mbedtls_mpi_add_mpi( &V1, &V1, &TB ) );
+                MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mpi( &V2, &V2, &TA ) );
             }
-            mbedtls_mpi_shift_r(&V1, 1);
-            mbedtls_mpi_shift_r(&V2, 1);
+            ShiftRight( V1.p, V1.n, 1 );
+            ShiftRight( V2.p, V2.n, 1 );
         }
-        if (mbedtls_mpi_cmp_mpi(&TU, &TV) >= 0)
+        if( mbedtls_mpi_cmp_mpi( &TU, &TV ) >= 0 ) /* subtract the smaller triple from the larger one */
         {
-            MBEDTLS_MPI_CHK(mbedtls_mpi_sub_mpi(&TU, &TU, &TV));
-            MBEDTLS_MPI_CHK(mbedtls_mpi_sub_mpi(&U1, &U1, &V1));
-            MBEDTLS_MPI_CHK(mbedtls_mpi_sub_mpi(&U2, &U2, &V2));
+            MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mpi( &TU, &TU, &TV ) );
+            MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mpi( &U1, &U1, &V1 ) );
+            MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mpi( &U2, &U2, &V2 ) );
         }
         else
         {
-            MBEDTLS_MPI_CHK(mbedtls_mpi_sub_mpi(&TV, &TV, &TU));
-            MBEDTLS_MPI_CHK(mbedtls_mpi_sub_mpi(&V1, &V1, &U1));
-            MBEDTLS_MPI_CHK(mbedtls_mpi_sub_mpi(&V2, &V2, &U2));
+            MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mpi( &TV, &TV, &TU ) );
+            MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mpi( &V1, &V1, &U1 ) );
+            MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mpi( &V2, &V2, &U2 ) );
         }
-    } while (!mbedtls_mpi_is_zero(&TU));
-    while (V1.s < 0)
-    {
-        MBEDTLS_MPI_CHK(mbedtls_mpi_add_mpi(&V1, &V1, N));
     }
-    while (mbedtls_mpi_cmp_mpi(&V1, N) >= 0)
-    {
-        MBEDTLS_MPI_CHK(mbedtls_mpi_sub_mpi(&V1, &V1, N));
-    }
-    MBEDTLS_MPI_CHK(mbedtls_mpi_copy(X, &V1));
+    while( !mbedtls_mpi_is_zero(&TU) ); /* the do-loop ends once TU reaches 0 */
+    while( mbedtls_mpi_cmp_int( &V1, 0 ) < 0 ) /* add N until V1 is no longer negative */
+        MBEDTLS_MPI_CHK( mbedtls_mpi_add_mpi( &V1, &V1, N ) );
+    while( mbedtls_mpi_cmp_mpi( &V1, N ) >= 0 ) /* subtract N until V1 < N */
+        MBEDTLS_MPI_CHK( mbedtls_mpi_sub_mpi( &V1, &V1, N ) );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( X, &V1 ) ); /* V1, now in [0, N), is the result */
 cleanup:
-    mbedtls_mpi_free(&TA);
-    mbedtls_mpi_free(&TU);
-    mbedtls_mpi_free(&U1);
-    mbedtls_mpi_free(&U2);
-    mbedtls_mpi_free(&G);
-    mbedtls_mpi_free(&TB);
-    mbedtls_mpi_free(&TV);
-    mbedtls_mpi_free(&V1);
-    mbedtls_mpi_free(&V2);
-    return ret;
+    mbedtls_mpi_free( &TA ); mbedtls_mpi_free( &TU ); mbedtls_mpi_free( &U1 ); mbedtls_mpi_free( &U2 );
+    mbedtls_mpi_free( &G ); mbedtls_mpi_free( &TB ); mbedtls_mpi_free( &TV );
+    mbedtls_mpi_free( &V1 ); mbedtls_mpi_free( &V2 );
+    return( ret );
 }
 
 #if defined(MBEDTLS_GENPRIME)
 
-static const short kSmallPrime[] = {
-    3,   5,   7,   11,  13,  17,  19,  23,  29,  31,  37,  41,  43,  47,
-    53,  59,  61,  67,  71,  73,  79,  83,  89,  97,  101, 103, 107, 109,
-    113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191,
-    193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269,
-    271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353,
-    359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, 439,
-    443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523,
-    541, 547, 557, 563, 569, 571, 577, 587, 593, 599, 601, 607, 613, 617,
-    619, 631, 641, 643, 647, 653, 659, 661, 673, 677, 683, 691, 701, 709,
-    719, 727, 733, 739, 743, 751, 757, 761, 769, 773, 787, 797, 809, 811,
-    821, 823, 827, 829, 839, 853, 857, 859, 863, 877, 881, 883, 887, 907,
-    911, 919, 929, 937, 941, 947, 953, 967, 971, 977, 983, 991, 997,
-};
-
-static struct Divisor kSmallDivisor[ARRAYLEN(kSmallPrime)];
-
-static bool IsDivisible( const mbedtls_mpi_uint *Ap, size_t An, 
-                         mbedtls_mpi_sint b, struct Divisor d )
+static const short small_prime[] = /* odd primes below 1000 in ascending order, followed by a negative terminator */
 {
-    size_t i;
-    mbedtls_mpi_uint x, y, z;
-    MBEDTLS_ASSERT(b >= 3);
-    for (i = An, y = 0; i > 0; i--)
-    {
-        x = Ap[i - 1];
-        y = (y << biH) | (x >> biH);
-        z = Divide(y, d);
-        y -= z * b;
-        x <<= biH;
-        y = (y << biH) | (x >> biH);
-        z = Divide(y, d);
-        y -= z * b;
-    }
-    return !y;
-}
+        3,    5,    7,   11,   13,   17,   19,   23,
+       29,   31,   37,   41,   43,   47,   53,   59,
+       61,   67,   71,   73,   79,   83,   89,   97,
+      101,  103,  107,  109,  113,  127,  131,  137,
+      139,  149,  151,  157,  163,  167,  173,  179,
+      181,  191,  193,  197,  199,  211,  223,  227,
+      229,  233,  239,  241,  251,  257,  263,  269,
+      271,  277,  281,  283,  293,  307,  311,  313,
+      317,  331,  337,  347,  349,  353,  359,  367,
+      373,  379,  383,  389,  397,  401,  409,  419,
+      421,  431,  433,  439,  443,  449,  457,  461,
+      463,  467,  479,  487,  491,  499,  503,  509,
+      521,  523,  541,  547,  557,  563,  569,  571,
+      577,  587,  593,  599,  601,  607,  613,  617,
+      619,  631,  641,  643,  647,  653,  659,  661,
+      673,  677,  683,  691,  701,  709,  719,  727,
+      733,  739,  743,  751,  757,  761,  769,  773,
+      787,  797,  809,  811,  821,  823,  827,  829,
+      839,  853,  857,  859,  863,  877,  881,  883,
+      887,  907,  911,  919,  929,  937,  941,  947,
+      953,  967,  971,  977,  983,  991,  997, -103 /* the negative last entry ends the list: readers loop while the entry is > 0 */
+};
 
 /*
  * Small divisors test (X must be positive)
@@ -2429,126 +2430,114 @@ static bool IsDivisible( const mbedtls_mpi_uint *Ap, size_t An,
  * MBEDTLS_ERR_MPI_NOT_ACCEPTABLE: certain non-prime
  * other negative: error
  */
-static int mpi_check_small_factors(const mbedtls_mpi *X)
+static int mpi_check_small_factors( const mbedtls_mpi *X )
 {
     int ret = 0;
-    size_t i, n;
-    static bool once;
-    if (!(X->p[0] & 1))
-        return MBEDTLS_ERR_MPI_NOT_ACCEPTABLE;
-    n = mbedtls_mpi_limbs(X);
-    if (!once) {
-        for (i = 0; i < ARRAYLEN(kSmallPrime); ++i)
-            kSmallDivisor[i] = GetDivisor(kSmallPrime[i]);
-        once = true;
+    size_t i;
+    mbedtls_mpi_uint r;
+    if( ( X->p[0] & 1 ) == 0 )
+        return( MBEDTLS_ERR_MPI_NOT_ACCEPTABLE );
+    for( i = 0; small_prime[i] > 0; i++ )
+    {
+        if( mbedtls_mpi_cmp_int( X, small_prime[i] ) <= 0 )
+            return( 1 );
+        MBEDTLS_MPI_CHK( mbedtls_mpi_mod_int( &r, X, small_prime[i] ) );
+        if( r == 0 )
+            return( MBEDTLS_ERR_MPI_NOT_ACCEPTABLE );
     }
-    for (i = 0; i < ARRAYLEN(kSmallPrime); i++) {
-        if (n == 1 && mbedtls_mpi_cmp_int(X, kSmallPrime[i]) <= 0)
-            return 1;
-        if (IsDivisible(X->p, X->n, kSmallPrime[i], kSmallDivisor[i]))
-            return MBEDTLS_ERR_MPI_NOT_ACCEPTABLE;
-    }
-    return ret;
+cleanup:
+    return( ret );
 }
 
 /*
  * Miller-Rabin pseudo-primality test  (HAC 4.24)
  */
-static int mpi_miller_rabin(const mbedtls_mpi *X, size_t rounds,
-                            int (*f_rng)(void *, unsigned char *, size_t),
-                            void *p_rng)
+static int mpi_miller_rabin( const mbedtls_mpi *X, size_t rounds,
+                             int (*f_rng)(void *, unsigned char *, size_t),
+                             void *p_rng )
 {
     int ret, count;
     size_t i, j, k, s;
     mbedtls_mpi W, R, T, A, RR;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(f_rng);
-    mbedtls_mpi_init(&W);
-    mbedtls_mpi_init(&R);
-    mbedtls_mpi_init(&T);
-    mbedtls_mpi_init(&A);
-    mbedtls_mpi_init(&RR);
+    MPI_VALIDATE_RET( X     );
+    MPI_VALIDATE_RET( f_rng );
+    mbedtls_mpi_init( &W ); mbedtls_mpi_init( &R );
+    mbedtls_mpi_init( &T ); mbedtls_mpi_init( &A );
+    mbedtls_mpi_init( &RR );
     /*
      * W = |X| - 1
      * R = W >> lsb( W )
      */
-    MBEDTLS_MPI_CHK(mbedtls_mpi_sub_int(&W, X, 1));
-    s = mbedtls_mpi_lsb(&W);
-    MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&R, &W));
-    mbedtls_mpi_shift_r(&R, s);
-    for (i = 0; i < rounds; i++)
+    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_int( &W, X, 1 ) );
+    s = mbedtls_mpi_lsb( &W );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &R, &W ) );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_shift_r( &R, s ) );
+    for( i = 0; i < rounds; i++ )
     {
         /*
          * pick a random A, 1 < A < |X| - 1
          */
         count = 0;
-        do
-        {
-            MBEDTLS_MPI_CHK(mbedtls_mpi_fill_random(&A, X->n * ciL, f_rng, p_rng));
-            j = mbedtls_mpi_bitlen(&A);
-            k = mbedtls_mpi_bitlen(&W);
-            if (j > k)
-            {
-                A.p[A.n - 1] &= ((mbedtls_mpi_uint)1 << (k - (A.n - 1) * biL - 1)) - 1;
+        do {
+            MBEDTLS_MPI_CHK( mbedtls_mpi_fill_random( &A, X->n * ciL, f_rng, p_rng ) );
+            j = mbedtls_mpi_bitlen( &A );
+            k = mbedtls_mpi_bitlen( &W );
+            if (j > k) {
+                A.p[A.n - 1] &= ( (mbedtls_mpi_uint) 1 << ( k - ( A.n - 1 ) * biL - 1 ) ) - 1;
             }
-            if (count++ > 30)
-            {
+            if (count++ > 30) {
                 ret = MBEDTLS_ERR_MPI_NOT_ACCEPTABLE;
                 goto cleanup;
             }
-        } while (mbedtls_mpi_cmp_mpi(&A, &W) >= 0 ||
-                 mbedtls_mpi_cmp_int(&A, 1) <= 0);
-
+        } while ( mbedtls_mpi_cmp_mpi( &A, &W ) >= 0 ||
+                  mbedtls_mpi_cmp_int( &A, 1 )  <= 0    );
         /*
          * A = A^R mod |X|
          */
-        MBEDTLS_MPI_CHK(mbedtls_mpi_exp_mod(&A, &A, &R, X, &RR));
-        if (!mbedtls_mpi_cmp_mpi(&A, &W) || mbedtls_mpi_is_one(&A))
+        MBEDTLS_MPI_CHK( mbedtls_mpi_exp_mod( &A, &A, &R, X, &RR ) );
+        if( mbedtls_mpi_cmp_mpi( &A, &W ) == 0 ||
+            mbedtls_mpi_cmp_int( &A,  1 ) == 0 )
             continue;
         j = 1;
-
-        while (j < s && mbedtls_mpi_cmp_mpi(&A, &W))
+        while( j < s && mbedtls_mpi_cmp_mpi( &A, &W ) != 0 )
         {
             /*
              * A = A * A mod |X|
              */
-            MBEDTLS_MPI_CHK(mbedtls_mpi_mul_mpi(&T, &A, &A));
-            MBEDTLS_MPI_CHK(mbedtls_mpi_mod_mpi(&A, &T, X));
-            if (mbedtls_mpi_is_one(&A)) break;
+            MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mpi( &T, &A, &A ) );
+            MBEDTLS_MPI_CHK( mbedtls_mpi_mod_mpi( &A, &T, X  ) );
+            if( mbedtls_mpi_cmp_int( &A, 1 ) == 0 )
+                break;
             j++;
         }
-
         /*
          * not prime if A != |X| - 1 or A == 1
          */
-        if (mbedtls_mpi_cmp_mpi(&A, &W) || mbedtls_mpi_is_one(&A))
+        if( mbedtls_mpi_cmp_mpi( &A, &W ) != 0 ||
+            mbedtls_mpi_cmp_int( &A,  1 ) == 0 )
         {
             ret = MBEDTLS_ERR_MPI_NOT_ACCEPTABLE;
             break;
         }
     }
-
 cleanup:
-    mbedtls_mpi_free(&W);
-    mbedtls_mpi_free(&R);
-    mbedtls_mpi_free(&T);
-    mbedtls_mpi_free(&A);
-    mbedtls_mpi_free(&RR);
-    return ret;
+    mbedtls_mpi_free( &W ); mbedtls_mpi_free( &R );
+    mbedtls_mpi_free( &T ); mbedtls_mpi_free( &A );
+    mbedtls_mpi_free( &RR );
+    return( ret );
 }
 
 /**
  * \brief          Miller-Rabin primality test.
  *
- * \warning        If \p X is potentially generated by an adversary, for
- *                 example when validating cryptographic parameters that
- *                 you didn't generate yourself and that are supposed to
- *                 be prime, then \p rounds should be at least the half
- *                 of the security strength of the cryptographic
- *                 algorithm. On the other hand, if \p X is chosen
- *                 uniformly or non-adversially (as is the case when
- *                 mbedtls_mpi_gen_prime calls this function), then \p
- *                 rounds can be much lower.
+ * \warning        If \p X is potentially generated by an adversary, for example
+ *                 when validating cryptographic parameters that you didn't
+ *                 generate yourself and that are supposed to be prime, then
+ *                 \p rounds should be at least the half of the security
+ *                 strength of the cryptographic algorithm. On the other hand,
+ *                 if \p X is chosen uniformly or non-adversarially (as is the
+ *                 case when mbedtls_mpi_gen_prime calls this function), then
+ *                 \p rounds can be much lower.
  *
  * \param X        The MPI to check for primality.
  *                 This must point to an initialized MPI.
@@ -2561,32 +2550,33 @@ cleanup:
  *                 a context parameter.
  *
  * \return         \c 0 if successful, i.e. \p X is probably prime.
- * \return         #MBEDTLS_ERR_MPI_ALLOC_FAILED if a allocation failed.
+ * \return         #MBEDTLS_ERR_MPI_ALLOC_FAILED if a memory allocation failed.
  * \return         #MBEDTLS_ERR_MPI_NOT_ACCEPTABLE if \p X is not prime.
- * \return         Another negative error code on other failures.
+ * \return         Another negative error code on other kinds of failure.
  */
-int mbedtls_mpi_is_prime_ext(const mbedtls_mpi *X, int rounds,
-                             int (*f_rng)(void *, unsigned char *, size_t),
-                             void *p_rng)
+int mbedtls_mpi_is_prime_ext( const mbedtls_mpi *X, int rounds,
+                              int (*f_rng)(void *, unsigned char *, size_t),
+                              void *p_rng )
 {
     int ret = MBEDTLS_ERR_THIS_CORRUPTION;
     mbedtls_mpi XX;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(f_rng);
+    MPI_VALIDATE_RET( X     );
+    MPI_VALIDATE_RET( f_rng );
     XX.s = 1;
     XX.n = X->n;
     XX.p = X->p;
-    if (mbedtls_mpi_is_zero(&XX) || mbedtls_mpi_is_one(&XX))
-        return MBEDTLS_ERR_MPI_NOT_ACCEPTABLE;
-    if (!mbedtls_mpi_cmp_int(&XX, 2))
-        return 0;
-    if ((ret = mpi_check_small_factors(&XX)))
+    if( mbedtls_mpi_cmp_int( &XX, 0 ) == 0 ||
+        mbedtls_mpi_cmp_int( &XX, 1 ) == 0 )
+        return( MBEDTLS_ERR_MPI_NOT_ACCEPTABLE );
+    if( mbedtls_mpi_cmp_int( &XX, 2 ) == 0 )
+        return( 0 );
+    if( ( ret = mpi_check_small_factors( &XX ) ) != 0 )
     {
-        if (ret == 1)
-            return 0;
-        return ret;
+        if( ret == 1 )
+            return( 0 );
+        return( ret );
     }
-    return mpi_miller_rabin(&XX, rounds, f_rng, p_rng);
+    return( mpi_miller_rabin( &XX, rounds, f_rng, p_rng ) );
 }
 
 /**
@@ -2609,37 +2599,33 @@ int mbedtls_mpi_is_prime_ext(const mbedtls_mpi *X, int rounds,
  *
  * \return         \c 0 if successful, in which case \p X holds a
  *                 probably prime number.
- * \return         #MBEDTLS_ERR_MPI_ALLOC_FAILED if a allocation failed.
- * \return         #MBEDTLS_ERR_MPI_BAD_INPUT_DATA if `nbits` is not
- *                 between \c 3 and #MBEDTLS_MPI_MAX_BITS.
+ * \return         #MBEDTLS_ERR_MPI_ALLOC_FAILED if a memory allocation failed.
+ * \return         #MBEDTLS_ERR_MPI_BAD_INPUT_DATA if `nbits` is not between
+ *                 \c 3 and #MBEDTLS_MPI_MAX_BITS.
  */
-int mbedtls_mpi_gen_prime(mbedtls_mpi *X, size_t nbits, int flags,
-                          int (*f_rng)(void *, unsigned char *, size_t),
-                          void *p_rng)
+int mbedtls_mpi_gen_prime( mbedtls_mpi *X, size_t nbits, int flags,
+                           int (*f_rng)(void *, unsigned char *, size_t),
+                           void *p_rng )
 {
     int ret = MBEDTLS_ERR_MPI_NOT_ACCEPTABLE;
     size_t k, n;
     int rounds;
     mbedtls_mpi_uint r;
     mbedtls_mpi Y;
-    MPI_VALIDATE_RET(X);
-    MPI_VALIDATE_RET(f_rng);
-    if (nbits < 3 || nbits > MBEDTLS_MPI_MAX_BITS)
-        return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
-    mbedtls_mpi_init(&Y);
-    n = BITS_TO_LIMBS(nbits);
-    if (!(flags & MBEDTLS_MPI_GEN_PRIME_FLAG_LOW_ERR))
+    MPI_VALIDATE_RET( X     );
+    MPI_VALIDATE_RET( f_rng );
+    if( nbits < 3 || nbits > MBEDTLS_MPI_MAX_BITS )
+        return( MBEDTLS_ERR_MPI_BAD_INPUT_DATA );
+    mbedtls_mpi_init( &Y );
+    n = BITS_TO_LIMBS( nbits );
+    if( ( flags & MBEDTLS_MPI_GEN_PRIME_FLAG_LOW_ERR ) == 0 )
     {
         /*
          * 2^-80 error probability, number of rounds chosen per HAC, table 4.4
          */
-        rounds = ((nbits >= 1300) ? 2
-                : (nbits >= 850)  ? 3
-                : (nbits >= 650)  ? 4
-                : (nbits >= 350)  ? 8
-                : (nbits >= 250)  ? 12
-                : (nbits >= 150)  ? 18
-                : 27);
+        rounds = ( ( nbits >= 1300 ) ?  2 : ( nbits >=  850 ) ?  3 :
+                   ( nbits >=  650 ) ?  4 : ( nbits >=  350 ) ?  8 :
+                   ( nbits >=  250 ) ? 12 : ( nbits >=  150 ) ? 18 : 27 );
     }
     else
     {
@@ -2647,29 +2633,24 @@ int mbedtls_mpi_gen_prime(mbedtls_mpi *X, size_t nbits, int flags,
          * 2^-100 error probability, number of rounds computed based on HAC,
          * fact 4.48
          */
-        rounds = ((nbits >= 1450) ? 4
-                : (nbits >= 1150) ? 5
-                : (nbits >= 1000) ? 6
-                : (nbits >= 850)  ? 7
-                : (nbits >= 750)  ? 8
-                : (nbits >= 500)  ? 13
-                : (nbits >= 250)  ? 28
-                : (nbits >= 150)  ? 40
-                : 51);
+        rounds = ( ( nbits >= 1450 ) ?  4 : ( nbits >=  1150 ) ?  5 :
+                   ( nbits >= 1000 ) ?  6 : ( nbits >=   850 ) ?  7 :
+                   ( nbits >=  750 ) ?  8 : ( nbits >=   500 ) ? 13 :
+                   ( nbits >=  250 ) ? 28 : ( nbits >=   150 ) ? 40 : 51 );
     }
-    while (1)
+    while( 1 )
     {
-        MBEDTLS_MPI_CHK(mbedtls_mpi_fill_random(X, n * ciL, f_rng, p_rng));
-        /* make sure generated number is at least (nbits-1)+0.5 bits (FIPS 186-4
-         * §B.3.3 steps 4.4, 5.5) */
-        if (X->p[n - 1] < 0xb504f333f9de6485ULL /* ceil(2^63.5) */) continue;
+        MBEDTLS_MPI_CHK( mbedtls_mpi_fill_random( X, n * ciL, f_rng, p_rng ) );
+        /* make sure generated number is at least (nbits-1)+0.5 bits (FIPS 186-4 §B.3.3 steps 4.4, 5.5) */
+        if( X->p[n-1] < 0xb504f333f9de6485ULL  /* ceil(2^63.5) */ ) continue;
         k = n * biL;
-        if (k > nbits) mbedtls_mpi_shift_r(X, k - nbits);
+        if( k > nbits ) MBEDTLS_MPI_CHK( mbedtls_mpi_shift_r( X, k - nbits ) );
         X->p[0] |= 1;
-        if (!(flags & MBEDTLS_MPI_GEN_PRIME_FLAG_DH))
+        if( ( flags & MBEDTLS_MPI_GEN_PRIME_FLAG_DH ) == 0 )
         {
-            ret = mbedtls_mpi_is_prime_ext(X, rounds, f_rng, p_rng);
-            if (ret != MBEDTLS_ERR_MPI_NOT_ACCEPTABLE) goto cleanup;
+            ret = mbedtls_mpi_is_prime_ext( X, rounds, f_rng, p_rng );
+            if( ret != MBEDTLS_ERR_MPI_NOT_ACCEPTABLE )
+                goto cleanup;
         }
         else
         {
@@ -2679,158 +2660,178 @@ int mbedtls_mpi_gen_prime(mbedtls_mpi *X, size_t nbits, int flags,
              * Make sure it is satisfied, while keeping X = 3 mod 4
              */
             X->p[0] |= 2;
-            MBEDTLS_MPI_CHK(mbedtls_mpi_mod_int(&r, X, 3));
-            if (r == 0)
-                MBEDTLS_MPI_CHK(mbedtls_mpi_add_int(X, X, 8));
-            else if (r == 1)
-                MBEDTLS_MPI_CHK(mbedtls_mpi_add_int(X, X, 4));
+            MBEDTLS_MPI_CHK( mbedtls_mpi_mod_int( &r, X, 3 ) );
+            if( r == 0 )
+                MBEDTLS_MPI_CHK( mbedtls_mpi_add_int( X, X, 8 ) );
+            else if( r == 1 )
+                MBEDTLS_MPI_CHK( mbedtls_mpi_add_int( X, X, 4 ) );
             /* Set Y = (X-1) / 2, which is X / 2 because X is odd */
-            MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&Y, X));
-            mbedtls_mpi_shift_r( &Y, 1 );
-            while (1)
+            MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &Y, X ) );
+            ShiftRight( Y.p, Y.n, 1 );
+            while( 1 )
             {
                 /*
                  * First, check small factors for X and Y
                  * before doing Miller-Rabin on any of them
                  */
-                if (!(ret = mpi_check_small_factors(X)) &&
-                    !(ret = mpi_check_small_factors(&Y)) &&
-                    !(ret = mpi_miller_rabin(X, rounds, f_rng, p_rng)) &&
-                    !(ret = mpi_miller_rabin(&Y, rounds, f_rng, p_rng)))
+                if( ( ret = mpi_check_small_factors(  X         ) ) == 0 &&
+                    ( ret = mpi_check_small_factors( &Y         ) ) == 0 &&
+                    ( ret = mpi_miller_rabin(  X, rounds, f_rng, p_rng  ) )
+                                                                    == 0 &&
+                    ( ret = mpi_miller_rabin( &Y, rounds, f_rng, p_rng  ) )
+                                                                    == 0 )
+                    goto cleanup;
+                if( ret != MBEDTLS_ERR_MPI_NOT_ACCEPTABLE )
                     goto cleanup;
-                if (ret != MBEDTLS_ERR_MPI_NOT_ACCEPTABLE) goto cleanup;
                 /*
                  * Next candidates. We want to preserve Y = (X-1) / 2 and
                  * Y = 1 mod 2 and Y = 2 mod 3 (eq X = 3 mod 4 and X = 2 mod 3)
                  * so up Y by 6 and X by 12.
                  */
-                MBEDTLS_MPI_CHK(mbedtls_mpi_add_int(X, X, 12));
-                MBEDTLS_MPI_CHK(mbedtls_mpi_add_int(&Y, &Y, 6));
+                MBEDTLS_MPI_CHK( mbedtls_mpi_add_int(  X,  X, 12 ) );
+                MBEDTLS_MPI_CHK( mbedtls_mpi_add_int( &Y, &Y, 6  ) );
             }
         }
     }
 cleanup:
-    mbedtls_mpi_free(&Y);
-    return ret;
+    mbedtls_mpi_free( &Y );
+    return( ret );
 }
 
 #endif /* MBEDTLS_GENPRIME */
 
 #if defined(MBEDTLS_SELF_TEST)
 
-#define GCD_PAIR_COUNT 3
+#define GCD_PAIR_COUNT  3
 
-static const int gcd_pairs[GCD_PAIR_COUNT][3] = {
-    {693, 609, 21}, {1764, 868, 28}, {768454923, 542167814, 1}};
+static const int gcd_pairs[GCD_PAIR_COUNT][3] =
+{
+    { 693, 609, 21 },
+    { 1764, 868, 28 },
+    { 768454923, 542167814, 1 }
+};
 
 /**
  * \brief          Checkup routine
  *
  * \return         0 if successful, or 1 if the test failed
  */
-int mbedtls_mpi_self_test(int verbose)
+int mbedtls_mpi_self_test( int verbose )
 {
     int ret, i;
     mbedtls_mpi A, E, N, X, Y, U, V;
-    mbedtls_mpi_init(&A);
-    mbedtls_mpi_init(&E);
-    mbedtls_mpi_init(&N);
-    mbedtls_mpi_init(&X);
-    mbedtls_mpi_init(&Y);
-    mbedtls_mpi_init(&U);
-    mbedtls_mpi_init(&V);
-    MBEDTLS_MPI_CHK(mbedtls_mpi_read_string(&A, 16,
-                                            "EFE021C2645FD1DC586E69184AF4A31E"
-                                            "D5F53E93B5F123FA41680867BA110131"
-                                            "944FE7952E2517337780CB0DB80E61AA"
-                                            "E7C8DDC6C5C6AADEB34EB38A2F40D5E6"));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_read_string(&E, 16,
-                                            "B2E7EFD37075B9F03FF989C7C5051C20"
-                                            "34D2A323810251127E7BF8625A4F49A5"
-                                            "F3E27F4DA8BD59C47D6DAABA4C8127BD"
-                                            "5B5C25763222FEFCCFC38B832366C29E"));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_read_string(&N, 16,
-                                            "0066A198186C18C10B2F5ED9B522752A"
-                                            "9830B69916E535C8F047518A889A43A5"
-                                            "94B6BED27A168D31D4A52F88925AA8F5"));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_mul_mpi(&X, &A, &N));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_read_string(&U, 16,
-                                            "602AB7ECA597A3D6B56FF9829A5E8B85"
-                                            "9E857EA95A03512E2BAE7391688D264A"
-                                            "A5663B0341DB9CCFD2C4C5F421FEC814"
-                                            "8001B72E848A38CAE1C65F78E56ABDEF"
-                                            "E12D3C039B8A02D6BE593F0BBBDA56F1"
-                                            "ECF677152EF804370C1A305CAF3B5BF1"
-                                            "30879B56C61DE584A0F53A2447A51E"));
-    if (verbose) mbedtls_printf("  MPI test #1 (mul_mpi): ");
-    if (mbedtls_mpi_cmp_mpi(&X, &U)) {
-        if (verbose) mbedtls_printf("failed\n");
+    mbedtls_mpi_init( &A ); mbedtls_mpi_init( &E ); mbedtls_mpi_init( &N ); mbedtls_mpi_init( &X );
+    mbedtls_mpi_init( &Y ); mbedtls_mpi_init( &U ); mbedtls_mpi_init( &V );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_read_string( &A, 16,
+        "EFE021C2645FD1DC586E69184AF4A31E" \
+        "D5F53E93B5F123FA41680867BA110131" \
+        "944FE7952E2517337780CB0DB80E61AA" \
+        "E7C8DDC6C5C6AADEB34EB38A2F40D5E6" ) );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_read_string( &E, 16,
+        "B2E7EFD37075B9F03FF989C7C5051C20" \
+        "34D2A323810251127E7BF8625A4F49A5" \
+        "F3E27F4DA8BD59C47D6DAABA4C8127BD" \
+        "5B5C25763222FEFCCFC38B832366C29E" ) );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_read_string( &N, 16,
+        "0066A198186C18C10B2F5ED9B522752A" \
+        "9830B69916E535C8F047518A889A43A5" \
+        "94B6BED27A168D31D4A52F88925AA8F5" ) );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_mpi( &X, &A, &N ) );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_read_string( &U, 16,
+        "602AB7ECA597A3D6B56FF9829A5E8B85" \
+        "9E857EA95A03512E2BAE7391688D264A" \
+        "A5663B0341DB9CCFD2C4C5F421FEC814" \
+        "8001B72E848A38CAE1C65F78E56ABDEF" \
+        "E12D3C039B8A02D6BE593F0BBBDA56F1" \
+        "ECF677152EF804370C1A305CAF3B5BF1" \
+        "30879B56C61DE584A0F53A2447A51E" ) );
+    if( verbose != 0 )
+        mbedtls_printf( "  MPI test #1 (mul_mpi): " );
+    if( mbedtls_mpi_cmp_mpi( &X, &U ) != 0 )
+    {
+        if( verbose != 0 )
+            mbedtls_printf( "failed\n" );
         ret = 1;
         goto cleanup;
     }
-    if (verbose) mbedtls_printf("passed\n");
-    MBEDTLS_MPI_CHK(mbedtls_mpi_div_mpi(&X, &Y, &A, &N));
-    MBEDTLS_MPI_CHK(
-        mbedtls_mpi_read_string(&U, 16, "256567336059E52CAE22925474705F39A94"));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_read_string(&V, 16,
-                                            "6613F26162223DF488E9CD48CC132C7A"
-                                            "0AC93C701B001B092E4E5B9F73BCD27B"
-                                            "9EE50D0657C77F374E903CDFA4C642"));
-    if (verbose) mbedtls_printf("  MPI test #2 (div_mpi): ");
-    if (mbedtls_mpi_cmp_mpi(&X, &U) || mbedtls_mpi_cmp_mpi(&Y, &V)) {
-        if (verbose) mbedtls_printf("failed\n");
+    if( verbose != 0 )
+        mbedtls_printf( "passed\n" );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_div_mpi( &X, &Y, &A, &N ) );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_read_string( &U, 16,
+        "256567336059E52CAE22925474705F39A94" ) );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_read_string( &V, 16,
+        "6613F26162223DF488E9CD48CC132C7A" \
+        "0AC93C701B001B092E4E5B9F73BCD27B" \
+        "9EE50D0657C77F374E903CDFA4C642" ) );
+    if( verbose != 0 )
+        mbedtls_printf( "  MPI test #2 (div_mpi): " );
+    if( mbedtls_mpi_cmp_mpi( &X, &U ) != 0 ||
+        mbedtls_mpi_cmp_mpi( &Y, &V ) != 0 )
+    {
+        if( verbose != 0 )
+            mbedtls_printf( "failed\n" );
         ret = 1;
         goto cleanup;
     }
-    if (verbose) mbedtls_printf("passed\n");
-    MBEDTLS_MPI_CHK(mbedtls_mpi_exp_mod(&X, &A, &E, &N, NULL));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_read_string(&U, 16,
-                                            "36E139AEA55215609D2816998ED020BB"
-                                            "BD96C37890F65171D948E9BC7CBAA4D9"
-                                            "325D24D6A3C12710F10A09FA08AB87"));
-    if (verbose) mbedtls_printf("  MPI test #3 (exp_mod): ");
-    if (mbedtls_mpi_cmp_mpi(&X, &U)) {
-        if (verbose) mbedtls_printf("failed\n");
+    if( verbose != 0 )
+        mbedtls_printf( "passed\n" );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_exp_mod( &X, &A, &E, &N, NULL ) );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_read_string( &U, 16,
+        "36E139AEA55215609D2816998ED020BB" \
+        "BD96C37890F65171D948E9BC7CBAA4D9" \
+        "325D24D6A3C12710F10A09FA08AB87" ) );
+    if( verbose != 0 )
+        mbedtls_printf( "  MPI test #3 (exp_mod): " );
+    if( mbedtls_mpi_cmp_mpi( &X, &U ) != 0 )
+    {
+        if( verbose != 0 )
+            mbedtls_printf( "failed\n" );
         ret = 1;
         goto cleanup;
     }
-    if (verbose) mbedtls_printf("passed\n");
-    MBEDTLS_MPI_CHK(mbedtls_mpi_inv_mod(&X, &A, &N));
-    MBEDTLS_MPI_CHK(mbedtls_mpi_read_string(&U, 16,
-                                            "003A0AAEDD7E784FC07D8F9EC6E3BFD5"
-                                            "C3DBA76456363A10869622EAC2DD84EC"
-                                            "C5B8A74DAC4D09E03B5E0BE779F2DF61"));
-    if (verbose) mbedtls_printf("  MPI test #4 (inv_mod): ");
-    if (mbedtls_mpi_cmp_mpi(&X, &U)) {
-        if (verbose) mbedtls_printf("failed\n");
+    if( verbose != 0 )
+        mbedtls_printf( "passed\n" );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_inv_mod( &X, &A, &N ) );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_read_string( &U, 16,
+        "003A0AAEDD7E784FC07D8F9EC6E3BFD5" \
+        "C3DBA76456363A10869622EAC2DD84EC" \
+        "C5B8A74DAC4D09E03B5E0BE779F2DF61" ) );
+    if( verbose != 0 )
+        mbedtls_printf( "  MPI test #4 (inv_mod): " );
+    if( mbedtls_mpi_cmp_mpi( &X, &U ) != 0 )
+    {
+        if( verbose != 0 )
+            mbedtls_printf( "failed\n" );
         ret = 1;
         goto cleanup;
     }
-    if (verbose) mbedtls_printf("passed\n");
-    if (verbose) mbedtls_printf("  MPI test #5 (simple gcd): ");
-    for (i = 0; i < GCD_PAIR_COUNT; i++) {
-        MBEDTLS_MPI_CHK(mbedtls_mpi_lset(&X, gcd_pairs[i][0]));
-        MBEDTLS_MPI_CHK(mbedtls_mpi_lset(&Y, gcd_pairs[i][1]));
-        MBEDTLS_MPI_CHK(mbedtls_mpi_gcd(&A, &X, &Y));
-        if (mbedtls_mpi_cmp_int(&A, gcd_pairs[i][2])) {
-            if (verbose) mbedtls_printf("failed at %d\n", i);
+    if( verbose != 0 )
+        mbedtls_printf( "passed\n" );
+    if( verbose != 0 )
+        mbedtls_printf( "  MPI test #5 (simple gcd): " );
+    for( i = 0; i < GCD_PAIR_COUNT; i++ )
+    {
+        MBEDTLS_MPI_CHK( mbedtls_mpi_lset( &X, gcd_pairs[i][0] ) );
+        MBEDTLS_MPI_CHK( mbedtls_mpi_lset( &Y, gcd_pairs[i][1] ) );
+        MBEDTLS_MPI_CHK( mbedtls_mpi_gcd( &A, &X, &Y ) );
+        if( mbedtls_mpi_cmp_int( &A, gcd_pairs[i][2] ) != 0 )
+        {
+            if( verbose != 0 )
+                mbedtls_printf( "failed at %d\n", i );
             ret = 1;
             goto cleanup;
         }
     }
-    if (verbose) mbedtls_printf("passed\n");
+    if( verbose != 0 )
+        mbedtls_printf( "passed\n" );
 cleanup:
-    if (ret && verbose)
-        mbedtls_printf("Unexpected error, return code = %08X\n", (unsigned int)ret);
-    mbedtls_mpi_free(&A);
-    mbedtls_mpi_free(&E);
-    mbedtls_mpi_free(&N);
-    mbedtls_mpi_free(&X);
-    mbedtls_mpi_free(&Y);
-    mbedtls_mpi_free(&U);
-    mbedtls_mpi_free(&V);
-    if (verbose) mbedtls_printf("\n");
-    return ret;
+    if( ret != 0 && verbose != 0 )
+        mbedtls_printf( "Unexpected error, return code = %08X\n", (unsigned int) ret );
+    mbedtls_mpi_free( &A ); mbedtls_mpi_free( &E ); mbedtls_mpi_free( &N ); mbedtls_mpi_free( &X );
+    mbedtls_mpi_free( &Y ); mbedtls_mpi_free( &U ); mbedtls_mpi_free( &V );
+    if( verbose != 0 )
+        mbedtls_printf( "\n" );
+    return( ret );
 }
 
 #endif /* MBEDTLS_SELF_TEST */
diff --git a/third_party/mbedtls/config.h b/third_party/mbedtls/config.h
index fd3a085cf..c94cab0c7 100644
--- a/third_party/mbedtls/config.h
+++ b/third_party/mbedtls/config.h
@@ -80,17 +80,17 @@
 #ifndef TINY
 #define MBEDTLS_ECP_DP_SECP256R1_ENABLED
 #define MBEDTLS_ECP_DP_SECP384R1_ENABLED
+#define MBEDTLS_ECP_DP_SECP521R1_ENABLED
 #define MBEDTLS_ECP_DP_CURVE25519_ENABLED
+#define MBEDTLS_ECP_DP_CURVE448_ENABLED
 /*#define MBEDTLS_ECP_DP_SECP192R1_ENABLED*/
 /*#define MBEDTLS_ECP_DP_SECP224R1_ENABLED*/
-/*#define MBEDTLS_ECP_DP_SECP521R1_ENABLED*/
 /*#define MBEDTLS_ECP_DP_SECP192K1_ENABLED*/
 /*#define MBEDTLS_ECP_DP_SECP224K1_ENABLED*/
 /*#define MBEDTLS_ECP_DP_SECP256K1_ENABLED*/
 /*#define MBEDTLS_ECP_DP_BP256R1_ENABLED*/
 /*#define MBEDTLS_ECP_DP_BP384R1_ENABLED*/
 /*#define MBEDTLS_ECP_DP_BP512R1_ENABLED*/
-/*#define MBEDTLS_ECP_DP_CURVE448_ENABLED*/
 #endif
 
 #define MBEDTLS_X509_CHECK_KEY_USAGE
diff --git a/third_party/mbedtls/ecdh.h b/third_party/mbedtls/ecdh.h
index 93215bae6..16010d7ca 100644
--- a/third_party/mbedtls/ecdh.h
+++ b/third_party/mbedtls/ecdh.h
@@ -1,8 +1,8 @@
 #ifndef MBEDTLS_ECDH_H
 #define MBEDTLS_ECDH_H
 #include "third_party/mbedtls/config.h"
+#include "third_party/mbedtls/ecdh_everest.h"
 #include "third_party/mbedtls/ecp.h"
-#include "third_party/mbedtls/everest.h"
 /* clang-format off */
 
 #ifdef __cplusplus
diff --git a/third_party/mbedtls/ecdh_everest.c b/third_party/mbedtls/ecdh_everest.c
new file mode 100644
index 000000000..d29996de6
--- /dev/null
+++ b/third_party/mbedtls/ecdh_everest.c
@@ -0,0 +1,279 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:4;coding:utf-8 -*-│
+│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8                                :vi│
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright The Mbed TLS Contributors                                          │
+│                                                                              │
+│ Licensed under the Apache License, Version 2.0 (the "License");              │
+│ you may not use this file except in compliance with the License.             │
+│ You may obtain a copy of the License at                                      │
+│                                                                              │
+│     http://www.apache.org/licenses/LICENSE-2.0                               │
+│                                                                              │
+│ Unless required by applicable law or agreed to in writing, software          │
+│ distributed under the License is distributed on an "AS IS" BASIS,            │
+│ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.     │
+│ See the License for the specific language governing permissions and          │
+│ limitations under the License.                                               │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "third_party/mbedtls/ecdh_everest.h"
+#include "third_party/mbedtls/everest.h"
+#if defined(MBEDTLS_ECDH_C) && defined(MBEDTLS_ECDH_VARIANT_EVEREST_ENABLED)
+#define KEYSIZE 32
+
+asm(".ident\t\"\\n\\n\
+Mbed TLS (Apache 2.0)\\n\
+Copyright ARM Limited\\n\
+Copyright Mbed TLS Contributors\"");
+asm(".include \"libc/disclaimer.inc\"");
+/* clang-format off */
+
+/**
+ * \brief           This function sets up the ECDH context with the information
+ *                  given.
+ *
+ *                  This function should be called after mbedtls_ecdh_init() but
+ *                  before mbedtls_ecdh_make_params(). There is no need to call
+ *                  this function before mbedtls_ecdh_read_params().
+ *
+ *                  This is the first function used by a TLS server for
+ *                  ECDHE ciphersuites.
+ *
+ * \param ctx       The ECDH context to set up; it is wiped on success.
+ * \param grp_id    The group id; only MBEDTLS_ECP_DP_CURVE25519 is accepted.
+ *
+ * \return          \c 0 on success, else MBEDTLS_ERR_ECP_BAD_INPUT_DATA.
+ */
+int mbedtls_everest_setup(mbedtls_ecdh_context_everest *ctx, int grp_id)
+{
+  if (grp_id != MBEDTLS_ECP_DP_CURVE25519) /* the only group handled here */
+    return MBEDTLS_ERR_ECP_BAD_INPUT_DATA;
+  mbedtls_platform_zeroize(ctx, sizeof(*ctx)); /* covers the whole struct */
+  return 0;
+}
+
+/**
+ * \brief           This function wipes a context; it releases no memory.
+ *
+ * \param ctx       The context to wipe. If \c NULL, nothing happens.
+ */
+void mbedtls_everest_free(mbedtls_ecdh_context_everest *ctx)
+{
+  if (!ctx) return; /* NULL is tolerated */
+  mbedtls_platform_zeroize(ctx, sizeof(*ctx)); /* covers the whole struct */
+}
+
+/**
+ * \brief           This function generates a public key and a TLS
+ *                  ServerKeyExchange payload.
+ *
+ *                  This is the second function used by a TLS server for ECDHE
+ *                  ciphersuites. (It is called after mbedtls_ecdh_setup().)
+ *
+ * \note            A fresh secret is drawn from \p f_rng on every call and
+ *                  stored in \p ctx. The output is a 4-byte header (curve
+ *                  type, 2-byte curve id, key length) followed by the key.
+ *
+ * \see             ecp.h
+ *
+ * \param ctx       The ECDH context.
+ * \param olen      The number of characters written.
+ * \param buf       The destination buffer.
+ * \param blen      The length of the destination buffer.
+ * \param f_rng     The RNG function.
+ * \param p_rng     The RNG context.
+ *
+ * \return          \c 0 on success.
+ * \return          The \p f_rng error or an \c MBEDTLS_ERR_ECP_XXX code on failure.
+ */
+int mbedtls_everest_make_params(mbedtls_ecdh_context_everest *ctx, size_t *olen,
+                                unsigned char *buf, size_t blen,
+                                int (*f_rng)(void *, unsigned char *, size_t),
+                                void *p_rng)
+{
+  int ret = 0;
+  uint8_t base[KEYSIZE] = {9}; /* base[0] = 9, remaining bytes are zero */
+  if ((ret = f_rng(p_rng, ctx->our_secret, KEYSIZE)) != 0) return ret;
+  *olen = KEYSIZE + 4; /* also set when the size check below fails */
+  if (blen < *olen) return MBEDTLS_ERR_ECP_BUFFER_TOO_SMALL;
+  *buf++ = MBEDTLS_ECP_TLS_NAMED_CURVE;
+  *buf++ = MBEDTLS_ECP_TLS_CURVE25519 >> 8; /* curve id, high byte first */
+  *buf++ = MBEDTLS_ECP_TLS_CURVE25519 & 0xFF;
+  *buf++ = KEYSIZE;
+  curve25519(buf, ctx->our_secret, base);
+  base[0] = 0; /* base becomes all zeros for the comparison below */
+  if (!timingsafe_memcmp(buf, base, KEYSIZE)) /* reject an all-zero result */
+    return MBEDTLS_ERR_ECP_RANDOM_FAILED;
+  return 0;
+}
+
+/**
+ * \brief           This function parses and processes a TLS ServerKeyExchange
+ *                  payload.
+ *
+ *                  This is the first function used by a TLS client for ECDHE
+ *                  ciphersuites.
+ *
+ * \see             ecp.h
+ *
+ * \param ctx       The ECDH context.
+ * \param buf       In: start of the input. Out: advanced past the bytes read.
+ * \param end       The address for one Byte past the end of the buffer.
+ *
+ * \return          \c 0 on success.
+ * \return          An \c MBEDTLS_ERR_ECP_XXX error code on failure.
+ */
+int mbedtls_everest_read_params(mbedtls_ecdh_context_everest *ctx,
+                                const unsigned char **buf,
+                                const unsigned char *end)
+{
+  if (end - *buf < KEYSIZE + 1) return MBEDTLS_ERR_ECP_BAD_INPUT_DATA;
+  if ((*(*buf)++ != KEYSIZE)) return MBEDTLS_ERR_ECP_BAD_INPUT_DATA;
+  memcpy(ctx->peer_point, *buf, KEYSIZE); /* key follows the length byte */
+  *buf += KEYSIZE; /* leave *buf just past the key */
+  return 0;
+}
+
+/**
+ * \brief           This function sets up an ECDH context from an EC key.
+ *
+ *                  It is used by clients and servers in place of the
+ *                  ServerKeyExchange for static ECDH, and imports ECDH
+ *                  parameters from the EC key information of a certificate.
+ *
+ * \see             ecp.h
+ *
+ * \param ctx       The ECDH context to set up.
+ * \param key       The EC key to use.
+ * \param side      Defines the source of the key: MBEDTLS_EVEREST_ECDH_OURS
+ *                  (our key) or MBEDTLS_EVEREST_ECDH_THEIRS (the peer's key).
+ *
+ * \return          \c 0 on success.
+ * \return          An \c MBEDTLS_ERR_ECP_XXX error code on failure.
+ */
+int mbedtls_everest_get_params(mbedtls_ecdh_context_everest *ctx,
+                               const mbedtls_ecp_keypair *key,
+                               mbedtls_everest_ecdh_side side)
+{
+  size_t olen = 0; /* receives the written length; not otherwise used */
+  switch (side)
+  {
+    case MBEDTLS_EVEREST_ECDH_THEIRS:
+      /* Peer key: serialize the public point Q into peer_point. */
+      return mbedtls_ecp_point_write_binary(&key->grp, &key->Q,
+                                            MBEDTLS_ECP_PF_COMPRESSED, &olen,
+                                            ctx->peer_point, KEYSIZE);
+    case MBEDTLS_EVEREST_ECDH_OURS:
+      return mbedtls_mpi_write_binary_le(&key->d, ctx->our_secret, KEYSIZE);
+    default:
+      return MBEDTLS_ERR_ECP_BAD_INPUT_DATA;
+  }
+}
+
+/**
+ * \brief           This function generates a public key and a TLS
+ *                  ClientKeyExchange payload.
+ *
+ *                  This is the second function used by a TLS client for ECDH(E)
+ *                  ciphersuites.
+ *
+ * \see             ecp.h
+ *
+ * \param ctx       The ECDH context. \c NULL is rejected.
+ * \param olen      The number of Bytes written.
+ * \param buf       The destination buffer.
+ * \param blen      The size of the destination buffer.
+ * \param f_rng     The RNG function.
+ * \param p_rng     The RNG context.
+ *
+ * \return          \c 0 on success.
+ * \return          The \p f_rng error or an \c MBEDTLS_ERR_ECP_XXX code on failure.
+ */
+int mbedtls_everest_make_public(mbedtls_ecdh_context_everest *ctx, size_t *olen,
+                                unsigned char *buf, size_t blen,
+                                int (*f_rng)(void *, unsigned char *, size_t),
+                                void *p_rng)
+{
+  int ret = 0;
+  unsigned char base[KEYSIZE] = {9}; /* base[0] = 9, remaining bytes zero */
+  if (!ctx) return MBEDTLS_ERR_ECP_BAD_INPUT_DATA;
+  if ((ret = f_rng(p_rng, ctx->our_secret, KEYSIZE))) return ret;
+  *olen = KEYSIZE + 1; /* one length byte precedes the key */
+  if (blen < *olen) return MBEDTLS_ERR_ECP_BUFFER_TOO_SMALL;
+  *buf++ = KEYSIZE;
+  curve25519(buf, ctx->our_secret, base);
+  base[0] = 0; /* base becomes all zeros for the comparison below */
+  if (!timingsafe_memcmp(buf, base, KEYSIZE)) /* reject an all-zero result */
+    return MBEDTLS_ERR_ECP_RANDOM_FAILED;
+  return ret; /* ret is 0 on this path */
+}
+
+/**
+ * \brief       This function parses and processes a TLS ClientKeyExchange
+ *              payload.
+ *
+ *              This is the third function used by a TLS server for ECDH(E)
+ *              ciphersuites. (It is called after mbedtls_ecdh_setup() and
+ *              mbedtls_ecdh_make_params().)
+ *
+ * \see         ecp.h
+ *
+ * \param ctx   The ECDH context.
+ * \param buf   The start of the input: one length byte, then the key bytes.
+ * \param blen  The length of the input buffer.
+ *
+ * \return      \c 0 on success, MBEDTLS_ERR_ECP_BUFFER_TOO_SMALL if \p blen is
+ * \return      short, MBEDTLS_ERR_ECP_BAD_INPUT_DATA if the length byte is wrong.
+ */
+int mbedtls_everest_read_public(mbedtls_ecdh_context_everest *ctx,
+                                const unsigned char *buf, size_t blen)
+{
+  if (blen < KEYSIZE + 1) return MBEDTLS_ERR_ECP_BUFFER_TOO_SMALL;
+  if ((*buf++ != KEYSIZE)) return MBEDTLS_ERR_ECP_BAD_INPUT_DATA;
+  memcpy(ctx->peer_point, buf, KEYSIZE); /* key follows the length byte */
+  return 0;
+}
+
+/**
+ * \brief           This function derives and exports the shared secret.
+ *
+ *                  This is the last function used by both TLS client
+ *                  and servers.
+ *
+ * \note            \p f_rng and \p p_rng are not used by this implementation.
+ *                  Once the result has been computed, the secret stored in
+ *                  \p ctx is wiped on every return path.
+ *
+ * \see             ecp.h
+ *
+ * \param ctx       The ECDH context.
+ * \param olen      The number of Bytes written.
+ * \param buf       The destination buffer.
+ * \param blen      The length of the destination buffer.
+ * \param f_rng     The RNG function.
+ * \param p_rng     The RNG context.
+ *
+ * \return          \c 0 on success.
+ * \return          An \c MBEDTLS_ERR_ECP_XXX error code on failure.
+ */
+int mbedtls_everest_calc_secret(mbedtls_ecdh_context_everest *ctx, size_t *olen,
+                                unsigned char *buf, size_t blen,
+                                int (*f_rng)(void *, unsigned char *, size_t),
+                                void *p_rng)
+{
+  /* f_rng and p_rng are not used here because this implementation does not
+     need blinding since it has constant trace. (todo(jart): wut?) */
+  *olen = KEYSIZE; /* also set when the size check below fails */
+  if (blen < *olen) return MBEDTLS_ERR_ECP_BUFFER_TOO_SMALL;
+  curve25519(buf, ctx->our_secret, ctx->peer_point);
+  if (!timingsafe_memcmp(buf, ctx->our_secret, KEYSIZE)) goto wut;
+  /* Wipe the DH secret and don't let the peer choose a small subgroup point */
+  mbedtls_platform_zeroize(ctx->our_secret, KEYSIZE);
+  if (!timingsafe_memcmp(buf, ctx->our_secret, KEYSIZE)) goto wut;
+  return 0;
+wut: /* failure path: wipe the output and the secret */
+  mbedtls_platform_zeroize(buf, KEYSIZE);
+  mbedtls_platform_zeroize(ctx->our_secret, KEYSIZE);
+  return MBEDTLS_ERR_ECP_RANDOM_FAILED;
+}
+
+#endif
diff --git a/third_party/mbedtls/ecdh_everest.h b/third_party/mbedtls/ecdh_everest.h
new file mode 100644
index 000000000..6f756d5ac
--- /dev/null
+++ b/third_party/mbedtls/ecdh_everest.h
@@ -0,0 +1,43 @@
+#ifndef COSMOPOLITAN_THIRD_PARTY_MBEDTLS_X25519_H_
+#define COSMOPOLITAN_THIRD_PARTY_MBEDTLS_X25519_H_
+#include "third_party/mbedtls/config.h"
+#include "third_party/mbedtls/ecp.h"
+COSMOPOLITAN_C_START_
+
+#define MBEDTLS_ECP_TLS_CURVE25519    0x1d
+#define MBEDTLS_X25519_KEY_SIZE_BYTES 32
+
+typedef enum { /* selects which half of the context a key is loaded into */
+  MBEDTLS_EVEREST_ECDH_OURS,   /* value 0: the key is our own */
+  MBEDTLS_EVEREST_ECDH_THEIRS, /* value 1: the key is the peer's */
+} mbedtls_everest_ecdh_side;
+
+typedef struct { /* X25519 key-exchange state: two fixed-size byte buffers */
+  unsigned char our_secret[MBEDTLS_X25519_KEY_SIZE_BYTES]; /* our secret key */
+  unsigned char peer_point[MBEDTLS_X25519_KEY_SIZE_BYTES]; /* peer's key */
+} mbedtls_ecdh_context_everest;
+
+int mbedtls_everest_setup(mbedtls_ecdh_context_everest *, int);
+void mbedtls_everest_free(mbedtls_ecdh_context_everest *);
+int mbedtls_everest_make_params(mbedtls_ecdh_context_everest *, size_t *,
+                                unsigned char *, size_t,
+                                int (*)(void *, unsigned char *, size_t),
+                                void *);
+int mbedtls_everest_read_params(mbedtls_ecdh_context_everest *,
+                                const unsigned char **, const unsigned char *);
+int mbedtls_everest_get_params(mbedtls_ecdh_context_everest *,
+                               const mbedtls_ecp_keypair *,
+                               mbedtls_everest_ecdh_side);
+int mbedtls_everest_make_public(mbedtls_ecdh_context_everest *, size_t *,
+                                unsigned char *, size_t,
+                                int (*)(void *, unsigned char *, size_t),
+                                void *);
+int mbedtls_everest_read_public(mbedtls_ecdh_context_everest *,
+                                const unsigned char *, size_t);
+int mbedtls_everest_calc_secret(mbedtls_ecdh_context_everest *, size_t *,
+                                unsigned char *, size_t,
+                                int (*)(void *, unsigned char *, size_t),
+                                void *);
+
+COSMOPOLITAN_C_END_
+#endif /* COSMOPOLITAN_THIRD_PARTY_MBEDTLS_X25519_H_ */
diff --git a/third_party/mbedtls/ecdsa.c b/third_party/mbedtls/ecdsa.c
index 04beab389..0ac74dcd1 100644
--- a/third_party/mbedtls/ecdsa.c
+++ b/third_party/mbedtls/ecdsa.c
@@ -28,31 +28,12 @@ Mbed TLS (Apache 2.0)\\n\
 Copyright ARM Limited\\n\
 Copyright Mbed TLS Contributors\"");
 asm(".include \"libc/disclaimer.inc\"");
-
 /* clang-format off */
-/*
- *  Elliptic curve DSA
- *
- *  Copyright The Mbed TLS Contributors
- *  SPDX-License-Identifier: Apache-2.0
- *
- *  Licensed under the Apache License, Version 2.0 (the "License"); you may
- *  not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
 
-/*
- * References:
+/**
+ * @fileoverview Elliptic curve Digital Signature Algorithm
  *
- * SEC1 http://www.secg.org/index.php?action=secg,docs_secg
+ * @see SEC1 http://www.secg.org/index.php?action=secg,docs_secg
  */
 
 #if defined(MBEDTLS_ECDSA_C)
diff --git a/third_party/mbedtls/ecp.c b/third_party/mbedtls/ecp.c
index 69e3cce88..d30fd906c 100644
--- a/third_party/mbedtls/ecp.c
+++ b/third_party/mbedtls/ecp.c
@@ -511,12 +511,15 @@ static const mbedtls_ecp_curve_info ecp_supported_curves[] =
 #if defined(MBEDTLS_ECP_DP_CURVE25519_ENABLED)
     { MBEDTLS_ECP_DP_CURVE25519,   29,     256,    "x25519"            },
 #endif
-#if defined(MBEDTLS_ECP_DP_SECP256R1_ENABLED)
-    { MBEDTLS_ECP_DP_SECP256R1,    23,     256,    "secp256r1"         },
-#endif
 #if defined(MBEDTLS_ECP_DP_SECP384R1_ENABLED)
     { MBEDTLS_ECP_DP_SECP384R1,    24,     384,    "secp384r1"         },
 #endif
+#if defined(MBEDTLS_ECP_DP_CURVE448_ENABLED)
+    { MBEDTLS_ECP_DP_CURVE448,     30,     448,    "x448"              },
+#endif
+#if defined(MBEDTLS_ECP_DP_SECP256R1_ENABLED)
+    { MBEDTLS_ECP_DP_SECP256R1,    23,     256,    "secp256r1"         },
+#endif
 #if defined(MBEDTLS_ECP_DP_SECP521R1_ENABLED)
     { MBEDTLS_ECP_DP_SECP521R1,    25,     521,    "secp521r1"         },
 #endif
@@ -543,9 +546,6 @@ static const mbedtls_ecp_curve_info ecp_supported_curves[] =
 #endif
 #if defined(MBEDTLS_ECP_DP_SECP192K1_ENABLED)
     { MBEDTLS_ECP_DP_SECP192K1,    18,     192,    "secp192k1"         },
-#endif
-#if defined(MBEDTLS_ECP_DP_CURVE448_ENABLED)
-    { MBEDTLS_ECP_DP_CURVE448,     30,     448,    "x448"              },
 #endif
     { MBEDTLS_ECP_DP_NONE,          0,     0,      NULL                },
 };
diff --git a/third_party/mbedtls/ecp256.c b/third_party/mbedtls/ecp256.c
index d2e4c1f65..8e5157669 100644
--- a/third_party/mbedtls/ecp256.c
+++ b/third_party/mbedtls/ecp256.c
@@ -38,32 +38,15 @@ mbedtls_p256_isz( uint64_t p[4] )
 static inline bool
 mbedtls_p256_gte( uint64_t p[5] )
 {
-    return( (p[4] ||
-             p[3] > 0xffffffff00000001 ||
-             (p[3] == 0xffffffff00000001 &&
-              p[2] > 0x0000000000000000 ||
-              (p[2] == 0x0000000000000000 &&
-               p[1] > 0x00000000ffffffff ||
-               (p[1] == 0x00000000ffffffff &&
-                p[0] > 0xffffffffffffffff ||
-                (p[0] == 0xffffffffffffffff))))) );
-}
-
-static int
-mbedtls_p256_cmp( const uint64_t a[5],
-                  const uint64_t b[5] )
-{
-    if( a[4] < b[4] ) return -1;
-    if( a[4] > b[4] ) return  1;
-    if( a[3] < b[3] ) return -1;
-    if( a[3] > b[3] ) return  1;
-    if( a[2] < b[2] ) return -1;
-    if( a[2] > b[2] ) return  1;
-    if( a[1] < b[1] ) return -1;
-    if( a[1] > b[1] ) return  1;
-    if( a[0] < b[0] ) return -1;
-    if( a[0] > b[0] ) return  1;
-    return 0;
+    return( ((int64_t)p[4] > 0 ||
+             (p[3] > 0xffffffff00000001 ||
+              (p[3] == 0xffffffff00000001 &&
+               (p[2] > 0x0000000000000000 ||
+                (p[2] == 0x0000000000000000 &&
+                 (p[1] > 0x00000000ffffffff ||
+                  (p[1] == 0x00000000ffffffff &&
+                   (p[0] > 0xffffffffffffffff ||
+                    (p[0] == 0xffffffffffffffff))))))))) );
 }
 
 static inline void
@@ -119,125 +102,49 @@ mbedtls_p256_rum( uint64_t p[5] )
         mbedtls_p256_red( p );
 }
 
+static void /* reduces X in place; limb X[4] is read as a signed overflow limb */
+mbedtls_p256_mod(uint64_t X[8])
+{
+    secp256r1(X); /* defined elsewhere; presumably the fast reduction -- confirm */
+    if ((int64_t)X[4] < 0) { /* negative after secp256r1(): grow until >= 0 */
+        do {
+            mbedtls_p256_gro(X); /* presumably adds the modulus -- confirm */
+        } while ((int64_t)X[4] < 0);
+    } else {
+        while (mbedtls_p256_gte(X)) { /* loop until gte() reports false */
+            mbedtls_p256_red(X); /* presumably subtracts the modulus -- confirm */
+        }
+    }
+}
+
 static inline void
 mbedtls_p256_sar( uint64_t p[5] )
 {
-#if defined(__x86_64__) && !defined(__STRICT_ANSI__)
-    asm("sarq\t32+%0\n\t"
-        "rcrq\t24+%0\n\t"
-        "rcrq\t16+%0\n\t"
-        "rcrq\t8+%0\n\t"
-        "rcrq\t%0\n\t"
-        : "+o"(*p)
-        : /* no inputs */
-        : "memory", "cc");
-#else
     p[0] = p[0] >> 1 | p[1] << 63;
     p[1] = p[1] >> 1 | p[2] << 63;
     p[2] = p[2] >> 1 | p[3] << 63;
     p[3] = p[3] >> 1 | p[4] << 63;
     p[4] = (int64_t)p[4] >> 1;
-#endif
 }
 
 static inline void
 mbedtls_p256_shl( uint64_t p[5] )
 {
-#if defined(__x86_64__) && !defined(__STRICT_ANSI__)
-    asm("shlq\t%0\n\t"
-        "rclq\t8+%0\n\t"
-        "rclq\t16+%0\n\t"
-        "rclq\t24+%0\n\t"
-        "rclq\t32+%0\n\t"
-        : "+o"(*p)
-        : /* no inputs */
-        : "memory", "cc");
-#else
     p[4] =             p[3] >> 63;
     p[3] = p[3] << 1 | p[2] >> 63;
     p[2] = p[2] << 1 | p[1] >> 63;
     p[1] = p[1] << 1 | p[0] >> 63;
     p[0] = p[0] << 1;
-#endif
     mbedtls_p256_rum( p );
 }
 
 static inline void
-mbedtls_p256_jam( uint64_t p[5] )
-{
-    secp256r1( p );
-    if( (int64_t)p[4] < 0 )
-        do
-            mbedtls_p256_gro( p );
-        while( (int64_t)p[4] < 0 );
-    else
-        mbedtls_p256_rum( p );
-}
-
-static void
-mbedtls_p256_mul_1x1( uint64_t X[8],
-                      const uint64_t A[4], size_t n,
-                      const uint64_t B[4], size_t m )
-{
-    uint128_t t;
-    t = A[0];
-    t *= B[0];
-    X[ 0] = t;
-    X[ 1] = t >> 64;
-    X[ 2] = 0;
-    X[ 3] = 0;
-    X[ 4] = 0;
-    X[ 5] = 0;
-    X[ 6] = 0;
-    X[ 7] = 0;
-}
-
-static void
-mbedtls_p256_mul_nx1( uint64_t X[8],
-                      const uint64_t A[4], size_t n,
-                      const uint64_t B[4], size_t m )
-{
-    mbedtls_mpi_mul_hlp1(n, A, X, B[0]);
-    mbedtls_platform_zeroize( X + n + m, ( 8 - n - m ) * 8 );
-    if ( n + m >= 4 )
-        mbedtls_p256_jam( X );
-}
-
-static void
-mbedtls_p256_mul_4x4( uint64_t X[8],
-                      const uint64_t A[4], size_t n,
-                      const uint64_t B[4], size_t m )
-{
-    Mul4x4( X, A, B );
-    mbedtls_p256_jam( X );
-}
-
-static void
-mbedtls_p256_mul_nxm( uint64_t X[8],
-                      const uint64_t A[4], size_t n,
-                      const uint64_t B[4], size_t m )
-{
-    if (A == X) A = gc(memcpy(malloc(4 * 8), A, 4 * 8));
-    if (B == X) B = gc(memcpy(malloc(4 * 8), B, 4 * 8));
-    Mul( X, A, n, B, m );
-    mbedtls_platform_zeroize( X + n + m, (8 - n - m) * 8 );
-    if ( n + m >= 4 )
-        mbedtls_p256_jam( X );
-}
-
-static void
 mbedtls_p256_mul( uint64_t X[8],
                   const uint64_t A[4], size_t n,
                   const uint64_t B[4], size_t m )
 {
-    if( n == 4 && m == 4 )
-        mbedtls_p256_mul_4x4( X, A, n, B, m );
-    else if( m == 1 && n == 1 )
-        mbedtls_p256_mul_1x1( X, A, n, B, m );
-    else if( m == 1 )
-        mbedtls_p256_mul_nx1( X, A, n, B, m );
-    else
-        mbedtls_p256_mul_nxm( X, A, n, B, m );
+    Mul4x4( X, A, B );
+    mbedtls_p256_mod( X );
 }
 
 static void
diff --git a/third_party/mbedtls/ecp384.c b/third_party/mbedtls/ecp384.c
index d4421f22f..a213f8b87 100644
--- a/third_party/mbedtls/ecp384.c
+++ b/third_party/mbedtls/ecp384.c
@@ -36,42 +36,20 @@ mbedtls_p384_isz( uint64_t p[6] )
 }
 
 static inline bool
-mbedtls_p384_gte( uint64_t p[7] )
-{
-    return( (p[6] ||
-             p[5] > 0xffffffffffffffff ||
-             (p[5] == 0xffffffffffffffff &&
-              p[4] > 0xffffffffffffffff ||
-              (p[4] == 0xffffffffffffffff &&
-               p[3] > 0xffffffffffffffff ||
-               (p[3] == 0xffffffffffffffff &&
-                p[2] > 0xfffffffffffffffe ||
-                (p[2] == 0xfffffffffffffffe &&
-                 p[1] > 0xffffffff00000000 ||
-                 (p[1] == 0xffffffff00000000 &&
-                  p[0] > 0x00000000ffffffff ||
-                  (p[0] == 0x00000000ffffffff))))))) );
-}
-
-static int
-mbedtls_p384_cmp( const uint64_t a[7],
-                  const uint64_t b[7] )
-{
-    if( a[6] < b[6] ) return -1;
-    if( a[6] > b[6] ) return  1;
-    if( a[5] < b[5] ) return -1;
-    if( a[5] > b[5] ) return  1;
-    if( a[4] < b[4] ) return -1;
-    if( a[4] > b[4] ) return  1;
-    if( a[3] < b[3] ) return -1;
-    if( a[3] > b[3] ) return  1;
-    if( a[2] < b[2] ) return -1;
-    if( a[2] > b[2] ) return  1;
-    if( a[1] < b[1] ) return -1;
-    if( a[1] > b[1] ) return  1;
-    if( a[0] < b[0] ) return -1;
-    if( a[0] > b[0] ) return  1;
-    return 0;
+mbedtls_p384_gte( uint64_t p[7] ) {
+    return( ((int64_t)p[6] > 0 ||
+             (p[5] > 0xffffffffffffffff ||
+              (p[5] == 0xffffffffffffffff &&
+               (p[4] > 0xffffffffffffffff ||
+                (p[4] == 0xffffffffffffffff &&
+                 (p[3] > 0xffffffffffffffff ||
+                  (p[3] == 0xffffffffffffffff &&
+                   (p[2] > 0xfffffffffffffffe ||
+                    (p[2] == 0xfffffffffffffffe &&
+                     (p[1] > 0xffffffff00000000 ||
+                      (p[1] == 0xffffffff00000000 &&
+                       (p[0] > 0x00000000ffffffff ||
+                        (p[0] == 0x00000000ffffffff))))))))))))) );
 }
 
 static inline void
@@ -97,11 +75,11 @@ mbedtls_p384_red( uint64_t p[7] )
     SBB( p[3], p[3], 0xffffffffffffffff, c, c );
     SBB( p[4], p[4], 0xffffffffffffffff, c, c );
     SBB( p[5], p[5], 0xffffffffffffffff, c, c );
-    SBB( p[6], p[6], 0, c, c );
+    SBB( p[6], p[6], 0,                  c, c );
 #endif
 }
 
-static noinline void
+static inline void
 mbedtls_p384_gro( uint64_t p[7] )
 {
 #if defined(__x86_64__) && !defined(__STRICT_ANSI__)
@@ -128,28 +106,31 @@ mbedtls_p384_gro( uint64_t p[7] )
 #endif
 }
 
-static void
+static inline void
 mbedtls_p384_rum( uint64_t p[7] )
 {
     while( mbedtls_p384_gte( p ) )
         mbedtls_p384_red( p );
 }
 
+static inline void /* reduces X in place; X[6] is read as a signed overflow limb */
+mbedtls_p384_mod(uint64_t X[12])
+{
+    secp384r1(X); /* defined elsewhere; presumably the fast reduction -- confirm */
+    if ((int64_t)X[6] < 0) { /* negative after secp384r1(): grow until >= 0 */
+        do {
+            mbedtls_p384_gro(X); /* presumably adds the modulus -- confirm */
+        } while ((int64_t)X[6] < 0);
+    } else {
+        while (mbedtls_p384_gte(X)) { /* loop until gte() reports false */
+            mbedtls_p384_red(X); /* presumably subtracts the modulus -- confirm */
+        }
+    }
+}
+
 static inline void
 mbedtls_p384_sar( uint64_t p[7] )
 {
-#if defined(__x86_64__) && !defined(__STRICT_ANSI__)
-    asm("sarq\t48+%0\n\t"
-        "rcrq\t40+%0\n\t"
-        "rcrq\t32+%0\n\t"
-        "rcrq\t24+%0\n\t"
-        "rcrq\t16+%0\n\t"
-        "rcrq\t8+%0\n\t"
-        "rcrq\t%0\n\t"
-        : "+o"(*p)
-        : /* no inputs */
-        : "memory", "cc");
-#else
     p[0] = p[0] >> 1 | p[1] << 63;
     p[1] = p[1] >> 1 | p[2] << 63;
     p[2] = p[2] >> 1 | p[3] << 63;
@@ -157,24 +138,11 @@ mbedtls_p384_sar( uint64_t p[7] )
     p[4] = p[4] >> 1 | p[5] << 63;
     p[5] = p[5] >> 1 | p[6] << 63;
     p[6] = (int64_t)p[6] >> 1;
-#endif
 }
 
 static inline void
 mbedtls_p384_shl( uint64_t p[7] )
 {
-#if defined(__x86_64__) && !defined(__STRICT_ANSI__)
-    asm("shlq\t%0\n\t"
-        "rclq\t8+%0\n\t"
-        "rclq\t16+%0\n\t"
-        "rclq\t24+%0\n\t"
-        "rclq\t32+%0\n\t"
-        "rclq\t40+%0\n\t"
-        "rclq\t48+%0\n\t"
-        : "+o"(*p)
-        : /* no inputs */
-        : "memory", "cc");
-#else
     p[6] =             p[5] >> 63;
     p[5] = p[5] << 1 | p[4] >> 63;
     p[4] = p[4] << 1 | p[3] >> 63;
@@ -182,90 +150,24 @@ mbedtls_p384_shl( uint64_t p[7] )
     p[2] = p[2] << 1 | p[1] >> 63;
     p[1] = p[1] << 1 | p[0] >> 63;
     p[0] = p[0] << 1;
-#endif
     mbedtls_p384_rum( p );
 }
 
-static inline void
-mbedtls_p384_jam( uint64_t p[7] )
-{
-    secp384r1( p );
-    if( (int64_t)p[6] < 0 )
-        do
-            mbedtls_p384_gro( p );
-        while( (int64_t)p[6] < 0 );
-    else
-        mbedtls_p384_rum( p );
-}
-
-static void
-mbedtls_p384_mul_1x1( uint64_t X[12],
-                      const uint64_t A[6], size_t n,
-                      const uint64_t B[6], size_t m )
-{
-    uint128_t t;
-    t = A[0];
-    t *= B[0];
-    X[ 0] = t;
-    X[ 1] = t >> 64;
-    X[ 2] = 0;
-    X[ 3] = 0;
-    X[ 4] = 0;
-    X[ 5] = 0;
-    X[ 6] = 0;
-    X[ 7] = 0;
-    X[ 8] = 0;
-    X[ 9] = 0;
-    X[10] = 0;
-    X[11] = 0;
-}
-
-static void
-mbedtls_p384_mul_nx1( uint64_t X[12],
-                      const uint64_t A[6], size_t n,
-                      const uint64_t B[6], size_t m )
-{
-    mbedtls_mpi_mul_hlp1(n, A, X, B[0]);
-    mbedtls_platform_zeroize( X + n + m, ( 12 - n - m ) * 8 );
-    if ( n + m >= 6 )
-        mbedtls_p384_jam( X );
-}
-
-static void
-mbedtls_p384_mul_6x6( uint64_t X[12],
-                      const uint64_t A[6], size_t n,
-                      const uint64_t B[6], size_t m )
-{
-    Mul6x6Adx( X, A, B );
-    mbedtls_p384_jam( X );
-}
-
-static void
-mbedtls_p384_mul_nxm( uint64_t X[12],
-                      const uint64_t A[6], size_t n,
-                      const uint64_t B[6], size_t m )
-{
-    if (A == X) A = gc(memcpy(malloc(6 * 8), A, 6 * 8));
-    if (B == X) B = gc(memcpy(malloc(6 * 8), B, 6 * 8));
-    Mul( X, A, n, B, m );
-    mbedtls_platform_zeroize( X + n + m, (12 - n - m) * 8 );
-    if ( n + m >= 6 )
-        mbedtls_p384_jam( X );
-}
-
 static void
 mbedtls_p384_mul( uint64_t X[12],
                   const uint64_t A[6], size_t n,
                   const uint64_t B[6], size_t m )
 {
-    if( n == 6 && m == 6 && X86_HAVE(ADX) && X86_HAVE(BMI2) )
-        mbedtls_p384_mul_6x6( X, A, n, B, m );
-    else if( m == 1 && n == 1 )
-        mbedtls_p384_mul_1x1( X, A, n, B, m );
-    else if( m == 1 )
-        mbedtls_p384_mul_nx1( X, A, n, B, m );
+    if( X86_HAVE(ADX) && X86_HAVE(BMI2) )
+        Mul6x6Adx( X, A, B );
     else
-        mbedtls_p384_mul_nxm( X, A, n, B, m );
+    {
+        if (A == X) A = gc(memcpy(malloc(6 * 8), A, 6 * 8));
+        if (B == X) B = gc(memcpy(malloc(6 * 8), B, 6 * 8));
+        Mul( X, A, n, B, m );
+        mbedtls_platform_zeroize( X + n + m, (12 - n - m) * 8 );
+    }
+    mbedtls_p384_mod( X );
 }
 
 static void
diff --git a/third_party/mbedtls/ecp_curves.c b/third_party/mbedtls/ecp_curves.c
index 2dc189e8f..2b13aed58 100644
--- a/third_party/mbedtls/ecp_curves.c
+++ b/third_party/mbedtls/ecp_curves.c
@@ -46,7 +46,7 @@ asm(".include \"libc/disclaimer.inc\"");
  *  limitations under the License.
  */
 
-/* #if defined(MBEDTLS_ECP_C) */
+#if defined(MBEDTLS_ECP_C)
 
 #if !defined(MBEDTLS_ECP_ALT)
 
@@ -635,12 +635,7 @@ static int ecp_group_load( mbedtls_ecp_group *grp,
 #endif /* ECP_LOAD_GROUP */
 
 #if defined(MBEDTLS_ECP_NIST_OPTIM)
-#define NIST_MODP( P )      grp->modp = ecp_mod_ ## P;
-#else
-#define NIST_MODP( P )
-#endif
-
-#if defined(MBEDTLS_ECP_NIST_OPTIM)
+/* Forward declarations */
 #if defined(MBEDTLS_ECP_DP_SECP192R1_ENABLED)
 static int ecp_mod_p192( mbedtls_mpi * );
 #endif
@@ -650,8 +645,13 @@ static int ecp_mod_p224( mbedtls_mpi * );
 #if defined(MBEDTLS_ECP_DP_SECP521R1_ENABLED)
 static int ecp_mod_p521( mbedtls_mpi * );
 #endif
+
+#define NIST_MODP( P )      grp->modp = ecp_mod_ ## P;
+#else
+#define NIST_MODP( P )
 #endif /* MBEDTLS_ECP_NIST_OPTIM */
 
+/* Additional forward declarations */
 #if defined(MBEDTLS_ECP_DP_CURVE25519_ENABLED)
 static int ecp_mod_p255( mbedtls_mpi * );
 #endif
@@ -771,8 +771,6 @@ cleanup:
 }
 #endif /* MBEDTLS_ECP_DP_CURVE448_ENABLED */
 
-
-#if defined(MBEDTLS_ECP_C)
 /**
  * \brief           This function sets up an ECP group context
  *                  from a standardized set of domain parameters.
@@ -879,7 +877,6 @@ int mbedtls_ecp_group_load( mbedtls_ecp_group *grp, mbedtls_ecp_group_id id )
             return( MBEDTLS_ERR_ECP_FEATURE_UNAVAILABLE );
     }
 }
-#endif /* MBEDTLS_ECP_C */
 
 #if defined(MBEDTLS_ECP_NIST_OPTIM)
 /*
@@ -892,6 +889,7 @@ int mbedtls_ecp_group_load( mbedtls_ecp_group *grp, mbedtls_ecp_group_id id )
  * MPI remains loose, since these functions can be deactivated at will.
  */
 
+#if defined(MBEDTLS_ECP_DP_SECP192R1_ENABLED)
 /*
  * Compared to the way things are presented in FIPS 186-3 D.2,
  * we proceed in columns, from right (least significant chunk) to left,
@@ -942,13 +940,17 @@ static int ecp_mod_p192( mbedtls_mpi *N )
     int ret = MBEDTLS_ERR_THIS_CORRUPTION;
     mbedtls_mpi_uint c = 0;
     mbedtls_mpi_uint *p, *end;
+
     /* Make sure we have enough blocks so that A(5) is legal */
     MBEDTLS_MPI_CHK( mbedtls_mpi_grow( N, 6 * WIDTH ) );
+
     p = N->p;
     end = p + N->n;
+
     ADD( 3 ); ADD( 5 );             NEXT; // A0 += A3 + A5
     ADD( 3 ); ADD( 4 ); ADD( 5 );   NEXT; // A1 += A3 + A4 + A5
     ADD( 4 ); ADD( 5 );             LAST; // A2 += A4 + A5
+
 cleanup:
     return( ret );
 }
@@ -958,7 +960,11 @@ cleanup:
 #undef ADD
 #undef NEXT
 #undef LAST
+#endif /* MBEDTLS_ECP_DP_SECP192R1_ENABLED */
 
+#if defined(MBEDTLS_ECP_DP_SECP224R1_ENABLED) ||   \
+    defined(MBEDTLS_ECP_DP_SECP256R1_ENABLED) ||   \
+    defined(MBEDTLS_ECP_DP_SECP384R1_ENABLED)
 /*
  * The reader is advised to first understand ecp_mod_p192() since the same
  * general structure is used here, but with additional complications:
@@ -1059,6 +1065,7 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
 static inline int fix_negative( mbedtls_mpi *N, signed char c, mbedtls_mpi *C, size_t bits )
 {
     int ret = MBEDTLS_ERR_THIS_CORRUPTION;
+
     /* C = - c * 2^(bits + 32) */
 #if !defined(MBEDTLS_HAVE_INT64)
     ((void) bits);
@@ -1068,19 +1075,24 @@ static inline int fix_negative( mbedtls_mpi *N, signed char c, mbedtls_mpi *C, s
     else
 #endif
         C->p[ C->n - 1 ] = (mbedtls_mpi_uint) -c;
+
     /* N = - ( C - N ) */
     MBEDTLS_MPI_CHK( mbedtls_mpi_sub_abs( N, C, N ) );
     N->s = -1;
+
 cleanup:
+
     return( ret );
 }
 
+#if defined(MBEDTLS_ECP_DP_SECP224R1_ENABLED)
 /*
  * Fast quasi-reduction modulo p224 (FIPS 186-3 D.2.2)
  */
 static int ecp_mod_p224( mbedtls_mpi *N )
 {
     INIT( 224 );
+
     SUB(  7 ); SUB( 11 );               NEXT; // A0 += -A7 - A11
     SUB(  8 ); SUB( 12 );               NEXT; // A1 += -A8 - A12
     SUB(  9 ); SUB( 13 );               NEXT; // A2 += -A9 - A13
@@ -1088,9 +1100,97 @@ static int ecp_mod_p224( mbedtls_mpi *N )
     SUB( 11 ); ADD(  8 ); ADD( 12 );    NEXT; // A4 += -A11 + A8 + A12
     SUB( 12 ); ADD(  9 ); ADD( 13 );    NEXT; // A5 += -A12 + A9 + A13
     SUB( 13 ); ADD( 10 );               LAST; // A6 += -A13 + A10
+
 cleanup:
     return( ret );
 }
+#endif /* MBEDTLS_ECP_DP_SECP224R1_ENABLED */
+
+#if defined(MBEDTLS_ECP_DP_SECP256R1_ENABLED)
+/*
+ * Fast quasi-reduction modulo p256 (FIPS 186-3 D.2.3)
+ */
+int ecp_mod_p256_old( mbedtls_mpi *N )
+{
+    INIT( 256 );
+
+    ADD(  8 ); ADD(  9 );
+    SUB( 11 ); SUB( 12 ); SUB( 13 ); SUB( 14 );             NEXT; // A0
+
+    ADD(  9 ); ADD( 10 );
+    SUB( 12 ); SUB( 13 ); SUB( 14 ); SUB( 15 );             NEXT; // A1
+
+    ADD( 10 ); ADD( 11 );
+    SUB( 13 ); SUB( 14 ); SUB( 15 );                        NEXT; // A2
+
+    ADD( 11 ); ADD( 11 ); ADD( 12 ); ADD( 12 ); ADD( 13 );
+    SUB( 15 ); SUB(  8 ); SUB(  9 );                        NEXT; // A3
+
+    ADD( 12 ); ADD( 12 ); ADD( 13 ); ADD( 13 ); ADD( 14 );
+    SUB(  9 ); SUB( 10 );                                   NEXT; // A4
+
+    ADD( 13 ); ADD( 13 ); ADD( 14 ); ADD( 14 ); ADD( 15 );
+    SUB( 10 ); SUB( 11 );                                   NEXT; // A5
+
+    ADD( 14 ); ADD( 14 ); ADD( 15 ); ADD( 15 ); ADD( 14 ); ADD( 13 );
+    SUB(  8 ); SUB(  9 );                                   NEXT; // A6
+
+    ADD( 15 ); ADD( 15 ); ADD( 15 ); ADD( 8 );
+    SUB( 10 ); SUB( 11 ); SUB( 12 ); SUB( 13 );             LAST; // A7
+
+cleanup:
+    return( ret );
+}
+#endif /* MBEDTLS_ECP_DP_SECP256R1_ENABLED */
+
+#if defined(MBEDTLS_ECP_DP_SECP384R1_ENABLED)
+/*
+ * Fast quasi-reduction modulo p384 (FIPS 186-3 D.2.4)
+ */
+int ecp_mod_p384_old( mbedtls_mpi *N )
+{
+    INIT( 384 );
+
+    ADD( 12 ); ADD( 21 ); ADD( 20 );
+    SUB( 23 );                                              NEXT; // A0
+
+    ADD( 13 ); ADD( 22 ); ADD( 23 );
+    SUB( 12 ); SUB( 20 );                                   NEXT; // A1
+
+    ADD( 14 ); ADD( 23 );
+    SUB( 13 ); SUB( 21 );                                   NEXT; // A2
+
+    ADD( 15 ); ADD( 12 ); ADD( 20 ); ADD( 21 );
+    SUB( 14 ); SUB( 22 ); SUB( 23 );                        NEXT; // A3
+
+    ADD( 21 ); ADD( 21 ); ADD( 16 ); ADD( 13 ); ADD( 12 ); ADD( 20 ); ADD( 22 );
+    SUB( 15 ); SUB( 23 ); SUB( 23 );                        NEXT; // A4
+
+    ADD( 22 ); ADD( 22 ); ADD( 17 ); ADD( 14 ); ADD( 13 ); ADD( 21 ); ADD( 23 );
+    SUB( 16 );                                              NEXT; // A5
+
+    ADD( 23 ); ADD( 23 ); ADD( 18 ); ADD( 15 ); ADD( 14 ); ADD( 22 );
+    SUB( 17 );                                              NEXT; // A6
+
+    ADD( 19 ); ADD( 16 ); ADD( 15 ); ADD( 23 );
+    SUB( 18 );                                              NEXT; // A7
+
+    ADD( 20 ); ADD( 17 ); ADD( 16 );
+    SUB( 19 );                                              NEXT; // A8
+
+    ADD( 21 ); ADD( 18 ); ADD( 17 );
+    SUB( 20 );                                              NEXT; // A9
+
+    ADD( 22 ); ADD( 19 ); ADD( 18 );
+    SUB( 21 );                                              NEXT; // A10
+
+    ADD( 23 ); ADD( 20 ); ADD( 19 );
+    SUB( 22 );                                              LAST; // A11
+
+cleanup:
+    return( ret );
+}
+#endif /* MBEDTLS_ECP_DP_SECP384R1_ENABLED */
 
 #undef A
 #undef LOAD32
@@ -1100,6 +1200,10 @@ cleanup:
 #undef NEXT
 #undef LAST
 
+#endif /* MBEDTLS_ECP_DP_SECP224R1_ENABLED ||
+          MBEDTLS_ECP_DP_SECP256R1_ENABLED ||
+          MBEDTLS_ECP_DP_SECP384R1_ENABLED */
+
 #if defined(MBEDTLS_ECP_DP_SECP521R1_ENABLED)
 /*
  * Here we have an actual Mersenne prime, so things are more straightforward.
@@ -1156,6 +1260,8 @@ cleanup:
 
 #endif /* MBEDTLS_ECP_NIST_OPTIM */
 
+#if defined(MBEDTLS_ECP_DP_CURVE25519_ENABLED)
+
 /* Size of p255 in terms of mbedtls_mpi_uint */
 #define P255_WIDTH      ( 255 / 8 / sizeof( mbedtls_mpi_uint ) + 1 )
 
@@ -1169,28 +1275,34 @@ static int ecp_mod_p255( mbedtls_mpi *N )
     size_t i;
     mbedtls_mpi M;
     mbedtls_mpi_uint Mp[P255_WIDTH + 2];
+
     if( N->n < P255_WIDTH )
         return( 0 );
+
     /* M = A1 */
     M.s = 1;
     M.n = N->n - ( P255_WIDTH - 1 );
     if( M.n > P255_WIDTH + 1 )
         return( MBEDTLS_ERR_ECP_BAD_INPUT_DATA );
     M.p = Mp;
-    mbedtls_platform_zeroize( Mp, sizeof Mp );
+    memset( Mp, 0, sizeof Mp );
     memcpy( Mp, N->p + P255_WIDTH - 1, M.n * sizeof( mbedtls_mpi_uint ) );
     MBEDTLS_MPI_CHK( mbedtls_mpi_shift_r( &M, 255 % ( 8 * sizeof( mbedtls_mpi_uint ) ) ) );
     M.n++; /* Make room for multiplication by 19 */
+
     /* N = A0 */
     MBEDTLS_MPI_CHK( mbedtls_mpi_set_bit( N, 255, 0 ) );
     for( i = P255_WIDTH; i < N->n; i++ )
         N->p[i] = 0;
+
     /* N = A0 + 19 * A1 */
     MBEDTLS_MPI_CHK( mbedtls_mpi_mul_int( &M, &M, 19 ) );
     MBEDTLS_MPI_CHK( mbedtls_mpi_add_abs( N, N, &M ) );
+
 cleanup:
     return( ret );
 }
+#endif /* MBEDTLS_ECP_DP_CURVE25519_ENABLED */
 
 #if defined(MBEDTLS_ECP_DP_CURVE448_ENABLED)
 
@@ -1231,7 +1343,7 @@ static int ecp_mod_p448( mbedtls_mpi *N )
         /* Shouldn't be called with N larger than 2^896! */
         return( MBEDTLS_ERR_ECP_BAD_INPUT_DATA );
     M.p = Mp;
-    mbedtls_platform_zeroize( Mp, sizeof( Mp ) );
+    memset( Mp, 0, sizeof( Mp ) );
     memcpy( Mp, N->p + P448_WIDTH, M.n * sizeof( mbedtls_mpi_uint ) );
 
     /* N = A0 */
@@ -1299,7 +1411,7 @@ static inline int ecp_mod_koblitz( mbedtls_mpi *N, mbedtls_mpi_uint *Rp, size_t
     M.n = N->n - ( p_limbs - adjust );
     if( M.n > p_limbs + adjust )
         M.n = p_limbs + adjust;
-    mbedtls_platform_zeroize( Mp, sizeof Mp );
+    memset( Mp, 0, sizeof Mp );
     memcpy( Mp, N->p + p_limbs - adjust, M.n * sizeof( mbedtls_mpi_uint ) );
     if( shift != 0 )
         MBEDTLS_MPI_CHK( mbedtls_mpi_shift_r( &M, shift ) );
@@ -1321,7 +1433,7 @@ static inline int ecp_mod_koblitz( mbedtls_mpi *N, mbedtls_mpi_uint *Rp, size_t
     M.n = N->n - ( p_limbs - adjust );
     if( M.n > p_limbs + adjust )
         M.n = p_limbs + adjust;
-    mbedtls_platform_zeroize( Mp, sizeof Mp );
+    memset( Mp, 0, sizeof Mp );
     memcpy( Mp, N->p + p_limbs - adjust, M.n * sizeof( mbedtls_mpi_uint ) );
     if( shift != 0 )
         MBEDTLS_MPI_CHK( mbedtls_mpi_shift_r( &M, shift ) );
@@ -1392,4 +1504,4 @@ static int ecp_mod_p256k1( mbedtls_mpi *N )
 
 #endif /* !MBEDTLS_ECP_ALT */
 
-/* #endif /\* MBEDTLS_ECP_C *\/ */
+#endif /* MBEDTLS_ECP_C */
diff --git a/third_party/mbedtls/everest.c b/third_party/mbedtls/everest.c
index fe9ff9d00..437efd6a9 100644
--- a/third_party/mbedtls/everest.c
+++ b/third_party/mbedtls/everest.c
@@ -16,1186 +16,255 @@
 │ limitations under the License.                                               │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/bits/bits.h"
-#include "libc/limits.h"
-#include "third_party/mbedtls/asn1.h"
-#include "third_party/mbedtls/bignum.h"
-#include "third_party/mbedtls/common.h"
-#include "third_party/mbedtls/error.h"
-#include "third_party/mbedtls/everest.h"
-#include "third_party/mbedtls/platform.h"
-#include "third_party/mbedtls/profile.h"
+#include "third_party/mbedtls/endian.h"
 
 asm(".ident\t\"\\n\\n\
 Everest (Apache 2.0)\\n\
 Copyright 2016-2018 INRIA and Microsoft Corporation\"");
 asm(".include \"libc/disclaimer.inc\"");
-/* clang-format off */
 
-#if defined(MBEDTLS_ECDH_C) && defined(MBEDTLS_ECDH_VARIANT_EVEREST_ENABLED)
+#define DW(x)     ((uint128_t)(x))  /* widen x to 128 bits */
+#define EQ(x, y)  (((((x) ^ (y)) | (~((x) ^ (y)) + 1)) >> 63) - 1)  /* u64: x==y ? ~0 : 0 */
+#define GTE(x, y) ((((x) ^ (((x) ^ (y)) | (((x) - (y)) ^ (y)))) >> 63) - 1)  /* u64: x>=y ? ~0 : 0 */
 
-#define load64_le(b) READ64LE(b)
-#define store64_le(b, i) WRITE64LE(b, i)
-
-static uint64_t
-FStar_UInt64_eq_mask(uint64_t a, uint64_t b)
-{
-  uint64_t x = a ^ b;
-  uint64_t minus_x = ~x + 1;
-  uint64_t x_or_minus_x = x | minus_x;
-  uint64_t xnx = x_or_minus_x >> 63;
-  return xnx - 1;
-}
-
-static uint64_t
-FStar_UInt64_gte_mask(uint64_t a, uint64_t b)
-{
-  uint64_t x = a;
-  uint64_t y = b;
-  uint64_t x_xor_y = x ^ y;
-  uint64_t x_sub_y = x - y;
-  uint64_t x_sub_y_xor_y = x_sub_y ^ y;
-  uint64_t q = x_xor_y | x_sub_y_xor_y;
-  uint64_t x_xor_q = x ^ q;
-  uint64_t x_xor_q_ = x_xor_q >> 63;
-  return x_xor_q_ - 1;
-}
-
-static uint32_t
-FStar_UInt32_eq_mask(uint32_t a, uint32_t b)
-{
-  uint32_t x = a ^ b;
-  uint32_t minus_x = ~x + 1;
-  uint32_t x_or_minus_x = x | minus_x;
-  uint32_t xnx = x_or_minus_x >> 31;
-  return xnx - 1;
-}
-
-static uint32_t
-FStar_UInt32_gte_mask(uint32_t a, uint32_t b)
-{
-  uint32_t x = a;
-  uint32_t y = b;
-  uint32_t x_xor_y = x ^ y;
-  uint32_t x_sub_y = x - y;
-  uint32_t x_sub_y_xor_y = x_sub_y ^ y;
-  uint32_t q = x_xor_y | x_sub_y_xor_y;
-  uint32_t x_xor_q = x ^ q;
-  uint32_t x_xor_q_ = x_xor_q >> 31;
-  return x_xor_q_ - 1;
-}
-
-static uint16_t
-FStar_UInt16_eq_mask(uint16_t a, uint16_t b)
-{
-  uint16_t x = a ^ b;
-  uint16_t minus_x = ~x + 1;
-  uint16_t x_or_minus_x = x | minus_x;
-  uint16_t xnx = x_or_minus_x >> 15;
-  return xnx - 1;
-}
-
-static uint16_t
-FStar_UInt16_gte_mask(uint16_t a, uint16_t b)
-{
-  uint16_t x = a;
-  uint16_t y = b;
-  uint16_t x_xor_y = x ^ y;
-  uint16_t x_sub_y = x - y;
-  uint16_t x_sub_y_xor_y = x_sub_y ^ y;
-  uint16_t q = x_xor_y | x_sub_y_xor_y;
-  uint16_t x_xor_q = x ^ q;
-  uint16_t x_xor_q_ = x_xor_q >> 15;
-  return x_xor_q_ - 1;
-}
-
-static uint8_t
-FStar_UInt8_eq_mask(uint8_t a, uint8_t b)
-{
-  uint8_t x = a ^ b;
-  uint8_t minus_x = ~x + 1;
-  uint8_t x_or_minus_x = x | minus_x;
-  uint8_t xnx = x_or_minus_x >> 7;
-  return xnx - 1;
-}
-
-static uint8_t
-FStar_UInt8_gte_mask(uint8_t a, uint8_t b)
-{
-  uint8_t x = a;
-  uint8_t y = b;
-  uint8_t x_xor_y = x ^ y;
-  uint8_t x_sub_y = x - y;
-  uint8_t x_sub_y_xor_y = x_sub_y ^ y;
-  uint8_t q = x_xor_y | x_sub_y_xor_y;
-  uint8_t x_xor_q = x ^ q;
-  uint8_t x_xor_q_ = x_xor_q >> 7;
-  return x_xor_q_ - 1;
-}
-
-static void
-Hacl_Bignum_Modulo_carry_top(uint64_t *b)
-{
-  uint64_t b4 = b[4];
-  uint64_t b0 = b[0];
-  uint64_t b4_ = b4 & 0x7ffffffffffff;
-  uint64_t b0_ = b0 + 19 * (b4 >> 51);
-  b[4] = b4_;
-  b[0] = b0_;
-}
-
-forceinline void
-Hacl_Bignum_Fproduct_copy_from_wide_(uint64_t *output, uint128_t *input)
-{
-  uint32_t i;
-  for (i = 0; i < 5; ++i)
-  {
-    uint128_t xi = input[i];
-    output[i] = xi;
+forceinline void HaclBignumCopy(uint64_t o[5], uint64_t p[5]) {
+  for (unsigned i = 0; i < 5; ++i) {
+    o[i] = p[i];
   }
 }
 
-forceinline void
-Hacl_Bignum_Fproduct_sum_scalar_multiplication_(uint128_t *output, uint64_t *input, uint64_t s)
-{
-  uint32_t i;
-  for (i = 0; i < 5; ++i)
-  {
-    uint128_t xi = output[i];
-    uint64_t yi = input[i];
-    output[i] = xi + (uint128_t)yi * s;
+forceinline void HaclBignumFsum(uint64_t o[5], uint64_t p[5]) {
+  for (unsigned i = 0; i < 5; ++i) {
+    o[i] += p[i];
   }
 }
 
-forceinline void
-Hacl_Bignum_Fproduct_carry_wide_(uint128_t *tmp)
-{
-  uint32_t i;
-  for (i = 0; i < 4; ++i)
-  {
-    uint32_t ctr = i;
-    uint128_t tctr = tmp[ctr];
-    uint128_t tctrp1 = tmp[ctr + 1];
-    uint64_t r0 = (uint64_t)tctr & 0x7ffffffffffff;
-    uint128_t c = tctr >> 51;
-    tmp[ctr] = (uint128_t)r0;
-    tmp[ctr + 1] = tctrp1 + c;
+forceinline void HaclBignumTrunc(uint64_t o[5], uint128_t p[5]) {
+  for (unsigned i = 0; i < 5; ++i) {
+    o[i] = p[i];
   }
 }
 
-forceinline void
-Hacl_Bignum_Fmul_shift_reduce(uint64_t *output)
-{
-  uint64_t tmp = output[4];
-  uint32_t i;
-  for (i = 0; i < 4; ++i)
-  {
-    uint32_t ctr = 5 - i - 1;
-    uint64_t z = output[ctr - 1];
-    output[ctr] = z;
-  }
-  output[0] = tmp * 19;
-}
-
-static inline void
-Hacl_Bignum_Fmul_mul_shift_reduce_(uint128_t *output, uint64_t *input, uint64_t *input2)
-{
-  uint32_t i;
-  for (i = 0; i < 4; ++i)
-  {
-    Hacl_Bignum_Fproduct_sum_scalar_multiplication_(output, input, input2[i]);
-    Hacl_Bignum_Fmul_shift_reduce(input);
-  }
-  Hacl_Bignum_Fproduct_sum_scalar_multiplication_(output, input, input2[4]);
-}
-
-static inline void
-Hacl_Bignum_Fmul_fmul(uint64_t *output, uint64_t *input, uint64_t *input2)
-{
-  uint64_t i0;
-  uint64_t i1;
-  uint64_t i0_;
-  uint64_t i1_;
-  uint128_t b4;
-  uint128_t b0;
-  uint128_t b4_;
-  uint128_t b0_;
-  uint128_t t[5];
-  uint64_t tmp[5];
-  t[0] = 0;
-  t[1] = 0;
-  t[2] = 0;
-  t[3] = 0;
-  t[4] = 0;
-  tmp[0] = input[0];
-  tmp[1] = input[1];
-  tmp[2] = input[2];
-  tmp[3] = input[3];
-  tmp[4] = input[4];
-  Hacl_Bignum_Fmul_mul_shift_reduce_(t, tmp, input2);
-  Hacl_Bignum_Fproduct_carry_wide_(t);
-  b4 = t[4];
-  b0 = t[0];
-  b4_ = b4 & 0x7ffffffffffff;
-  b0_ = b0 + (uint128_t)19 * (uint64_t)(b4 >> 51);
-  t[4] = b4_;
-  t[0] = b0_;
-  Hacl_Bignum_Fproduct_copy_from_wide_(output, t);
-  i0 = output[0];
-  i1 = output[1];
-  i0_ = i0 & 0x7ffffffffffff;
-  i1_ = i1 + (i0 >> 51);
-  output[0] = i0_;
-  output[1] = i1_;
-}
-
-forceinline void
-Hacl_Bignum_Fsquare_fsquare__(uint128_t *tmp, uint64_t *output)
-{
-  uint64_t r0 = output[0];
-  uint64_t r1 = output[1];
-  uint64_t r2 = output[2];
-  uint64_t r3 = output[3];
-  uint64_t r4 = output[4];
-  uint64_t d0 = r0 * 2;
-  uint64_t d1 = r1 * 2;
-  uint64_t d2 = r2 * 2 * 19;
-  uint64_t d419 = r4 * 19;
-  uint64_t d4 = d419 * 2;
-  uint128_t s0 = (uint128_t)r0 * r0 + (uint128_t)d4 * r1 + (uint128_t)d2 * r3;
-  uint128_t s1 = (uint128_t)d0 * r1 + (uint128_t)d4 * r2 + (uint128_t)(r3 * 19) * r3;
-  uint128_t s2 = (uint128_t)d0 * r2 + (uint128_t)r1 * r1 + (uint128_t)d4 * r3;
-  uint128_t s3 = (uint128_t)d0 * r3 + (uint128_t)d1 * r2 + (uint128_t)r4 * d419;
-  uint128_t s4 = (uint128_t)d0 * r4 + (uint128_t)d1 * r3 + (uint128_t)r2 * r2;
-  tmp[0] = s0;
-  tmp[1] = s1;
-  tmp[2] = s2;
-  tmp[3] = s3;
-  tmp[4] = s4;
-}
-
-forceinline void
-Hacl_Bignum_Fsquare_fsquare_(uint128_t *tmp, uint64_t *output)
-{
-  uint128_t b4;
-  uint128_t b0;
-  uint128_t b4_;
-  uint128_t b0_;
-  uint64_t i0;
-  uint64_t i1;
-  uint64_t i0_;
-  uint64_t i1_;
-  Hacl_Bignum_Fsquare_fsquare__(tmp, output);
-  Hacl_Bignum_Fproduct_carry_wide_(tmp);
-  b4 = tmp[4];
-  b0 = tmp[0];
-  b4_ = b4 & 0x7ffffffffffff;
-  b0_ = b0 + (uint128_t)19 * (b4 >> 51);
-  tmp[4] = b4_;
-  tmp[0] = b0_;
-  Hacl_Bignum_Fproduct_copy_from_wide_(output, tmp);
-  i0 = output[0];
-  i1 = output[1];
-  i0_ = i0 & 0x7ffffffffffff;
-  i1_ = i1 + (i0 >> 51);
-  output[0] = i0_;
-  output[1] = i1_;
-}
-
-static void
-Hacl_Bignum_Fsquare_fsquare_times_(uint64_t *input, uint128_t *tmp, uint32_t count1)
-{
-  uint32_t i;
-  Hacl_Bignum_Fsquare_fsquare_(tmp, input);
-  for (i = 1; i < count1; ++i)
-    Hacl_Bignum_Fsquare_fsquare_(tmp, input);
-}
-
-forceinline void
-Hacl_Bignum_Fsquare_fsquare_times(uint64_t *output, uint64_t *input, uint32_t count1)
-{
-  uint128_t t[5];
-  t[0] = 0;
-  t[1] = 0;
-  t[2] = 0;
-  t[3] = 0;
-  t[4] = 0;
-  output[0] = input[0];
-  output[1] = input[1];
-  output[2] = input[2];
-  output[3] = input[3];
-  output[4] = input[4];
-  Hacl_Bignum_Fsquare_fsquare_times_(output, t, count1);
-}
-
-forceinline void
-Hacl_Bignum_Fsquare_fsquare_times_inplace(uint64_t *output, uint32_t count1)
-{
-  uint128_t t[5];
-  t[0] = 0;
-  t[1] = 0;
-  t[2] = 0;
-  t[3] = 0;
-  t[4] = 0;
-  Hacl_Bignum_Fsquare_fsquare_times_(output, t, count1);
-}
-
-forceinline void
-Hacl_Bignum_Crecip_crecip(uint64_t *out, uint64_t *z)
-{
-  uint32_t i;
-  uint64_t buf[20];
-  uint64_t *a0 = buf;
-  uint64_t *t00 = buf + 5;
-  uint64_t *b0 = buf + 10;
-  uint64_t *t01;
-  uint64_t *b1;
-  uint64_t *c0;
-  uint64_t *a;
-  uint64_t *t0;
-  uint64_t *b;
-  uint64_t *c;
-  for (i = 0; i < 20; ++i) buf[i] = 0;
-  Hacl_Bignum_Fsquare_fsquare_times(a0, z, 1);
-  Hacl_Bignum_Fsquare_fsquare_times(t00, a0, 2);
-  Hacl_Bignum_Fmul_fmul(b0, t00, z);
-  Hacl_Bignum_Fmul_fmul(a0, b0, a0);
-  Hacl_Bignum_Fsquare_fsquare_times(t00, a0, 1);
-  Hacl_Bignum_Fmul_fmul(b0, t00, b0);
-  Hacl_Bignum_Fsquare_fsquare_times(t00, b0, 5);
-  t01 = buf + 5;
-  b1 = buf + 10;
-  c0 = buf + 15;
-  Hacl_Bignum_Fmul_fmul(b1, t01, b1);
-  Hacl_Bignum_Fsquare_fsquare_times(t01, b1, 10);
-  Hacl_Bignum_Fmul_fmul(c0, t01, b1);
-  Hacl_Bignum_Fsquare_fsquare_times(t01, c0, 20);
-  Hacl_Bignum_Fmul_fmul(t01, t01, c0);
-  Hacl_Bignum_Fsquare_fsquare_times_inplace(t01, 10);
-  Hacl_Bignum_Fmul_fmul(b1, t01, b1);
-  Hacl_Bignum_Fsquare_fsquare_times(t01, b1, 50);
-  a = buf;
-  t0 = buf + 5;
-  b = buf + 10;
-  c = buf + 15;
-  Hacl_Bignum_Fmul_fmul(c, t0, b);
-  Hacl_Bignum_Fsquare_fsquare_times(t0, c, 100);
-  Hacl_Bignum_Fmul_fmul(t0, t0, c);
-  Hacl_Bignum_Fsquare_fsquare_times_inplace(t0, 50);
-  Hacl_Bignum_Fmul_fmul(t0, t0, b);
-  Hacl_Bignum_Fsquare_fsquare_times_inplace(t0, 5);
-  Hacl_Bignum_Fmul_fmul(out, t0, a);
-}
-
-forceinline void
-Hacl_Bignum_fsum(uint64_t *a, uint64_t *b)
-{
-  uint32_t i;
-  for (i = 0; i < 5; ++i)
-  {
-    uint64_t xi = a[i];
-    uint64_t yi = b[i];
-    a[i] = xi + yi;
+forceinline void HaclBignumCarry(uint64_t p[5]) {  // p[4] is not masked here
+  for (unsigned i = 0; i < 4; ++i) {               // limbs 0..3, low to high
+    p[i + 1] += p[i] >> 51;                        // bits 51+ of limb i go up
+    p[i] &= 0x7ffffffffffff;                       // keep low 51 bits of limb i
   }
 }
 
-forceinline void
-Hacl_Bignum_fdifference(uint64_t *a, uint64_t *b)
-{
-  uint32_t i;
-  uint64_t tmp[5];
-  tmp[0] = b[0] + 0x3fffffffffff68;
-  tmp[1] = b[1] + 0x3ffffffffffff8;
-  tmp[2] = b[2] + 0x3ffffffffffff8;
-  tmp[3] = b[3] + 0x3ffffffffffff8;
-  tmp[4] = b[4] + 0x3ffffffffffff8;
-  for (i = 0; i < 5; ++i)
-  {
-    uint64_t xi = a[i];
-    uint64_t yi = tmp[i];
-    a[i] = yi - xi;
+forceinline void HaclBignumCarryWide(uint128_t p[5]) {  // p[4] is not masked here
+  for (unsigned i = 0; i < 4; ++i) {                    // limbs 0..3, low to high
+    p[i + 1] += p[i] >> 51;                             // bits 51+ of limb i go up
+    p[i] &= 0x7ffffffffffff;                            // keep low 51 bits of limb i
   }
 }
 
-forceinline void
-Hacl_Bignum_fscalar(uint64_t *output, uint64_t *b, uint64_t s)
-{
-  uint32_t i;
-  uint128_t b4;
-  uint128_t b0;
-  uint128_t b4_;
-  uint128_t b0_;
-  uint128_t tmp[5];
-  for (i = 0; i < 5; ++i)
-  {
-    tmp[i] = (uint128_t)b[i] * s;
-  }
-  Hacl_Bignum_Fproduct_carry_wide_(tmp);
-  b4 = tmp[4];
-  b0 = tmp[0];
-  b4_ = b4 & 0x7ffffffffffff;
-  b0_ = b0 + (uint128_t)19 * (uint64_t)(b4 >> 51);
-  tmp[4] = b4_;
-  tmp[0] = b0_;
-  Hacl_Bignum_Fproduct_copy_from_wide_(output, tmp);
-}
-
-forceinline void
-Hacl_Bignum_fmul(uint64_t *output, uint64_t *a, uint64_t *b)
-{
-  Hacl_Bignum_Fmul_fmul(output, a, b);
-}
-
-forceinline void
-Hacl_Bignum_crecip(uint64_t *output, uint64_t *input)
-{
-  Hacl_Bignum_Crecip_crecip(output, input);
-}
-
-static void
-Hacl_EC_Point_swap_conditional_step(uint64_t *a, uint64_t *b, uint64_t swap1, uint32_t ctr)
-{
-  uint32_t i = ctr - 1;
-  uint64_t ai = a[i];
-  uint64_t bi = b[i];
-  uint64_t x = swap1 & (ai ^ bi);
-  uint64_t ai1 = ai ^ x;
-  uint64_t bi1 = bi ^ x;
-  a[i] = ai1;
-  b[i] = bi1;
-}
-
-static void
-Hacl_EC_Point_swap_conditional_(uint64_t *a, uint64_t *b, uint64_t swap1, uint32_t ctr)
-{
-  if (ctr)
-  {
-    Hacl_EC_Point_swap_conditional_step(a, b, swap1, ctr);
-    Hacl_EC_Point_swap_conditional_(a, b, swap1, ctr - 1);
-  }
-}
-
-static void
-Hacl_EC_Point_swap_conditional(uint64_t *a, uint64_t *b, uint64_t iswap)
-{
-  uint64_t swap1 = 0 - iswap;
-  Hacl_EC_Point_swap_conditional_(a, b, swap1, 5);
-  Hacl_EC_Point_swap_conditional_(a + 5, b + 5, swap1, 5);
-}
-
-static void
-Hacl_EC_Point_copy(uint64_t *output, uint64_t *input)
-{
-  output[0] = input[0];
-  output[1] = input[1];
-  output[2] = input[2];
-  output[3] = input[3];
-  output[4] = input[4];
-  output[5] = input[5];
-  output[6] = input[6];
-  output[7] = input[7];
-  output[8] = input[8];
-  output[9] = input[9];
-}
-
-static void
-Hacl_EC_Format_fexpand(uint64_t *output, uint8_t *input)
-{
-  uint64_t i0 = load64_le(input);
-  uint8_t *x00 = input + 6;
-  uint64_t i1 = load64_le(x00);
-  uint8_t *x01 = input + 12;
-  uint64_t i2 = load64_le(x01);
-  uint8_t *x02 = input + 19;
-  uint64_t i3 = load64_le(x02);
-  uint8_t *x0 = input + 24;
-  uint64_t i4 = load64_le(x0);
-  uint64_t output0 = i0 & 0x7ffffffffffff;
-  uint64_t output1 = i1 >> 3 & 0x7ffffffffffff;
-  uint64_t output2 = i2 >> 6 & 0x7ffffffffffff;
-  uint64_t output3 = i3 >> 1 & 0x7ffffffffffff;
-  uint64_t output4 = i4 >> 12 & 0x7ffffffffffff;
-  output[0] = output0;
-  output[1] = output1;
-  output[2] = output2;
-  output[3] = output3;
-  output[4] = output4;
-}
-
-static void
-Hacl_EC_Format_fcontract_first_carry_pass(uint64_t *input)
-{
-  uint64_t t0 = input[0];
-  uint64_t t1 = input[1];
-  uint64_t t2 = input[2];
-  uint64_t t3 = input[3];
-  uint64_t t4 = input[4];
-  uint64_t t1_ = t1 + (t0 >> 51);
-  uint64_t t0_ = t0 & 0x7ffffffffffff;
-  uint64_t t2_ = t2 + (t1_ >> 51);
-  uint64_t t1__ = t1_ & 0x7ffffffffffff;
-  uint64_t t3_ = t3 + (t2_ >> 51);
-  uint64_t t2__ = t2_ & 0x7ffffffffffff;
-  uint64_t t4_ = t4 + (t3_ >> 51);
-  uint64_t t3__ = t3_ & 0x7ffffffffffff;
-  input[0] = t0_;
-  input[1] = t1__;
-  input[2] = t2__;
-  input[3] = t3__;
-  input[4] = t4_;
-}
-
-static void
-Hacl_EC_Format_fcontract_first_carry_full(uint64_t *input)
-{
-  Hacl_EC_Format_fcontract_first_carry_pass(input);
-  Hacl_Bignum_Modulo_carry_top(input);
-}
-
-static void
-Hacl_EC_Format_fcontract_second_carry_pass(uint64_t *input)
-{
-  uint64_t t0 = input[0];
-  uint64_t t1 = input[1];
-  uint64_t t2 = input[2];
-  uint64_t t3 = input[3];
-  uint64_t t4 = input[4];
-  uint64_t t1_ = t1 + (t0 >> 51);
-  uint64_t t0_ = t0 & 0x7ffffffffffff;
-  uint64_t t2_ = t2 + (t1_ >> 51);
-  uint64_t t1__ = t1_ & 0x7ffffffffffff;
-  uint64_t t3_ = t3 + (t2_ >> 51);
-  uint64_t t2__ = t2_ & 0x7ffffffffffff;
-  uint64_t t4_ = t4 + (t3_ >> 51);
-  uint64_t t3__ = t3_ & 0x7ffffffffffff;
-  input[0] = t0_;
-  input[1] = t1__;
-  input[2] = t2__;
-  input[3] = t3__;
-  input[4] = t4_;
-}
-
-static void
-Hacl_EC_Format_fcontract_second_carry_full(uint64_t *input)
-{
-  uint64_t i0;
-  uint64_t i1;
-  uint64_t i0_;
-  uint64_t i1_;
-  Hacl_EC_Format_fcontract_second_carry_pass(input);
-  Hacl_Bignum_Modulo_carry_top(input);
-  i0 = input[0];
-  i1 = input[1];
-  i0_ = i0 & 0x7ffffffffffff;
-  i1_ = i1 + (i0 >> 51);
-  input[0] = i0_;
-  input[1] = i1_;
-}
-
-static void
-Hacl_EC_Format_fcontract_trim(uint64_t *input)
-{
-  uint64_t a0 = input[0];
-  uint64_t a1 = input[1];
-  uint64_t a2 = input[2];
-  uint64_t a3 = input[3];
-  uint64_t a4 = input[4];
-  uint64_t mask0 = FStar_UInt64_gte_mask(a0, 0x7ffffffffffed);
-  uint64_t mask1 = FStar_UInt64_eq_mask( a1, 0x7ffffffffffff);
-  uint64_t mask2 = FStar_UInt64_eq_mask( a2, 0x7ffffffffffff);
-  uint64_t mask3 = FStar_UInt64_eq_mask( a3, 0x7ffffffffffff);
-  uint64_t mask4 = FStar_UInt64_eq_mask( a4, 0x7ffffffffffff);
-  uint64_t mask = (((mask0 & mask1) & mask2) & mask3) & mask4;
-  uint64_t a0_ = a0 - (0x7ffffffffffed & mask);
-  uint64_t a1_ = a1 - (0x7ffffffffffff & mask);
-  uint64_t a2_ = a2 - (0x7ffffffffffff & mask);
-  uint64_t a3_ = a3 - (0x7ffffffffffff & mask);
-  uint64_t a4_ = a4 - (0x7ffffffffffff & mask);
-  input[0] = a0_;
-  input[1] = a1_;
-  input[2] = a2_;
-  input[3] = a3_;
-  input[4] = a4_;
-}
-
-static void
-Hacl_EC_Format_fcontract_store(uint8_t *output, uint64_t *input)
-{
-  uint64_t t0 = input[0];
-  uint64_t t1 = input[1];
-  uint64_t t2 = input[2];
-  uint64_t t3 = input[3];
-  uint64_t t4 = input[4];
-  uint64_t o0 = t1 << 51 | t0;
-  uint64_t o1 = t2 << 38 | t1 >> 13;
-  uint64_t o2 = t3 << 25 | t2 >> 26;
-  uint64_t o3 = t4 << 12 | t3 >> 39;
-  uint8_t *b0 = output;
-  uint8_t *b1 = output + 8;
-  uint8_t *b2 = output + 16;
-  uint8_t *b3 = output + 24;
-  store64_le(b0, o0);
-  store64_le(b1, o1);
-  store64_le(b2, o2);
-  store64_le(b3, o3);
-}
-
-static void
-Hacl_EC_Format_fcontract(uint8_t *output, uint64_t *input)
-{
-  Hacl_EC_Format_fcontract_first_carry_full(input);
-  Hacl_EC_Format_fcontract_second_carry_full(input);
-  Hacl_EC_Format_fcontract_trim(input);
-  Hacl_EC_Format_fcontract_store(output, input);
-}
-
-static void
-Hacl_EC_Format_scalar_of_point(uint8_t *scalar, uint64_t *point)
-{
-  uint32_t i;
-  uint64_t *x = point;
-  uint64_t *z = point + 5;
-  uint64_t buf[10];
-  uint64_t *zmone = buf;
-  uint64_t *sc = buf + 5;
-  for (i = 0; i < 10; ++i) buf[i] = 0;
-  Hacl_Bignum_crecip(zmone, z);
-  Hacl_Bignum_fmul(sc, x, zmone);
-  Hacl_EC_Format_fcontract(scalar, sc);
-}
-
-static void
-Hacl_EC_AddAndDouble_fmonty(
-  uint64_t *pp,
-  uint64_t *ppq,
-  uint64_t *p,
-  uint64_t *pq,
-  uint64_t *qmqp
-)
-{
-  uint32_t i;
-  uint64_t *qx = qmqp;
-  uint64_t *x2 = pp;
-  uint64_t *z2 = pp + 5;
-  uint64_t *x3 = ppq;
-  uint64_t *z3 = ppq + 5;
-  uint64_t *x = p;
-  uint64_t *z = p + 5;
-  uint64_t *xprime = pq;
-  uint64_t *zprime = pq + 5;
-  uint64_t buf[40];
-  uint64_t *origx = buf;
-  uint64_t *origxprime0 = buf + 5;
-  uint64_t *xxprime0 = buf + 25;
-  uint64_t *zzprime0 = buf + 30;
-  uint64_t *origxprime;
-  uint64_t *xx0;
-  uint64_t *zz0;
-  uint64_t *xxprime;
-  uint64_t *zzprime;
-  uint64_t *zzzprime;
-  uint64_t *zzz;
-  uint64_t *xx;
-  uint64_t *zz;
-  uint64_t scalar;
-  for (i = 0; i < 40; ++i) buf[i] = 0;
-  origx[0] = x[0];
-  origx[1] = x[1];
-  origx[2] = x[2];
-  origx[3] = x[3];
-  origx[4] = x[4];
-  Hacl_Bignum_fsum(x, z);
-  Hacl_Bignum_fdifference(z, origx);
-  origxprime0[0] = xprime[0];
-  origxprime0[1] = xprime[1];
-  origxprime0[2] = xprime[2];
-  origxprime0[3] = xprime[3];
-  origxprime0[4] = xprime[4];
-  Hacl_Bignum_fsum(xprime, zprime);
-  Hacl_Bignum_fdifference(zprime, origxprime0);
-  Hacl_Bignum_fmul(xxprime0, xprime, z);
-  Hacl_Bignum_fmul(zzprime0, x, zprime);
-  origxprime = buf + 5;
-  xx0 = buf + 15;
-  zz0 = buf + 20;
-  xxprime = buf + 25;
-  zzprime = buf + 30;
-  zzzprime = buf + 35;
-  origxprime[0] = xxprime[0];
-  origxprime[1] = xxprime[1];
-  origxprime[2] = xxprime[2];
-  origxprime[3] = xxprime[3];
-  origxprime[4] = xxprime[4];
-  Hacl_Bignum_fsum(xxprime, zzprime);
-  Hacl_Bignum_fdifference(zzprime, origxprime);
-  Hacl_Bignum_Fsquare_fsquare_times(x3, xxprime, 1);
-  Hacl_Bignum_Fsquare_fsquare_times(zzzprime, zzprime, 1);
-  Hacl_Bignum_fmul(z3, zzzprime, qx);
-  Hacl_Bignum_Fsquare_fsquare_times(xx0, x, 1);
-  Hacl_Bignum_Fsquare_fsquare_times(zz0, z, 1);
-  zzz = buf + 10;
-  xx = buf + 15;
-  zz = buf + 20;
-  Hacl_Bignum_fmul(x2, xx, zz);
-  Hacl_Bignum_fdifference(zz, xx);
-  scalar = 121665;
-  Hacl_Bignum_fscalar(zzz, zz, scalar);
-  Hacl_Bignum_fsum(zzz, xx);
-  Hacl_Bignum_fmul(z2, zzz, zz);
-}
-
-static void
-Hacl_EC_Ladder_SmallLoop_cmult_small_loop_step(
-  uint64_t *nq,
-  uint64_t *nqpq,
-  uint64_t *nq2,
-  uint64_t *nqpq2,
-  uint64_t *q,
-  uint8_t byt
-)
-{
-  uint64_t bit = byt >> 7;
-  Hacl_EC_Point_swap_conditional(nq, nqpq, bit);
-  Hacl_EC_AddAndDouble_fmonty(nq2, nqpq2, nq, nqpq, q);
-  Hacl_EC_Point_swap_conditional(nq2, nqpq2, bit);
-}
-
-static void
-Hacl_EC_Ladder_SmallLoop_cmult_small_loop_double_step(
-  uint64_t *nq,
-  uint64_t *nqpq,
-  uint64_t *nq2,
-  uint64_t *nqpq2,
-  uint64_t *q,
-  uint8_t byt
-)
-{
-  Hacl_EC_Ladder_SmallLoop_cmult_small_loop_step(nq, nqpq, nq2, nqpq2, q, byt);
-  Hacl_EC_Ladder_SmallLoop_cmult_small_loop_step(nq2, nqpq2, nq, nqpq, q, byt<<1);
-}
-
-static void
-Hacl_EC_Ladder_SmallLoop_cmult_small_loop(
-  uint64_t *nq,
-  uint64_t *nqpq,
-  uint64_t *nq2,
-  uint64_t *nqpq2,
-  uint64_t *q,
-  uint8_t byt,
-  uint32_t i
-)
-{
-  if (i)
-  {
-    uint32_t i_ = i - 1;
-    Hacl_EC_Ladder_SmallLoop_cmult_small_loop_double_step(nq, nqpq, nq2, nqpq2, q, byt);
-    Hacl_EC_Ladder_SmallLoop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q, byt << 2, i_);
-  }
-}
-
-static void
-Hacl_EC_Ladder_BigLoop_cmult_big_loop(
-  uint8_t *n1,
-  uint64_t *nq,
-  uint64_t *nqpq,
-  uint64_t *nq2,
-  uint64_t *nqpq2,
-  uint64_t *q,
-  uint32_t i
-)
-{
-  if (i)
-  {
-    uint32_t i1 = i - 1;
-    uint8_t byte = n1[i1];
-    Hacl_EC_Ladder_SmallLoop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q, byte, 4);
-    Hacl_EC_Ladder_BigLoop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, i1);
-  }
-}
-
-static void Hacl_EC_Ladder_cmult(uint64_t *result, uint8_t *n1, uint64_t *q)
-{
-  uint32_t i;
-  uint64_t point_buf[40];
-  uint64_t *nq = point_buf;
-  uint64_t *nqpq = point_buf + 10;
-  uint64_t *nq2 = point_buf + 20;
-  uint64_t *nqpq2 = point_buf + 30;
-  for (i = 0; i < 40; ++i) point_buf[i] = 0;
-  Hacl_EC_Point_copy(nqpq, q);
-  nq[0] = 1;
-  Hacl_EC_Ladder_BigLoop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, 32);
-  Hacl_EC_Point_copy(result, nq);
-}
-
-static void
-Hacl_Curve25519_crypto_scalarmult(uint8_t *mypublic, uint8_t *secret, uint8_t *basepoint)
-{
-  uint32_t i;
-  uint64_t buf0[10];
-  uint64_t *x0 = buf0;
-  uint64_t *z = buf0 + 5;
-  for (i = 0; i < 10; ++i) buf0[i] = 0;
-  Hacl_EC_Format_fexpand(x0, basepoint);
-  z[0] = 1;
-  {
-    uint8_t e[32];
-    uint8_t e0;
-    uint8_t e31;
-    uint8_t e01;
-    uint8_t e311;
-    uint8_t e312;
-    uint8_t *scalar;
-    for (i = 0; i < 32; ++i) {
-      e[i] = secret[i];
+static void HaclBignumFmulReduce(uint128_t o[5], uint64_t p[5], uint64_t q[5]) {
+  uint64_t t;
+  unsigned i, j;
+  for (i = 0;; ++i) {
+    for (j = 0; j < 5; ++j) {
+      o[j] += DW(p[j]) * q[i];
     }
-    e0 = e[0];
-    e31 = e[31];
-    e01 = e0 & 248;
-    e311 = e31 & 127;
-    e312 = e311 | 64;
-    e[0] = e01;
-    e[31] = e312;
-    scalar = e;
-    {
-      uint64_t buf[15];
-      buf[0] = 1;
-      for (i = 1; i < 15; ++i) buf[i] = 0;
-      Hacl_EC_Ladder_cmult(buf, scalar, buf0);
-      Hacl_EC_Format_scalar_of_point(mypublic, buf);
+    if (i == 4) break;
+    t = p[4] * 19;
+    p[4] = p[3];
+    p[3] = p[2];
+    p[2] = p[1];
+    p[1] = p[0];
+    p[0] = t;
+  }
+}
+
+static void HaclBignumFmul(uint64_t o[5], uint64_t p[5], uint64_t q[5]) {
+  uint128_t t[5] = {0};  // 128-bit accumulators, one per output limb
+  uint64_t u[5] = {p[0], p[1], p[2], p[3], p[4]};  // scratch copy: callee rotates it
+  HaclBignumFmulReduce(t, u, q);  // accumulates the limb products of u and q into t
+  HaclBignumCarryWide(t);         // push bits 51+ of t[0..3] upward
+  t[0] += DW(19) * (uint64_t)(t[4] >> 51);  // top-limb overflow re-enters limb 0 times 19
+  HaclBignumTrunc(o, t);    // first write to o, so o may alias p or q
+  o[1] += o[0] >> 51;       // one more carry out of limb 0
+  o[4] &= 0x7ffffffffffff;  // drop the bits already folded into limb 0
+  o[0] &= 0x7ffffffffffff;  // limb 0 back to 51 bits after the carry above
+}
+
+static void HaclBignumFsquare(uint128_t t[5], uint64_t p[5]) {
+  t[0] = DW(p[0] * 1) * p[0] + DW(p[4] * 38) * p[1] + DW(p[2] * 38) * p[3];
+  t[1] = DW(p[0] * 2) * p[1] + DW(p[4] * 38) * p[2] + DW(p[3] * 19) * p[3];
+  t[2] = DW(p[0] * 2) * p[2] + DW(p[1] * 01) * p[1] + DW(p[4] * 38) * p[3];
+  t[3] = DW(p[0] * 2) * p[3] + DW(p[1] * 02) * p[2] + DW(p[4]) * (p[4] * 19);
+  t[4] = DW(p[0] * 2) * p[4] + DW(p[1] * 02) * p[3] + DW(p[2]) * p[2];
+}
+
+static void HaclBignumFsqa(uint64_t o[5], uint32_t n) {  // squares o in place, n times
+  uint128_t t[5];  // fully overwritten by HaclBignumFsquare each round
+  for (unsigned i = 0; i < n; ++i) {  // n == 0 leaves o untouched
+    HaclBignumFsquare(t, o);  // t = wide limb products of o with itself
+    HaclBignumCarryWide(t);   // push bits 51+ of t[0..3] upward
+    t[0] += DW(19) * (uint64_t)(t[4] >> 51);  // top-limb overflow re-enters limb 0 times 19
+    HaclBignumTrunc(o, t);    // narrow each limb back to 64 bits
+    o[1] += o[0] >> 51;       // one more carry out of limb 0
+    o[4] &= 0x7ffffffffffff;  // drop the bits already folded into limb 0
+    o[0] &= 0x7ffffffffffff;  // limb 0 back to 51 bits after the carry above
+  }
+}
+
+static void HaclBignumFsqr(uint64_t o[5], uint64_t p[5], uint32_t n) {
+  HaclBignumCopy(o, p);
+  HaclBignumFsqa(o, n);
+}
+
+static void HaclBignumCrecip(uint64_t o[5], uint64_t z[5]) {  // o = z^(2^255 - 21)
+  uint64_t b[4][5];                  // four scratch field elements
+  HaclBignumFsqr(b[0], z, 1);        // b0 = z^2
+  HaclBignumFsqr(b[1], b[0], 2);     // b1 = z^8
+  HaclBignumFmul(b[2], b[1], z);     // b2 = z^9
+  HaclBignumFmul(b[0], b[2], b[0]);  // b0 = z^11
+  HaclBignumFsqr(b[1], b[0], 1);     // b1 = z^22
+  HaclBignumFmul(b[2], b[1], b[2]);  // b2 = z^(2^5 - 1)
+  HaclBignumFsqr(b[1], b[2], 5);     // b1 = z^(2^10 - 2^5)
+  HaclBignumFmul(b[2], b[1], b[2]);  // b2 = z^(2^10 - 1)
+  HaclBignumFsqr(b[1], b[2], 10);    // b1 = z^(2^20 - 2^10)
+  HaclBignumFmul(b[3], b[1], b[2]);  // b3 = z^(2^20 - 1)
+  HaclBignumFsqr(b[1], b[3], 20);    // b1 = z^(2^40 - 2^20)
+  HaclBignumFmul(b[1], b[1], b[3]);  // b1 = z^(2^40 - 1)
+  HaclBignumFsqa(b[1], 10);          // b1 = z^(2^50 - 2^10)
+  HaclBignumFmul(b[2], b[1], b[2]);  // b2 = z^(2^50 - 1)
+  HaclBignumFsqr(b[1], b[2], 50);    // b1 = z^(2^100 - 2^50)
+  HaclBignumFmul(b[3], b[1], b[2]);  // b3 = z^(2^100 - 1)
+  HaclBignumFsqr(b[1], b[3], 100);   // b1 = z^(2^200 - 2^100)
+  HaclBignumFmul(b[1], b[1], b[3]);  // b1 = z^(2^200 - 1)
+  HaclBignumFsqa(b[1], 50);          // b1 = z^(2^250 - 2^50)
+  HaclBignumFmul(b[1], b[1], b[2]);  // b1 = z^(2^250 - 1)
+  HaclBignumFsqa(b[1], 5);           // b1 = z^(2^255 - 2^5)
+  HaclBignumFmul(o, b[1], b[0]);     // times z^11; exponent is p-2 for p = 2^255-19
+}
+
+static void HaclBignumFdif(uint64_t a[5], uint64_t b[5]) {  // a = b - a + bias, per limb
+  a[0] = b[0] + 0x3fffffffffff68 - a[0];  // bias limb 0 = 8 * 0x7ffffffffffed
+  a[1] = b[1] + 0x3ffffffffffff8 - a[1];  // bias limbs 1..4 = 8 * 0x7ffffffffffff
+  a[2] = b[2] + 0x3ffffffffffff8 - a[2];  // NOTE(review): bias presumably keeps limbs
+  a[3] = b[3] + 0x3ffffffffffff8 - a[3];  // from wrapping below zero; confirm the
+  a[4] = b[4] + 0x3ffffffffffff8 - a[4];  // input bounds callers guarantee
+}
+
+static void HaclBignumFscalar(uint64_t o[5], uint64_t p[5], uint64_t s) {
+  unsigned i;
+  uint128_t t[5];  // every limb is assigned by the loop below
+  for (i = 0; i < 5; ++i) t[i] = DW(p[i]) * s;  // 64x64 -> 128-bit product per limb
+  HaclBignumCarryWide(t);                       // push bits 51+ of t[0..3] upward
+  t[0] += DW(19) * (uint64_t)(t[4] >> 51);  // top-limb overflow re-enters limb 0 times 19
+  t[4] &= 0x7ffffffffffff;                  // drop the bits just folded into limb 0
+  HaclBignumTrunc(o, t);  // limb 0 is not re-carried here, unlike HaclBignumFmul
+}
+
+static void HaclEcPointSwap(uint64_t a[2][5], uint64_t b[2][5], uint64_t m) { /* branch-free: swaps a and b when m is 1, no-op when m is 0 */
+  unsigned i, j;
+  uint64_t x, y;
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < 5; ++j) {
+      x = a[i][j] ^ (-m & (a[i][j] ^ b[i][j])); /* -m is all-ones for m=1, so x becomes b's limb; for m=0 x stays a's limb */
+      y = b[i][j] ^ (-m & (a[i][j] ^ b[i][j])); /* same mask applied from b's side */
+      a[i][j] = x;
+      b[i][j] = y;
     }
   }
 }
 
-static void
-mbedtls_x25519_init( mbedtls_x25519_context *ctx )
-{
-    mbedtls_platform_zeroize( ctx, sizeof( mbedtls_x25519_context ) );
+static void HaclEcFormatFexpand(uint64_t o[5], const uint8_t p[32]) { /* unpack 32 little-endian bytes into five 51-bit limbs; p is only read */
+  o[0] = READ64LE(p + 000) >> 00 & 0x7ffffffffffff; /* byte  0, shift  0: bits   0..50  */
+  o[1] = READ64LE(p + 006) >> 03 & 0x7ffffffffffff; /* byte  6, shift  3: bits  51..101 */
+  o[2] = READ64LE(p + 014) >> 06 & 0x7ffffffffffff; /* byte 12, shift  6: bits 102..152 */
+  o[3] = READ64LE(p + 023) >> 01 & 0x7ffffffffffff; /* byte 19, shift  1: bits 153..203 */
+  o[4] = READ64LE(p + 030) >> 12 & 0x7ffffffffffff; /* byte 24, shift 12: bits 204..254 (bit 255 is dropped) */
 }
 
-static void
-mbedtls_x25519_free( mbedtls_x25519_context *ctx )
-{
-    if( !ctx )
-        return;
-    mbedtls_platform_zeroize( ctx->our_secret, MBEDTLS_X25519_KEY_SIZE_BYTES );
-    mbedtls_platform_zeroize( ctx->peer_point, MBEDTLS_X25519_KEY_SIZE_BYTES );
+static void HaclEcFormatFcontract(uint8_t o[32], uint64_t p[5]) { /* normalizes p in place, then packs its five 51-bit limbs into 32 bytes at o */
+  uint64_t m; /* subtraction mask built below */
+  HaclBignumCarry(p); /* first carry pass (helper defined outside this view) */
+  p[0] += 19 * (p[4] >> 51); /* bits 51+ of the top limb re-enter limb 0 times 19 */
+  p[4] &= 0x7ffffffffffff;
+  HaclBignumCarry(p); /* second carry pass, after the fold above */
+  p[0] += 19 * (p[4] >> 51);
+  p[1] += p[0] >> 51; /* limb 0's overflow moves up into limb 1 */
+  p[0] &= 0x7ffffffffffff;
+  p[1] &= 0x7ffffffffffff;
+  p[4] &= 0x7ffffffffffff;
+  m = GTE(p[0], 0x7ffffffffffed); /* GTE/EQ presumably yield all-ones when true, zero otherwise (macros not visible here) */
+  m &= EQ(p[1], 0x7ffffffffffff); /* m survives only if limbs 1..4 are all 0x7ffffffffffff ... */
+  m &= EQ(p[2], 0x7ffffffffffff);
+  m &= EQ(p[3], 0x7ffffffffffff);
+  m &= EQ(p[4], 0x7ffffffffffff); /* ... and limb 0 >= 0x7ffffffffffed */
+  p[0] -= 0x7ffffffffffed & m; /* masked subtraction: applied when m is set, no-op when m is 0 */
+  p[1] -= 0x7ffffffffffff & m;
+  p[2] -= 0x7ffffffffffff & m;
+  p[3] -= 0x7ffffffffffff & m;
+  p[4] -= 0x7ffffffffffff & m;
+  Write64le(o + 000, p[1] << 51 | p[0] >> 00); /* output bytes  0..7  */
+  Write64le(o + 010, p[2] << 38 | p[1] >> 13); /* output bytes  8..15 */
+  Write64le(o + 020, p[3] << 25 | p[2] >> 26); /* output bytes 16..23 */
+  Write64le(o + 030, p[4] << 12 | p[3] >> 39); /* output bytes 24..31 */
 }
 
-static int
-mbedtls_x25519_make_params( mbedtls_x25519_context *ctx, size_t *olen,
-                            unsigned char *buf, size_t blen,
-                            int( *f_rng )(void *, unsigned char *, size_t),
-                            void *p_rng )
-{
-    int ret = 0;
-    uint8_t base[MBEDTLS_X25519_KEY_SIZE_BYTES] = {0};
-    if( ( ret = f_rng( p_rng, ctx->our_secret, MBEDTLS_X25519_KEY_SIZE_BYTES ) ) != 0 )
-        return ret;
-    *olen = MBEDTLS_X25519_KEY_SIZE_BYTES + 4;
-    if( blen < *olen )
-        return( MBEDTLS_ERR_ECP_BUFFER_TOO_SMALL );
-    *buf++ = MBEDTLS_ECP_TLS_NAMED_CURVE;
-    *buf++ = MBEDTLS_ECP_TLS_CURVE25519 >> 8;
-    *buf++ = MBEDTLS_ECP_TLS_CURVE25519 & 0xFF;
-    *buf++ = MBEDTLS_X25519_KEY_SIZE_BYTES;
-    base[0] = 9;
-    Hacl_Curve25519_crypto_scalarmult( buf, ctx->our_secret, base );
-    base[0] = 0;
-    if( timingsafe_memcmp( buf, base, MBEDTLS_X25519_KEY_SIZE_BYTES) == 0 )
-        return MBEDTLS_ERR_ECP_RANDOM_FAILED;
-    return( 0 );
+static void HaclEcFormatScalarOfPoint(uint8_t o[32], uint64_t p[2][5]) { /* writes p[0] times HaclBignumCrecip(p[1]) to o as 32 bytes */
+  uint64_t t[2][5]; /* t[0]: Crecip output, t[1]: product */
+  HaclBignumCrecip(t[0], p[1]);
+  HaclBignumFmul(t[1], p[0], t[0]); /* presumably t[1] = p[0] * t[0]; HaclBignumFmul is defined outside this view */
+  HaclEcFormatFcontract(o, t[1]);
 }
 
-static int
-mbedtls_x25519_read_params( mbedtls_x25519_context *ctx,
-                            const unsigned char **buf, const unsigned char *end )
-{
-    if( end - *buf < MBEDTLS_X25519_KEY_SIZE_BYTES + 1 )
-        return( MBEDTLS_ERR_ECP_BAD_INPUT_DATA );
-    if( ( *(*buf)++ != MBEDTLS_X25519_KEY_SIZE_BYTES ) )
-        return( MBEDTLS_ERR_ECP_BAD_INPUT_DATA );
-    memcpy( ctx->peer_point, *buf, MBEDTLS_X25519_KEY_SIZE_BYTES );
-    *buf += MBEDTLS_X25519_KEY_SIZE_BYTES;
-    return( 0 );
+static void HaclEcAddAndDoubleFmonty(uint64_t xz2[2][5], uint64_t xz3[2][5], /* outputs */
+                                     uint64_t xz[2][5], uint64_t xzprime[2][5], /* inputs, overwritten as scratch */
+                                     uint64_t qx[5]) { /* factor multiplied into xz3[1] */
+  uint64_t b[7][5]; /* scratch; notes assume Fsum(a, b): a += b and Fmul(o, a, b): o = a*b -- TODO confirm */
+  HaclBignumCopy(b[0], xz[0]); /* keep x before it is overwritten */
+  HaclBignumFsum(xz[0], xz[1]); /* xz[0] = x + z */
+  HaclBignumFdif(xz[1], b[0]); /* xz[1] = x - z */
+  HaclBignumCopy(b[0], xzprime[0]); /* keep x' before it is overwritten */
+  HaclBignumFsum(xzprime[0], xzprime[1]); /* xzprime[0] = x' + z' */
+  HaclBignumFdif(xzprime[1], b[0]); /* xzprime[1] = x' - z' */
+  HaclBignumFmul(b[4], xzprime[0], xz[1]); /* b4 = (x' + z')(x - z) */
+  HaclBignumFmul(b[5], xz[0], xzprime[1]); /* b5 = (x + z)(x' - z') */
+  HaclBignumCopy(b[0], b[4]); /* keep b4 before it is overwritten */
+  HaclBignumFsum(b[4], b[5]); /* b4 = sum of the two cross products */
+  HaclBignumFdif(b[5], b[0]); /* b5 = old b4 - b5, their difference */
+  HaclBignumFsqr(xz3[0], b[4], 1); /* xz3[0] = sum squared */
+  HaclBignumFsqr(b[6], b[5], 1); /* b6 = difference squared */
+  HaclBignumFmul(xz3[1], b[6], qx); /* xz3[1] = b6 * qx */
+  HaclBignumFsqr(b[2], xz[0], 1); /* b2 = (x + z)^2 */
+  HaclBignumFsqr(b[3], xz[1], 1); /* b3 = (x - z)^2 */
+  HaclBignumFmul(xz2[0], b[2], b[3]); /* xz2[0] = (x + z)^2 (x - z)^2 */
+  HaclBignumFdif(b[3], b[2]); /* b3 = (x + z)^2 - (x - z)^2 */
+  HaclBignumFscalar(b[1], b[3], 121665); /* b1 = 121665 * b3 */
+  HaclBignumFsum(b[1], b[2]); /* b1 += (x + z)^2 */
+  HaclBignumFmul(xz2[1], b[1], b[3]); /* xz2[1] = b1 * b3 */
 }
 
-static int
-mbedtls_x25519_get_params( mbedtls_x25519_context *ctx, const mbedtls_ecp_keypair *key,
-                           mbedtls_x25519_ecdh_side side )
-{
-    size_t olen = 0;
-    switch( side ) {
-    case MBEDTLS_X25519_ECDH_THEIRS:
-        return mbedtls_ecp_point_write_binary( &key->grp, &key->Q, 
-                                               MBEDTLS_ECP_PF_COMPRESSED, 
-                                               &olen, ctx->peer_point, 
-                                               MBEDTLS_X25519_KEY_SIZE_BYTES );
-    case MBEDTLS_X25519_ECDH_OURS:
-        return mbedtls_mpi_write_binary_le( &key->d, ctx->our_secret, 
-                                            MBEDTLS_X25519_KEY_SIZE_BYTES );
-    default:
-        return( MBEDTLS_ERR_ECP_BAD_INPUT_DATA );
+/**
+ * Computes curve25519 scalar multiplication of basepoint by secret.
+ * @note it has 126 bits of security
+ */
+void curve25519(uint8_t mypublic[32], const uint8_t secret[32],
+                const uint8_t basepoint[32]) {
+  uint32_t i, j;
+  uint8_t e[32], s; /* e: adjusted copy of secret; s: the byte currently being consumed */
+  uint64_t q[5], t[4][2][5] = {{{1}}, {{0}, {1}}}; /* t[0] = ({1,0..}, 0), t[1] = (0, {1,0..}), t[2] and t[3] zero */
+  HaclEcFormatFexpand(q, basepoint); /* q = basepoint as five limbs */
+  for (j = 0; j < 32; ++j) e[j] = secret[j];
+  e[0] &= 248; /* clear the low three bits of the first byte */
+  e[31] = (e[31] & 127) | 64; /* clear bit 7 and set bit 6 of the last byte */
+  HaclBignumCopy(t[1][0], q); /* first half of t[1] takes the basepoint limbs */
+  for (i = 32; i--;) { /* walk e from e[31] down to e[0] */
+    for (s = e[i], j = 4; j--;) { /* four passes, each consuming the top two bits of s */
+      HaclEcPointSwap(t[0], t[1], s >> 7); /* swap when the current top bit is set */
+      HaclEcAddAndDoubleFmonty(t[2], t[3], t[0], t[1], q);
+      HaclEcPointSwap(t[2], t[3], s >> 7); /* same bit again, applied to the outputs */
+      s <<= 1; /* next bit moves into position 7 (s is 8 bits wide) */
+      HaclEcPointSwap(t[2], t[3], s >> 7);
+      HaclEcAddAndDoubleFmonty(t[0], t[1], t[2], t[3], q); /* roles of the two buffer pairs are reversed for the second bit */
+      HaclEcPointSwap(t[0], t[1], s >> 7);
+      s <<= 1;
     }
+  }
+  HaclEcFormatScalarOfPoint(mypublic, t[0]); /* divide out t[0]'s second coordinate and serialize to 32 bytes */
 }
-
-static int
-mbedtls_x25519_calc_secret( mbedtls_x25519_context *ctx, size_t *olen,
-                            unsigned char *buf, size_t blen,
-                            int( *f_rng )(void *, unsigned char *, size_t),
-                            void *p_rng )
-{
-    /* f_rng and p_rng are not used here because this implementation does not
-       need blinding since it has constant trace. (todo(jart): wut?) */
-    (( void )f_rng);
-    (( void )p_rng);
-    *olen = MBEDTLS_X25519_KEY_SIZE_BYTES;
-    if( blen < *olen )
-        return( MBEDTLS_ERR_ECP_BUFFER_TOO_SMALL );
-    Hacl_Curve25519_crypto_scalarmult( buf, ctx->our_secret, ctx->peer_point);
-    /* Wipe the DH secret and don't let the peer chose a small subgroup point */
-    mbedtls_platform_zeroize( ctx->our_secret, MBEDTLS_X25519_KEY_SIZE_BYTES );
-    if( timingsafe_memcmp( buf, ctx->our_secret, MBEDTLS_X25519_KEY_SIZE_BYTES ) == 0 )
-        return MBEDTLS_ERR_ECP_RANDOM_FAILED;
-    return( 0 );
-}
-
-static int
-mbedtls_x25519_make_public( mbedtls_x25519_context *ctx, size_t *olen,
-                            unsigned char *buf, size_t blen,
-                            int( *f_rng )(void *, unsigned char *, size_t),
-                            void *p_rng )
-{
-    int ret = 0;
-    unsigned char base[MBEDTLS_X25519_KEY_SIZE_BYTES] = { 0 };
-    if( ctx == NULL )
-        return( MBEDTLS_ERR_ECP_BAD_INPUT_DATA );
-    if( ( ret = f_rng( p_rng, ctx->our_secret, MBEDTLS_X25519_KEY_SIZE_BYTES ) ) != 0 )
-        return ret;
-    *olen = MBEDTLS_X25519_KEY_SIZE_BYTES + 1;
-    if( blen < *olen )
-        return(MBEDTLS_ERR_ECP_BUFFER_TOO_SMALL);
-    *buf++ = MBEDTLS_X25519_KEY_SIZE_BYTES;
-    base[0] = 9;
-    Hacl_Curve25519_crypto_scalarmult( buf, ctx->our_secret, base );
-    base[0] = 0;
-    if( memcmp( buf, base, MBEDTLS_X25519_KEY_SIZE_BYTES ) == 0 )
-        return MBEDTLS_ERR_ECP_RANDOM_FAILED;
-    return( ret );
-}
-
-static int
-mbedtls_x25519_read_public( mbedtls_x25519_context *ctx,
-                            const unsigned char *buf, size_t blen )
-{
-    if( blen < MBEDTLS_X25519_KEY_SIZE_BYTES + 1 )
-        return(MBEDTLS_ERR_ECP_BUFFER_TOO_SMALL);
-    if( (*buf++ != MBEDTLS_X25519_KEY_SIZE_BYTES) )
-        return(MBEDTLS_ERR_ECP_BAD_INPUT_DATA);
-    memcpy( ctx->peer_point, buf, MBEDTLS_X25519_KEY_SIZE_BYTES );
-    return( 0 );
-}
-
-/**
- * \brief           This function sets up the ECDH context with the information
- *                  given.
- *
- *                  This function should be called after mbedtls_ecdh_init() but
- *                  before mbedtls_ecdh_make_params(). There is no need to call
- *                  this function before mbedtls_ecdh_read_params().
- *
- *                  This is the first function used by a TLS server for ECDHE
- *                  ciphersuites.
- *
- * \param ctx       The ECDH context to set up.
- * \param grp_id    The group id of the group to set up the context for.
- *
- * \return          \c 0 on success.
- */
-int mbedtls_everest_setup( mbedtls_ecdh_context_everest *ctx, int grp_id )
-{
-    if( grp_id != MBEDTLS_ECP_DP_CURVE25519 )
-        return MBEDTLS_ERR_ECP_BAD_INPUT_DATA;
-    mbedtls_x25519_init( &ctx->ctx );
-    return 0;
-}
-
-/**
- * \brief           This function frees a context.
- *
- * \param ctx       The context to free.
- */
-void mbedtls_everest_free( mbedtls_ecdh_context_everest *ctx )
-{
-    mbedtls_x25519_free( &ctx->ctx );
-}
-
-/**
- * \brief           This function generates a public key and a TLS
- *                  ServerKeyExchange payload.
- *
- *                  This is the second function used by a TLS server for ECDHE
- *                  ciphersuites. (It is called after mbedtls_ecdh_setup().)
- *
- * \note            This function assumes that the ECP group (grp) of the
- *                  \p ctx context has already been properly set,
- *                  for example, using mbedtls_ecp_group_load().
- *
- * \see             ecp.h
- *
- * \param ctx       The ECDH context.
- * \param olen      The number of characters written.
- * \param buf       The destination buffer.
- * \param blen      The length of the destination buffer.
- * \param f_rng     The RNG function.
- * \param p_rng     The RNG context.
- *
- * \return          \c 0 on success.
- * \return          An \c MBEDTLS_ERR_ECP_XXX error code on failure.
- */
-int mbedtls_everest_make_params( mbedtls_ecdh_context_everest *ctx, size_t *olen,
-                                 unsigned char *buf, size_t blen,
-                                 int( *f_rng )( void *, unsigned char *, size_t ),
-                                 void *p_rng )
-{
-    mbedtls_x25519_context *x25519_ctx = &ctx->ctx;
-    return mbedtls_x25519_make_params( x25519_ctx, olen, buf, blen, f_rng, p_rng );
-}
-
-/**
- * \brief           This function parses and processes a TLS ServerKeyExhange
- *                  payload.
- *
- *                  This is the first function used by a TLS client for ECDHE
- *                  ciphersuites.
- *
- * \see             ecp.h
- *
- * \param ctx       The ECDH context.
- * \param buf       The pointer to the start of the input buffer.
- * \param end       The address for one Byte past the end of the buffer.
- *
- * \return          \c 0 on success.
- * \return          An \c MBEDTLS_ERR_ECP_XXX error code on failure.
- *
- */
-int mbedtls_everest_read_params( mbedtls_ecdh_context_everest *ctx,
-                                 const unsigned char **buf,
-                                 const unsigned char *end )
-{
-    mbedtls_x25519_context *x25519_ctx = &ctx->ctx;
-    return mbedtls_x25519_read_params( x25519_ctx, buf, end );
-}
-
-/**
- * \brief           This function sets up an ECDH context from an EC key.
- *
- *                  It is used by clients and servers in place of the
- *                  ServerKeyEchange for static ECDH, and imports ECDH
- *                  parameters from the EC key information of a certificate.
- *
- * \see             ecp.h
- *
- * \param ctx       The ECDH context to set up.
- * \param key       The EC key to use.
- * \param side      Defines the source of the key: 1: Our key, or
- *                  0: The key of the peer.
- *
- * \return          \c 0 on success.
- * \return          An \c MBEDTLS_ERR_ECP_XXX error code on failure.
- *
- */
-int mbedtls_everest_get_params( mbedtls_ecdh_context_everest *ctx,
-                                const mbedtls_ecp_keypair *key,
-                                mbedtls_everest_ecdh_side side )
-{
-    mbedtls_x25519_context *x25519_ctx = &ctx->ctx;
-    mbedtls_x25519_ecdh_side s = side == MBEDTLS_EVEREST_ECDH_OURS ?
-                                            MBEDTLS_X25519_ECDH_OURS :
-                                            MBEDTLS_X25519_ECDH_THEIRS;
-    return mbedtls_x25519_get_params( x25519_ctx, key, s );
-}
-
-/**
- * \brief           This function generates a public key and a TLS
- *                  ClientKeyExchange payload.
- *
- *                  This is the second function used by a TLS client for ECDH(E)
- *                  ciphersuites.
- *
- * \see             ecp.h
- *
- * \param ctx       The ECDH context.
- * \param olen      The number of Bytes written.
- * \param buf       The destination buffer.
- * \param blen      The size of the destination buffer.
- * \param f_rng     The RNG function.
- * \param p_rng     The RNG context.
- *
- * \return          \c 0 on success.
- * \return          An \c MBEDTLS_ERR_ECP_XXX error code on failure.
- */
-int mbedtls_everest_make_public( mbedtls_ecdh_context_everest *ctx, size_t *olen,
-                                 unsigned char *buf, size_t blen,
-                                 int( *f_rng )( void *, unsigned char *, size_t ),
-                                 void *p_rng )
-{
-    mbedtls_x25519_context *x25519_ctx = &ctx->ctx;
-    return mbedtls_x25519_make_public( x25519_ctx, olen, buf, blen, f_rng, p_rng );
-}
-
-/**
- * \brief       This function parses and processes a TLS ClientKeyExchange
- *              payload.
- *
- *              This is the third function used by a TLS server for ECDH(E)
- *              ciphersuites. (It is called after mbedtls_ecdh_setup() and
- *              mbedtls_ecdh_make_params().)
- *
- * \see         ecp.h
- *
- * \param ctx   The ECDH context.
- * \param buf   The start of the input buffer.
- * \param blen  The length of the input buffer.
- *
- * \return      \c 0 on success.
- * \return      An \c MBEDTLS_ERR_ECP_XXX error code on failure.
- */
-int mbedtls_everest_read_public( mbedtls_ecdh_context_everest *ctx,
-                                 const unsigned char *buf, size_t blen )
-{
-    mbedtls_x25519_context *x25519_ctx = &ctx->ctx;
-    return mbedtls_x25519_read_public( x25519_ctx, buf, blen );
-}
-
-/**
- * \brief           This function derives and exports the shared secret.
- *
- *                  This is the last function used by both TLS client
- *                  and servers.
- *
- * \note            If \p f_rng is not NULL, it is used to implement
- *                  countermeasures against side-channel attacks.
- *                  For more information, see mbedtls_ecp_mul().
- *
- * \see             ecp.h
- *
- * \param ctx       The ECDH context.
- * \param olen      The number of Bytes written.
- * \param buf       The destination buffer.
- * \param blen      The length of the destination buffer.
- * \param f_rng     The RNG function.
- * \param p_rng     The RNG context.
- *
- * \return          \c 0 on success.
- * \return          An \c MBEDTLS_ERR_ECP_XXX error code on failure.
- */
-int mbedtls_everest_calc_secret( mbedtls_ecdh_context_everest *ctx, size_t *olen,
-                                 unsigned char *buf, size_t blen,
-                                 int( *f_rng )( void *, unsigned char *, size_t ),
-                                 void *p_rng )
-{
-    mbedtls_x25519_context *x25519_ctx = &ctx->ctx;
-    return mbedtls_x25519_calc_secret( x25519_ctx, olen, buf, blen, f_rng, p_rng );
-}
-
-#endif /* MBEDTLS_ECDH_C && MBEDTLS_ECDH_VARIANT_EVEREST_ENABLED */
diff --git a/third_party/mbedtls/everest.h b/third_party/mbedtls/everest.h
index be4c43f16..592aff1ea 100644
--- a/third_party/mbedtls/everest.h
+++ b/third_party/mbedtls/everest.h
@@ -1,52 +1,10 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_MBEDTLS_X25519_H_
-#define COSMOPOLITAN_THIRD_PARTY_MBEDTLS_X25519_H_
-#include "third_party/mbedtls/config.h"
-#include "third_party/mbedtls/ecp.h"
+#ifndef COSMOPOLITAN_THIRD_PARTY_MBEDTLS_EVEREST_H_
+#define COSMOPOLITAN_THIRD_PARTY_MBEDTLS_EVEREST_H_
+#if !(__ASSEMBLER__ + __LINKER__ + 0)
 COSMOPOLITAN_C_START_
 
-#define MBEDTLS_ECP_TLS_CURVE25519    0x1d
-#define MBEDTLS_X25519_KEY_SIZE_BYTES 32
-
-typedef enum {
-  MBEDTLS_X25519_ECDH_OURS,
-  MBEDTLS_X25519_ECDH_THEIRS,
-} mbedtls_x25519_ecdh_side;
-
-typedef struct {
-  unsigned char our_secret[MBEDTLS_X25519_KEY_SIZE_BYTES];
-  unsigned char peer_point[MBEDTLS_X25519_KEY_SIZE_BYTES];
-} mbedtls_x25519_context;
-
-typedef enum {
-  MBEDTLS_EVEREST_ECDH_OURS,
-  MBEDTLS_EVEREST_ECDH_THEIRS,
-} mbedtls_everest_ecdh_side;
-
-typedef struct {
-  mbedtls_x25519_context ctx;
-} mbedtls_ecdh_context_everest;
-
-int mbedtls_everest_setup(mbedtls_ecdh_context_everest *, int);
-void mbedtls_everest_free(mbedtls_ecdh_context_everest *);
-int mbedtls_everest_make_params(mbedtls_ecdh_context_everest *, size_t *,
-                                unsigned char *, size_t,
-                                int (*)(void *, unsigned char *, size_t),
-                                void *);
-int mbedtls_everest_read_params(mbedtls_ecdh_context_everest *,
-                                const unsigned char **, const unsigned char *);
-int mbedtls_everest_get_params(mbedtls_ecdh_context_everest *,
-                               const mbedtls_ecp_keypair *,
-                               mbedtls_everest_ecdh_side);
-int mbedtls_everest_make_public(mbedtls_ecdh_context_everest *, size_t *,
-                                unsigned char *, size_t,
-                                int (*)(void *, unsigned char *, size_t),
-                                void *);
-int mbedtls_everest_read_public(mbedtls_ecdh_context_everest *,
-                                const unsigned char *, size_t);
-int mbedtls_everest_calc_secret(mbedtls_ecdh_context_everest *, size_t *,
-                                unsigned char *, size_t,
-                                int (*)(void *, unsigned char *, size_t),
-                                void *);
+void curve25519(uint8_t[32], const uint8_t[32], const uint8_t[32]);
 
 COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_MBEDTLS_X25519_H_ */
+#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
+#endif /* COSMOPOLITAN_THIRD_PARTY_MBEDTLS_EVEREST_H_ */
diff --git a/third_party/mbedtls/mbedtls.mk b/third_party/mbedtls/mbedtls.mk
index 6f3ea1b68..8bbea0621 100644
--- a/third_party/mbedtls/mbedtls.mk
+++ b/third_party/mbedtls/mbedtls.mk
@@ -55,7 +55,7 @@ $(THIRD_PARTY_MBEDTLS_A_OBJS):						\
 
 o/$(MODE)/third_party/mbedtls/everest.o:				\
 			OVERRIDE_CFLAGS +=				\
-				-Os
+				-O3
 
 o/$(MODE)/third_party/mbedtls/bigmul4.o					\
 o/$(MODE)/third_party/mbedtls/bigmul6.o:				\
@@ -70,11 +70,6 @@ o/$(MODE)/third_party/mbedtls/shiftright2-avx.o:			\
 			OVERRIDE_CFLAGS +=				\
 				-O3 -mavx
 
-# tail recursion is so important because everest was written in f*
-o/$(MODE)/third_party/mbedtls/everest.o:				\
-			OVERRIDE_CFLAGS +=				\
-				-foptimize-sibling-calls
-
 THIRD_PARTY_MBEDTLS_LIBS = $(foreach x,$(THIRD_PARTY_MBEDTLS_ARTIFACTS),$($(x)))
 THIRD_PARTY_MBEDTLS_SRCS = $(foreach x,$(THIRD_PARTY_MBEDTLS_ARTIFACTS),$($(x)_SRCS))
 THIRD_PARTY_MBEDTLS_HDRS = $(foreach x,$(THIRD_PARTY_MBEDTLS_ARTIFACTS),$($(x)_HDRS))
diff --git a/third_party/mbedtls/secp256r1.c b/third_party/mbedtls/secp256r1.c
index 53ad1f62d..7df7f9ac8 100644
--- a/third_party/mbedtls/secp256r1.c
+++ b/third_party/mbedtls/secp256r1.c
@@ -26,7 +26,7 @@
 #define H(w) (w & 0xffffffff00000000)
 
 /**
- * Fastest quasi-reduction modulo NIST P-256.
+ * Fastest quasi-reduction modulo ℘256.
  *
  *     p  = 2²⁵⁶ - 2²²⁴ + 2¹⁹² + 2⁹⁶ - 1
  *     B  = T + 2×S₁ + 2×S₂ + S₃ + S₄ – D₁ – D₂ – D₃ – D₄ mod p
diff --git a/third_party/mbedtls/secp384r1.c b/third_party/mbedtls/secp384r1.c
index 96652c43e..307b72164 100644
--- a/third_party/mbedtls/secp384r1.c
+++ b/third_party/mbedtls/secp384r1.c
@@ -24,7 +24,7 @@
 #define Q(i) p[i >> 1]
 
 /**
- * Fastest quasi-reduction modulo Prime 384.
+ * Fastest quasi-reduction modulo ℘384.
  *
  *     p  = 2³⁸⁴ – 2¹²⁸ – 2⁹⁶ + 2³² – 1
  *     B  = T + 2×S₁ + S₂ + S₃ + S₄ + S₅ + S₆ – D₁ – D₂ – D₃ mod p
@@ -44,8 +44,7 @@
 void secp384r1(uint64_t p[12]) {
   int r;
   char o;
-  signed char G;
-  uint64_t A, B, C, D, E, F, a, b, c;
+  uint64_t A, B, C, D, E, F, G, a, b, c;
   A = Q(0);
   B = Q(2);
   C = Q(4);
@@ -57,8 +56,8 @@ void secp384r1(uint64_t p[12]) {
   a = Q(22) << 32 | Q(21) >> 32;
   b = Q(23) >> 32;
   ADC(C, C, a << 1, 0, o);
-  ADC(D, D, (b << 1 | a >> 63), o, o);
-  ADC(E, E, (b >> 63), o, o);
+  ADC(D, D, b << 1 | a >> 63, o, o);
+  ADC(E, E, b >> 63, o, o);
   ADC(F, F, o, o, o);
   G += o;
   ADC(A, A, Q(12), 0, o);
@@ -118,91 +117,105 @@ void secp384r1(uint64_t p[12]) {
   asm volatile(/* S₁ = (0  ‖0  ‖0  ‖0  ‖0  ‖A₂₃‖A₂₂‖A₂₁‖0  ‖0  ‖0  ‖0  ) */
                "mov\t21*4(%9),%7\n\t"
                "mov\t23*4(%9),%k8\n\t"
+               "mov\t%7,%%r12\n\t"
+               "shr\t$63,%%r12\n\t"
                "shl\t%7\n\t"
-               "rcl\t%8\n\t"
+               "shl\t%8\n\t"
+               "or\t%%r12,%8\n\t"
+               "mov\t13*4(%9),%%r12\n\t"
                "add\t%7,%2\n\t"
+               "mov\t23*4(%9),%k7\n\t"
                "adc\t%8,%3\n\t"
+               "mov\t15*4(%9),%%r13\n\t"
                "adc\t$0,%4\n\t"
+               "mov\t12*4(%9),%k8\n\t"
                "adc\t$0,%5\n\t"
-               "adc\t$0,%b6\n\t"
-               /* S₂ = (A₂₃‖A₂₂‖A₂₁‖A₂₀‖A₁₉‖A₁₈‖A₁₇‖A₁₆‖A₁₅‖A₁₄‖A₁₃‖A₁₂) */
-               "add\t12*4(%9),%0\n\t"
-               "adc\t14*4(%9),%1\n\t"
-               "adc\t16*4(%9),%2\n\t"
-               "adc\t18*4(%9),%3\n\t"
-               "adc\t20*4(%9),%4\n\t"
-               "adc\t22*4(%9),%5\n\t"
-               "adc\t$0,%b6\n\t"
-               /* S₃ = (A₂₀‖A₁₉‖A₁₈‖A₁₇‖A₁₆‖A₁₅‖A₁₄‖A₁₃‖A₁₂‖A₂₃‖A₂₂‖A₂₁) */
-               "mov\t12*4(%9),%k7\n\t"
+               "mov\t17*4(%9),%%r14\n\t"
+               "adc\t$0,%6\n\t"
+               "mov\t19*4(%9),%%r15\n\t"
+               /* D₁ = (A₂₂‖A₂₁‖A₂₀‖A₁₉‖A₁₈‖A₁₇‖A₁₆‖A₁₅‖A₁₄‖A₁₃‖A₁₂‖A₂₃) */
+               "shl\t$32,%8\n\t"
+               "or\t%8,%7\n\t"
                "mov\t23*4(%9),%k8\n\t"
+               "sub\t%7,%0\n\t"
+               "mov\t21*4(%9),%7\n\t"
+               "sbb\t%%r12,%1\n\t"
+               "sbb\t%%r13,%2\n\t"
+               "sbb\t%%r14,%3\n\t"
+               "sbb\t%%r15,%4\n\t"
+               "sbb\t%7,%5\n\t"
+               "mov\t12*4(%9),%k7\n\t"
+               "sbb\t$0,%6\n\t"
+               /* S₃ = (A₂₀‖A₁₉‖A₁₈‖A₁₇‖A₁₆‖A₁₅‖A₁₄‖A₁₃‖A₁₂‖A₂₃‖A₂₂‖A₂₁) */
                "shl\t$32,%7\n\t"
                "or\t%7,%8\n\t"
                "add\t21*4(%9),%0\n\t"
-               "adc\t%8,%1\n\t"
-               "adc\t13*4(%9),%2\n\t"
-               "adc\t15*4(%9),%3\n\t"
-               "adc\t17*4(%9),%4\n\t"
-               "adc\t19*4(%9),%5\n\t"
-               "adc\t$0,%b6\n\t"
-               /* S₄ = (A₁₉‖A₁₈‖A₁₇‖A₁₆‖A₁₅‖A₁₄‖A₁₃‖A₁₂‖A₂₀‖0  ‖A₂₃‖0  ) */
                "mov\t23*4(%9),%k7\n\t"
+               "adc\t%8,%1\n\t"
                "mov\t20*4(%9),%k8\n\t"
+               "adc\t%%r12,%2\n\t"
+               "mov\t12*4(%9),%%r12\n\t"
+               "adc\t%%r13,%3\n\t"
+               "mov\t14*4(%9),%%r13\n\t"
+               "adc\t%%r14,%4\n\t"
+               "mov\t16*4(%9),%%r14\n\t"
+               "adc\t%%r15,%5\n\t"
+               "mov\t18*4(%9),%%r15\n\t"
+               "adc\t$0,%6\n\t"
+               /* S₄ = (A₁₉‖A₁₈‖A₁₇‖A₁₆‖A₁₅‖A₁₄‖A₁₃‖A₁₂‖A₂₀‖0  ‖A₂₃‖0  ) */
                "shl\t$32,%7\n\t"
                "shl\t$32,%8\n\t"
                "add\t%7,%0\n\t"
                "adc\t%8,%1\n\t"
-               "adc\t12*4(%9),%2\n\t"
-               "adc\t14*4(%9),%3\n\t"
-               "adc\t16*4(%9),%4\n\t"
-               "adc\t18*4(%9),%5\n\t"
-               "adc\t$0,%b6\n\t"
+               "adc\t%%r12,%2\n\t"
+               "adc\t%%r13,%3\n\t"
+               "adc\t%%r14,%4\n\t"
+               "adc\t%%r15,%5\n\t"
+               "adc\t$0,%6\n\t"
+               /* S₂ = (A₂₃‖A₂₂‖A₂₁‖A₂₀‖A₁₉‖A₁₈‖A₁₇‖A₁₆‖A₁₅‖A₁₄‖A₁₃‖A₁₂) */
+               "add\t%%r12,%0\n\t"
+               "mov\t20*4(%9),%%r12\n\t"
+               "adc\t%%r13,%1\n\t"
+               "mov\t22*4(%9),%%r13\n\t"
+               "adc\t%%r14,%2\n\t"
+               "adc\t%%r15,%3\n\t"
+               "adc\t%%r12,%4\n\t"
+               "adc\t%%r13,%5\n\t"
+               "adc\t$0,%6\n\t"
                /* S₅ = (0  ‖0  ‖0  ‖0  ‖A₂₃‖A₂₂‖A₂₁‖A₂₀‖0  ‖0  ‖0  ‖0  ) */
-               "mov\t23*4(%9),%k7\n\t"
-               "mov\t20*4(%9),%k8\n\t"
-               "shl\t$32,%7\n\t"
-               "shl\t$32,%8\n\t"
-               "add\t20*4(%9),%2\n\t"
-               "adc\t22*4(%9),%3\n\t"
+               "add\t%%r12,%2\n\t"
+               "adc\t%%r13,%3\n\t"
                "adc\t$0,%4\n\t"
                "adc\t$0,%5\n\t"
-               "adc\t$0,%b6\n\t"
+               "adc\t$0,%6\n\t"
                /* S₆ = (0  ‖0  ‖0  ‖0  ‖0  ‖0  ‖A₂₃‖A₂₂‖A₂₁‖0  ‖0  ‖A₂₀) */
-               "mov\t20*4(%9),%k7\n\t"
-               "mov\t21*4(%9),%k8\n\t"
+               "mov\t%%r12d,%k7\n\t"
+               "mov\t%%r12,%8\n\t"
+               "shr\t$32,%8\n\t"
                "shl\t$32,%8\n\t"
                "add\t%7,%0\n\t"
                "adc\t%8,%1\n\t"
-               "adc\t22*4(%9),%2\n\t"
+               "adc\t%%r13,%2\n\t"
                "adc\t$0,%3\n\t"
                "adc\t$0,%4\n\t"
                "adc\t$0,%5\n\t"
-               "adc\t$0,%b6\n\t"
-               /* D₁ = (A₂₂‖A₂₁‖A₂₀‖A₁₉‖A₁₈‖A₁₇‖A₁₆‖A₁₅‖A₁₄‖A₁₃‖A₁₂‖A₂₃) */
-               "mov\t23*4(%9),%k7\n\t"
-               "mov\t12*4(%9),%k8\n\t"
-               "shl\t$32,%8\n\t"
-               "or\t%8,%7\n\t"
-               "sub\t%7,%0\n\t"
-               "sbb\t13*4(%9),%1\n\t"
-               "sbb\t15*4(%9),%2\n\t"
-               "sbb\t17*4(%9),%3\n\t"
-               "sbb\t19*4(%9),%4\n\t"
-               "sbb\t21*4(%9),%5\n\t"
-               "sbb\t$0,%b6\n\t"
+               "adc\t$0,%6\n\t"
                /* D₂ = (0  ‖0  ‖0  ‖0  ‖0  ‖0  ‖0  ‖A₂₃‖A₂₂‖A₂₁‖A₂₀‖0  ) */
-               "mov\t20*4(%9),%k7\n\t"
-               "mov\t23*4(%9),%k8\n\t"
+               "mov\t%%r12d,%k7\n\t"
+               "mov\t21*4(%9),%%r12\n\t"
+               "mov\t%%r13,%8\n\t"
+               "shr\t$32,%8\n\t"
                "shl\t$32,%7\n\t"
                "sub\t%7,%0\n\t"
-               "sbb\t21*4(%9),%1\n\t"
+               "sbb\t%%r12,%1\n\t"
                "sbb\t%8,%2\n\t"
                "sbb\t$0,%3\n\t"
                "sbb\t$0,%4\n\t"
                "sbb\t$0,%5\n\t"
-               "sbb\t$0,%b6\n\t"
+               "sbb\t$0,%6\n\t"
                /* D₃ = (0  ‖0  ‖0  ‖0  ‖0  ‖0  ‖0  ‖A₂₃‖A₂₃‖0  ‖0  ‖0  ) */
-               "mov\t23*4(%9),%k7\n\t"
+               "mov\t%%r13,%7\n\t"
+               "shr\t$32,%7\n\t"
                "mov\t%k7,%k8\n\t"
                "shl\t$32,%7\n\t"
                "sub\t%7,%1\n\t"
@@ -210,11 +223,11 @@ void secp384r1(uint64_t p[12]) {
                "sbb\t$0,%3\n\t"
                "sbb\t$0,%4\n\t"
                "sbb\t$0,%5\n\t"
-               "sbb\t$0,%b6\n\t"
+               "sbb\t$0,%6"
                : "+r"(A), "+r"(B), "+r"(C), "+r"(D), "+r"(E), "+r"(F), "+q"(G),
                  "=&r"(a), "=&r"(b)
                : "r"(p)
-               : "memory");
+               : "memory", "r12", "r13", "r14", "r15");
 #endif
   p[0] = A;
   p[1] = B;
@@ -223,11 +236,12 @@ void secp384r1(uint64_t p[12]) {
   p[4] = E;
   p[5] = F;
   p[6] = G;
-  p[7] = 0;
-  p[8] = 0;
-  p[9] = 0;
-  p[10] = 0;
-  p[11] = 0;
+  G = CONCEAL("r", 0L);
+  p[7] = G;
+  p[8] = G;
+  p[9] = G;
+  p[10] = G;
+  p[11] = G;
 }
 
 int ecp_mod_p384(mbedtls_mpi *N) {
@@ -249,3 +263,130 @@ int ecp_mod_p384(mbedtls_mpi *N) {
   }
   return 0;
 }
+
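+/* llvm-mca style timeline; the instruction stream appears to be the secp384r1() inline asm (%rdi = p)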
+Instructions:      115
+Total Cycles:      46
+Total uOps:        116
+uOps Per Cycle:    2.52
+IPC:               2.50
+Block RThroughput: 31.0
+
+SIMULATION          0123456789          0123456789
+Index     0123456789          0123456789          012345
+[0,0]     DR   .    .    .    .    .    .    .    .    .   xorl	%r10d, %r10d
+[0,1]     DeeeeeER  .    .    .    .    .    .    .    .   movq	(%rdi), %r9
+[0,2]     DeeeeeER  .    .    .    .    .    .    .    .   movq	8(%rdi), %r8
+[0,3]     D=eeeeeER .    .    .    .    .    .    .    .   movq	16(%rdi), %rsi
+[0,4]     D=eeeeeER .    .    .    .    .    .    .    .   movq	24(%rdi), %rcx
+[0,5]     D==eeeeeER.    .    .    .    .    .    .    .   movq	32(%rdi), %rdx
+[0,6]     .D==eeeeeER    .    .    .    .    .    .    .   movq	40(%rdi), %rax
+[0,7]     .D=eeeeeE-R    .    .    .    .    .    .    .   movq	84(%rdi), %r11
+[0,8]     .D==eeeeeER    .    .    .    .    .    .    .   movl	92(%rdi), %ebx
+[0,9]     .D======eER    .    .    .    .    .    .    .   movq	%r11, %r12
+[0,10]    .D=======eER   .    .    .    .    .    .    .   shrq	$63, %r12
+[0,11]    .D======eE-R   .    .    .    .    .    .    .   shlq	%r11
+[0,12]    . D======eER   .    .    .    .    .    .    .   shlq	%rbx
+[0,13]    . D=======eER  .    .    .    .    .    .    .   orq	%r12, %rbx
+[0,14]    . D==eeeeeE-R  .    .    .    .    .    .    .   movq	52(%rdi), %r12
+[0,15]    . D======eE-R  .    .    .    .    .    .    .   addq	%r11, %rsi
+[0,16]    . D==eeeeeE-R  .    .    .    .    .    .    .   movl	92(%rdi), %r11d
+[0,17]    . D========eER .    .    .    .    .    .    .   adcq	%rbx, %rcx
+[0,18]    .  D==eeeeeE-R .    .    .    .    .    .    .   movq	60(%rdi), %r13
+[0,19]    .  D========eER.    .    .    .    .    .    .   adcq	$0, %rdx
+[0,20]    .  D==eeeeeE--R.    .    .    .    .    .    .   movl	48(%rdi), %ebx
+[0,21]    .  D=========eER    .    .    .    .    .    .   adcq	$0, %rax
+[0,22]    .  D===eeeeeE--R    .    .    .    .    .    .   movq	68(%rdi), %r14
+[0,23]    .  D==========eER   .    .    .    .    .    .   adcq	$0, %r10
+[0,24]    .   D==eeeeeE---R   .    .    .    .    .    .   movq	76(%rdi), %r15
+[0,25]    .   D======eE---R   .    .    .    .    .    .   shlq	$32, %rbx
+[0,26]    .   D=======eE--R   .    .    .    .    .    .   orq	%rbx, %r11
+[0,27]    .   D===eeeeeE--R   .    .    .    .    .    .   movl	92(%rdi), %ebx
+[0,28]    .   D========eE-R   .    .    .    .    .    .   subq	%r11, %r9
+[0,29]    .   D===eeeeeE--R   .    .    .    .    .    .   movq	84(%rdi), %r11
+[0,30]    .    D========eER   .    .    .    .    .    .   sbbq	%r12, %r8
+[0,31]    .    D=========eER  .    .    .    .    .    .   sbbq	%r13, %rsi
+[0,32]    .    D==========eER .    .    .    .    .    .   sbbq	%r14, %rcx
+[0,33]    .    D===========eER.    .    .    .    .    .   sbbq	%r15, %rdx
+[0,34]    .    D============eER    .    .    .    .    .   sbbq	%r11, %rax
+[0,35]    .    D===eeeeeE-----R    .    .    .    .    .   movl	48(%rdi), %r11d
+[0,36]    .    .D============eER   .    .    .    .    .   sbbq	$0, %r10
+[0,37]    .    .D========eE----R   .    .    .    .    .   shlq	$32, %r11
+[0,38]    .    .D=========eE---R   .    .    .    .    .   orq	%r11, %rbx
+[0,39]    .    .D==eeeeeE------R   .    .    .    .    .   movl	92(%rdi), %r11d
+[0,40]    .    .D======eeeeeeE-R   .    .    .    .    .   addq	84(%rdi), %r9
+[0,41]    .    . D===========eER   .    .    .    .    .   adcq	%rbx, %r8
+[0,42]    .    . D==eeeeeE-----R   .    .    .    .    .   movl	80(%rdi), %ebx
+[0,43]    .    . D============eER  .    .    .    .    .   adcq	%r12, %rsi
+[0,44]    .    . D==eeeeeE------R  .    .    .    .    .   movq	48(%rdi), %r12
+[0,45]    .    . D=============eER .    .    .    .    .   adcq	%r13, %rcx
+[0,46]    .    . D===eeeeeE------R .    .    .    .    .   movq	56(%rdi), %r13
+[0,47]    .    .  D=============eER.    .    .    .    .   adcq	%r14, %rdx
+[0,48]    .    .  D==eeeeeE-------R.    .    .    .    .   movq	64(%rdi), %r14
+[0,49]    .    .  D==============eER    .    .    .    .   adcq	%r15, %rax
+[0,50]    .    .  D===eeeeeE-------R    .    .    .    .   movq	72(%rdi), %r15
+[0,51]    .    .  D===============eER   .    .    .    .   adcq	$0, %r10
+[0,52]    .    .  D=======eE--------R   .    .    .    .   shlq	$32, %r11
+[0,53]    .    .   D=======eE-------R   .    .    .    .   shlq	$32, %rbx
+[0,54]    .    .   D=========eE-----R   .    .    .    .   addq	%r11, %r9
+[0,55]    .    .   D==========eE----R   .    .    .    .   adcq	%rbx, %r8
+[0,56]    .    .   D===========eE---R   .    .    .    .   adcq	%r12, %rsi
+[0,57]    .    .   D============eE--R   .    .    .    .   adcq	%r13, %rcx
+[0,58]    .    .   D=============eE-R   .    .    .    .   adcq	%r14, %rdx
+[0,59]    .    .    D=============eER   .    .    .    .   adcq	%r15, %rax
+[0,60]    .    .    D==============eER  .    .    .    .   adcq	$0, %r10
+[0,61]    .    .    D=========eE-----R  .    .    .    .   addq	%r12, %r9
+[0,62]    .    .    D=eeeeeE---------R  .    .    .    .   movq	80(%rdi), %r12
+[0,63]    .    .    D==============eER  .    .    .    .   adcq	%r13, %r8
+[0,64]    .    .    D==eeeeeE--------R  .    .    .    .   movq	88(%rdi), %r13
+[0,65]    .    .    .D==============eER .    .    .    .   adcq	%r14, %rsi
+[0,66]    .    .    .D===============eER.    .    .    .   adcq	%r15, %rcx
+[0,67]    .    .    .D================eER    .    .    .   adcq	%r12, %rdx
+[0,68]    .    .    .D=================eER   .    .    .   adcq	%r13, %rax
+[0,69]    .    .    .D==================eER  .    .    .   adcq	$0, %r10
+[0,70]    .    .    .D===============eE---R  .    .    .   addq	%r12, %rsi
+[0,71]    .    .    . D===============eE--R  .    .    .   adcq	%r13, %rcx
+[0,72]    .    .    . D================eE-R  .    .    .   adcq	$0, %rdx
+[0,73]    .    .    . D=================eER  .    .    .   adcq	$0, %rax
+[0,74]    .    .    . D==================eER .    .    .   adcq	$0, %r10
+[0,75]    .    .    . D====eE--------------R .    .    .   movl	%r12d, %r11d
+[0,76]    .    .    . D====eE--------------R .    .    .   movq	%r12, %rbx
+[0,77]    .    .    .  D====eE-------------R .    .    .   shrq	$32, %rbx
+[0,78]    .    .    .  D============eE-----R .    .    .   shlq	$32, %rbx
+[0,79]    .    .    .  D=======eE----------R .    .    .   addq	%r11, %r9
+[0,80]    .    .    .  D=============eE----R .    .    .   adcq	%rbx, %r8
+[0,81]    .    .    .  D=================eER .    .    .   adcq	%r13, %rsi
+[0,82]    .    .    .  D==================eER.    .    .   adcq	$0, %rcx
+[0,83]    .    .    .   D==================eER    .    .   adcq	$0, %rdx
+[0,84]    .    .    .   D===================eER   .    .   adcq	$0, %rax
+[0,85]    .    .    .   D====================eER  .    .   adcq	$0, %r10
+[0,86]    .    .    .   D===eE-----------------R  .    .   movl	%r12d, %r11d
+[0,87]    .    .    .   DeeeeeE----------------R  .    .   movq	84(%rdi), %r12
+[0,88]    .    .    .   D===eE-----------------R  .    .   movq	%r13, %rbx
+[0,89]    .    .    .    D================eE---R  .    .   shrq	$32, %rbx
+[0,90]    .    .    .    D=================eE--R  .    .   shlq	$32, %r11
+[0,91]    .    .    .    D==================eE-R  .    .   subq	%r11, %r9
+[0,92]    .    .    .    D===================eER  .    .   sbbq	%r12, %r8
+[0,93]    .    .    .    D====================eER .    .   sbbq	%rbx, %rsi
+[0,94]    .    .    .    D=====================eER.    .   sbbq	$0, %rcx
+[0,95]    .    .    .    .D=====================eER    .   sbbq	$0, %rdx
+[0,96]    .    .    .    .D======================eER   .   sbbq	$0, %rax
+[0,97]    .    .    .    .D=======================eER  .   sbbq	$0, %r10
+[0,98]    .    .    .    .D==eE---------------------R  .   movq	%r13, %r11
+[0,99]    .    .    .    .D=================eE------R  .   shrq	$32, %r11
+[0,100]   .    .    .    .D==================eE-----R  .   movl	%r11d, %ebx
+[0,101]   .    .    .    . D==================eE----R  .   shlq	$32, %r11
+[0,102]   .    .    .    . D===================eE---R  .   subq	%r11, %r8
+[0,103]   .    .    .    . D====================eE--R  .   sbbq	%rbx, %rsi
+[0,104]   .    .    .    . D=====================eE-R  .   sbbq	$0, %rcx
+[0,105]   .    .    .    . D======================eER  .   sbbq	$0, %rdx
+[0,106]   .    .    .    . D=======================eER .   sbbq	$0, %rax
+[0,107]   .    .    .    .  D=======================eER.   sbbq	$0, %r10
+[0,108]   .    .    .    .  D================eE-------R.   movq	%r9, (%rdi)
+[0,109]   .    .    .    .  D===================eE----R.   movq	%r8, 8(%rdi)
+[0,110]   .    .    .    .  D====================eE---R.   movq	%rsi, 16(%rdi)
+[0,111]   .    .    .    .  D=====================eE--R.   movq	%rcx, 24(%rdi)
+[0,112]   .    .    .    .  D======================eE-R.   movq	%rdx, 32(%rdi)
+[0,113]   .    .    .    .   D======================eER.   movq	%rax, 40(%rdi)
+[0,114]   .    .    .    .   D=======================eER   movq	%r10, 48(%rdi)
+*/
diff --git a/third_party/mbedtls/ssl_ciphersuites.c b/third_party/mbedtls/ssl_ciphersuites.c
index b465480f8..1329ec6e9 100644
--- a/third_party/mbedtls/ssl_ciphersuites.c
+++ b/third_party/mbedtls/ssl_ciphersuites.c
@@ -61,7 +61,6 @@ static const uint16_t ciphersuite_preference[] =
     MBEDTLS_TLS_DHE_RSA_WITH_CHACHA20_POLY1305_SHA256,
     MBEDTLS_TLS_DHE_RSA_WITH_AES_128_CCM,
     MBEDTLS_TLS_DHE_RSA_WITH_AES_256_CCM,
-    /* weakened perfect forward secrecy */
     MBEDTLS_TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256,
     MBEDTLS_TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384,
     MBEDTLS_TLS_DHE_RSA_WITH_AES_128_CBC_SHA256,
diff --git a/third_party/mbedtls/test/everest_test.c b/third_party/mbedtls/test/everest_test.c
new file mode 100644
index 000000000..e201fa88a
--- /dev/null
+++ b/third_party/mbedtls/test/everest_test.c
@@ -0,0 +1,77 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8                                :vi│
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/rand/rand.h"
+#include "libc/stdio/stdio.h"
+#include "libc/testlib/ezbench.h"
+#include "libc/testlib/testlib.h"
+#include "third_party/mbedtls/config.h"
+#include "third_party/mbedtls/endian.h"
+
+void Hacl_Curve25519_crypto_scalarmult(uint8_t *, uint8_t *, uint8_t *);
+void curve25519(uint8_t[32], uint8_t[32], uint8_t[32]);
+
+const uint64_t kNumbers[] = {  // boundary-pattern 64-bit words used to assemble test inputs
+    0x0000000000000000,  // zero
+    0x0000000000000001,  // one
+    0x0000000000001000,  // 2^12
+    0x0000000002000000,  // 2^25
+    0x0000004000000000,  // 2^38
+    0x0008000000000000,  // 2^51
+    0x8000000000000000,  // 2^63
+    0x0007ffffffffffff,  // 2^51 - 1
+    0x0000003fffffffff,  // 2^38 - 1
+    0x0000000001ffffff,  // 2^25 - 1
+    0x0000000000000fff,  // 2^12 - 1
+    0xffffffffffffffff,  // 2^64 - 1
+    0xfff8000000000000,  // 2^64 - 2^51 (top 13 bits set)
+};
+
+TEST(everest, tinierVersionBehavesTheSame) {  // differential test: curve25519() must equal Hacl_Curve25519_crypto_scalarmult()
+  size_t i;
+  uint8_t secret[32], bpoint[32], public[2][32];  // public[0] <- Hacl reference, public[1] <- curve25519()
+  for (i = 0; i < 500; ++i) {  // pass 1: inputs filled by rngset() with rand64 (presumably random bytes -- confirm)
+    rngset(secret, sizeof(secret), rand64, -1);
+    rngset(bpoint, sizeof(bpoint), rand64, -1);
+    Hacl_Curve25519_crypto_scalarmult(public[0], secret, bpoint);
+    curve25519(public[1], secret, bpoint);
+    ASSERT_EQ(0, memcmp(public[0], public[1], sizeof(public[0])));  // all 32 output bytes must match
+  }
+  for (i = 0; i < 500; ++i) {  // pass 2: every 64-bit word of both inputs is drawn from kNumbers
+    Write64le(secret + 000, kNumbers[rand() % ARRAYLEN(kNumbers)]);  // offsets are octal: 000/010/020/030 = bytes 0/8/16/24
+    Write64le(secret + 010, kNumbers[rand() % ARRAYLEN(kNumbers)]);
+    Write64le(secret + 020, kNumbers[rand() % ARRAYLEN(kNumbers)]);
+    Write64le(secret + 030, kNumbers[rand() % ARRAYLEN(kNumbers)]);
+    Write64le(bpoint + 000, kNumbers[rand() % ARRAYLEN(kNumbers)]);
+    Write64le(bpoint + 010, kNumbers[rand() % ARRAYLEN(kNumbers)]);
+    Write64le(bpoint + 020, kNumbers[rand() % ARRAYLEN(kNumbers)]);
+    Write64le(bpoint + 030, kNumbers[rand() % ARRAYLEN(kNumbers)]);
+    Hacl_Curve25519_crypto_scalarmult(public[0], secret, bpoint);
+    curve25519(public[1], secret, bpoint);
+    ASSERT_EQ(0, memcmp(public[0], public[1], sizeof(public[0])));  // same byte-for-byte check as pass 1
+  }
+}
+
+BENCH(everest, bench) {  // benchmarks both scalar-multiplication routines on the same input pair
+  uint8_t secret[32], bpoint[32], public[32];
+  rngset(secret, sizeof(secret), rand64, -1);  // inputs are generated once, outside the timed expressions
+  rngset(bpoint, sizeof(bpoint), rand64, -1);
+  EZBENCH2("everest", donothing,  // Hacl_Curve25519_crypto_scalarmult() timing
+           Hacl_Curve25519_crypto_scalarmult(public, secret, bpoint));
+  EZBENCH2("mariana", donothing, curve25519(public, secret, bpoint));  // curve25519() timing
+}
diff --git a/third_party/mbedtls/test/everest_unravaged.c b/third_party/mbedtls/test/everest_unravaged.c
new file mode 100644
index 000000000..3ad6cb66f
--- /dev/null
+++ b/third_party/mbedtls/test/everest_unravaged.c
@@ -0,0 +1,899 @@
+#include "libc/bits/bits.h"
+#include "libc/limits.h"
+#include "third_party/mbedtls/asn1.h"
+#include "third_party/mbedtls/bignum.h"
+#include "third_party/mbedtls/common.h"
+#include "third_party/mbedtls/error.h"
+#include "third_party/mbedtls/platform.h"
+
+asm(".ident\t\"\\n\\n\
+Everest (Apache 2.0)\\n\
+Copyright 2016-2018 INRIA and Microsoft Corporation\"");
+asm(".include \"libc/disclaimer.inc\"");
+
+/* clang-format off */
+/*
+ *  ECDH with curve-optimized implementation multiplexing
+ *
+ *  Copyright 2016-2018 INRIA and Microsoft Corporation
+ *  SPDX-License-Identifier: Apache-2.0
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License"); you may
+ *  not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  This file is part of mbed TLS (https://tls.mbed.org)
+ */
+
+#ifdef memcpy
+#undef memcpy
+#endif
+#define memcpy(x,y,z) __builtin_memcpy(x,y,z)
+
+#define load64_le(b) READ64LE(b)
+#define store64_le(b, i) WRITE64LE(b, i)
+
+#define KRML_HOST_EXIT exit
+#define KRML_HOST_PRINTF printf
+
+#define KRML_EXIT                                                              \
+  do {                                                                         \
+    KRML_HOST_PRINTF("Unimplemented function at %s:%d\n", __FILE__, __LINE__); \
+    KRML_HOST_EXIT(254);                                                       \
+  } while (0)
+
+#define _KRML_CHECK_SIZE_PRAGMA \
+    _Pragma("GCC diagnostic ignored \"-Wtype-limits\"")
+
+#define KRML_CHECK_SIZE(size_elt, sz)                                          \
+  do {                                                                         \
+    _KRML_CHECK_SIZE_PRAGMA                                                    \
+    if (((size_t)(sz)) > ((size_t)(SIZE_MAX / (size_elt)))) {                  \
+      KRML_HOST_PRINTF(                                                        \
+          "Maximum allocatable size exceeded, aborting before overflow at "    \
+          "%s:%d\n",                                                           \
+          __FILE__, __LINE__);                                                 \
+      KRML_HOST_EXIT(253);                                                     \
+    }                                                                          \
+  } while (0)
+
+typedef const char *Prims_string;
+
+typedef struct { /* byte-string descriptor; not referenced by the code visible in this part of the file */
+  uint32_t length; /* presumably the number of bytes at `data` -- confirm against upstream kremlib */
+  const char *data;
+} FStar_Bytes_bytes;
+
+typedef int32_t Prims_pos, Prims_nat, Prims_nonzero, Prims_int,
+    krml_checked_int_t;
+
+/* Prims_nat not yet in scope */
+inline static int32_t krml_time() { /* returns time(NULL) cast to int32_t */
+  return (int32_t)time(NULL);
+}
+
+static uint64_t FStar_UInt64_eq_mask(uint64_t a, uint64_t b)
+{ /* Branch-free equality mask: returns 0xffffffffffffffff when a == b, otherwise 0. */
+  uint64_t x = a ^ b; /* zero iff a == b */
+  uint64_t minus_x = ~x + (uint64_t)1U; /* two's-complement negation of x */
+  uint64_t x_or_minus_x = x | minus_x; /* top bit is set iff x != 0 */
+  uint64_t xnx = x_or_minus_x >> (uint32_t)63U; /* 1 when a != b, 0 when equal */
+  return xnx - (uint64_t)1U; /* 0 - 1 wraps to all-ones; 1 - 1 is 0 */
+}
+
+static uint64_t FStar_UInt64_gte_mask(uint64_t a, uint64_t b)
+{ /* Branch-free unsigned comparison mask: returns 0xffffffffffffffff when a >= b, otherwise 0. */
+  uint64_t x = a;
+  uint64_t y = b;
+  uint64_t x_xor_y = x ^ y;
+  uint64_t x_sub_y = x - y; /* wraps modulo 2^64 when a < b */
+  uint64_t x_sub_y_xor_y = x_sub_y ^ y;
+  uint64_t q = x_xor_y | x_sub_y_xor_y;
+  uint64_t x_xor_q = x ^ q; /* top bit ends up set exactly when a < b */
+  uint64_t x_xor_q_ = x_xor_q >> (uint32_t)63U; /* 1 when a < b, 0 otherwise */
+  return x_xor_q_ - (uint64_t)1U; /* 1 -> 0, 0 -> all-ones */
+}
+
+static uint32_t FStar_UInt32_eq_mask(uint32_t a, uint32_t b)
+{ /* Branch-free equality mask: returns 0xffffffff when a == b, otherwise 0. */
+  uint32_t x = a ^ b; /* zero iff a == b */
+  uint32_t minus_x = ~x + (uint32_t)1U; /* two's-complement negation of x */
+  uint32_t x_or_minus_x = x | minus_x; /* top bit is set iff x != 0 */
+  uint32_t xnx = x_or_minus_x >> (uint32_t)31U; /* 1 when a != b, 0 when equal */
+  return xnx - (uint32_t)1U; /* 0 - 1 wraps to all-ones; 1 - 1 is 0 */
+}
+
+static uint32_t FStar_UInt32_gte_mask(uint32_t a, uint32_t b)
+{ /* Branch-free unsigned comparison mask: returns 0xffffffff when a >= b, otherwise 0. */
+  uint32_t x = a;
+  uint32_t y = b;
+  uint32_t x_xor_y = x ^ y;
+  uint32_t x_sub_y = x - y; /* wraps modulo 2^32 when a < b */
+  uint32_t x_sub_y_xor_y = x_sub_y ^ y;
+  uint32_t q = x_xor_y | x_sub_y_xor_y;
+  uint32_t x_xor_q = x ^ q; /* top bit ends up set exactly when a < b */
+  uint32_t x_xor_q_ = x_xor_q >> (uint32_t)31U; /* 1 when a < b, 0 otherwise */
+  return x_xor_q_ - (uint32_t)1U; /* 1 -> 0, 0 -> all-ones */
+}
+
+static uint16_t FStar_UInt16_eq_mask(uint16_t a, uint16_t b)
+{ /* Branch-free equality mask: returns 0xffff when a == b, otherwise 0. */
+  uint16_t x = a ^ b; /* zero iff a == b */
+  uint16_t minus_x = ~x + (uint16_t)1U; /* evaluated in int after promotion; the store keeps the low 16 bits */
+  uint16_t x_or_minus_x = x | minus_x; /* bit 15 is set iff x != 0 */
+  uint16_t xnx = x_or_minus_x >> (uint32_t)15U; /* 1 when a != b, 0 when equal */
+  return xnx - (uint16_t)1U; /* result is converted back to uint16_t: 0xffff or 0 */
+}
+
+static uint16_t FStar_UInt16_gte_mask(uint16_t a, uint16_t b)
+{ /* Branch-free unsigned comparison mask: returns 0xffff when a >= b, otherwise 0. */
+  uint16_t x = a;
+  uint16_t y = b;
+  uint16_t x_xor_y = x ^ y;
+  uint16_t x_sub_y = x - y; /* computed in int after promotion; the store keeps the low 16 bits */
+  uint16_t x_sub_y_xor_y = x_sub_y ^ y;
+  uint16_t q = x_xor_y | x_sub_y_xor_y;
+  uint16_t x_xor_q = x ^ q; /* bit 15 ends up set exactly when a < b */
+  uint16_t x_xor_q_ = x_xor_q >> (uint32_t)15U; /* 1 when a < b, 0 otherwise */
+  return x_xor_q_ - (uint16_t)1U; /* 1 -> 0, 0 -> 0xffff after conversion to uint16_t */
+}
+
+static uint8_t FStar_UInt8_eq_mask(uint8_t a, uint8_t b)
+{ /* Branch-free equality mask: returns 0xff when a == b, otherwise 0. */
+  uint8_t x = a ^ b; /* zero iff a == b */
+  uint8_t minus_x = ~x + (uint8_t)1U; /* evaluated in int after promotion; the store keeps the low 8 bits */
+  uint8_t x_or_minus_x = x | minus_x; /* bit 7 is set iff x != 0 */
+  uint8_t xnx = x_or_minus_x >> (uint32_t)7U; /* 1 when a != b, 0 when equal */
+  return xnx - (uint8_t)1U; /* result is converted back to uint8_t: 0xff or 0 */
+}
+
+static uint8_t FStar_UInt8_gte_mask(uint8_t a, uint8_t b)
+{ /* Branch-free unsigned comparison mask: returns 0xff when a >= b, otherwise 0. */
+  uint8_t x = a;
+  uint8_t y = b;
+  uint8_t x_xor_y = x ^ y;
+  uint8_t x_sub_y = x - y; /* computed in int after promotion; the store keeps the low 8 bits */
+  uint8_t x_sub_y_xor_y = x_sub_y ^ y;
+  uint8_t q = x_xor_y | x_sub_y_xor_y;
+  uint8_t x_xor_q = x ^ q; /* bit 7 ends up set exactly when a < b */
+  uint8_t x_xor_q_ = x_xor_q >> (uint32_t)7U; /* 1 when a < b, 0 otherwise */
+  return x_xor_q_ - (uint8_t)1U; /* 1 -> 0, 0 -> 0xff after conversion to uint8_t */
+}
+
+static void Hacl_Bignum_Modulo_carry_top(uint64_t *b)
+{ /* Trims limb 4 of b to 51 bits and folds the bits above them into limb 0, scaled by 19. */
+  uint64_t b4 = b[4U];
+  uint64_t b0 = b[0U];
+  uint64_t b4_ = b4 & (uint64_t)0x7ffffffffffffU; /* low 51 bits of limb 4 */
+  uint64_t b0_ = b0 + (uint64_t)19U * (b4 >> (uint32_t)51U); /* 5 limbs x 51 bits = 255, so the excess has weight 2^255; 19 matches 2^255 = 19 (mod 2^255 - 19) */
+  b[4U] = b4_;
+  b[0U] = b0_; /* limb 0 may now exceed 51 bits; no further carry is done here */
+}
+
+inline static void Hacl_Bignum_Fproduct_copy_from_wide_(uint64_t *output, uint128_t *input)
+{ /* Stores the low 64 bits of each of the five 128-bit limbs of input into output. */
+  uint32_t i;
+  for (i = (uint32_t)0U; i < (uint32_t)5U; i = i + (uint32_t)1U)
+  {
+    uint128_t xi = input[i];
+    output[i] = (uint64_t)xi; /* the cast discards the upper 64 bits */
+  }
+}
+
+inline static void
+Hacl_Bignum_Fproduct_sum_scalar_multiplication_(uint128_t *output, uint64_t *input, uint64_t s)
+{ /* Multiply-accumulate over five limbs: output[i] += input[i] * s, with no carry between limbs. */
+  uint32_t i;
+  for (i = (uint32_t)0U; i < (uint32_t)5U; i = i + (uint32_t)1U)
+  {
+    uint128_t xi = output[i];
+    uint64_t yi = input[i];
+    output[i] = xi + (uint128_t)yi * s; /* yi is widened first, so the product is a full 64x64 -> 128-bit multiply */
+  }
+}
+
+inline static void Hacl_Bignum_Fproduct_carry_wide_(uint128_t *tmp)
+{ /* Carry propagation over limbs 0..3: each keeps its low 51 bits and passes the rest to the next limb. */
+  uint32_t i;
+  for (i = (uint32_t)0U; i < (uint32_t)4U; i = i + (uint32_t)1U) /* limb 4 only receives a carry; it is not trimmed here */
+  {
+    uint32_t ctr = i;
+    uint128_t tctr = tmp[ctr];
+    uint128_t tctrp1 = tmp[ctr + (uint32_t)1U];
+    uint64_t r0 = (uint64_t)tctr & (uint64_t)0x7ffffffffffffU; /* low 51 bits stay in this limb */
+    uint128_t c = tctr >> (uint32_t)51U; /* everything above bit 50 is the carry */
+    tmp[ctr] = (uint128_t)r0;
+    tmp[ctr + (uint32_t)1U] = tctrp1 + c;
+  }
+}
+
+inline static void Hacl_Bignum_Fmul_shift_reduce(uint64_t *output)
+{ /* In place: moves limbs 0..3 up to positions 1..4 and sets limb 0 to 19 times the old limb 4. */
+  uint64_t tmp = output[4U]; /* saved before the shift overwrites it */
+  uint64_t b0;
+  {
+    uint32_t i;
+    for (i = (uint32_t)0U; i < (uint32_t)4U; i = i + (uint32_t)1U)
+    {
+      uint32_t ctr = (uint32_t)5U - i - (uint32_t)1U; /* runs 4, 3, 2, 1 so no source limb is overwritten before it is read */
+      uint64_t z = output[ctr - (uint32_t)1U];
+      output[ctr] = z;
+    }
+  }
+  output[0U] = tmp;
+  b0 = output[0U];
+  output[0U] = (uint64_t)19U * b0; /* the wrapped-around limb is scaled by 19 */
+}
+
+static void
+Hacl_Bignum_Fmul_mul_shift_reduce_(uint128_t *output, uint64_t *input, uint64_t *input2)
+{ /* Accumulates input * input2 into output one limb of input2 at a time; input is modified in place. */
+  uint32_t i;
+  uint64_t input2i;
+  {
+    uint32_t i0;
+    for (i0 = (uint32_t)0U; i0 < (uint32_t)4U; i0 = i0 + (uint32_t)1U)
+    {
+      uint64_t input2i0 = input2[i0];
+      Hacl_Bignum_Fproduct_sum_scalar_multiplication_(output, input, input2i0); /* output += input * input2[i0] */
+      Hacl_Bignum_Fmul_shift_reduce(input); /* rotate input up one limb (x19 wrap) for the next limb of input2 */
+    }
+  }
+  i = (uint32_t)4U; /* the last limb is handled outside the loop so input is not shifted a fifth time */
+  input2i = input2[i];
+  Hacl_Bignum_Fproduct_sum_scalar_multiplication_(output, input, input2i);
+}
+
+inline static void Hacl_Bignum_Fmul_fmul(uint64_t *output, uint64_t *input, uint64_t *input2)
+{ /* Writes the 5-limb product of input and input2 to output, with carries and the x19 top fold applied. */
+  uint64_t tmp[5U] = { 0U };
+  memcpy(tmp, input, (uint32_t)5U * sizeof input[0U]); /* work on a copy: the multiply step below modifies its first operand */
+  KRML_CHECK_SIZE(sizeof (uint128_t), (uint32_t)5U);
+  {
+    uint128_t t[5U]; /* 128-bit accumulators, one per limb */
+    {
+      uint32_t _i;
+      for (_i = 0U; _i < (uint32_t)5U; ++_i)
+        t[_i] = (uint128_t)(uint64_t)0U;
+    }
+    {
+      uint128_t b4;
+      uint128_t b0;
+      uint128_t b4_;
+      uint128_t b0_;
+      uint64_t i0;
+      uint64_t i1;
+      uint64_t i0_;
+      uint64_t i1_;
+      Hacl_Bignum_Fmul_mul_shift_reduce_(t, tmp, input2); /* t = tmp * input2, limbwise accumulated */
+      Hacl_Bignum_Fproduct_carry_wide_(t); /* limbs 0..3 trimmed to 51 bits, excess collected in limb 4 */
+      b4 = t[4U];
+      b0 = t[0U];
+      b4_ = b4 & (uint128_t)(uint64_t)0x7ffffffffffffU; /* keep 51 bits of limb 4 */
+      b0_ = b0 + (uint128_t)(uint64_t)19U * (uint64_t)(b4 >> (uint32_t)51U); /* fold limb 4's excess into limb 0, scaled by 19 */
+      t[4U] = b4_;
+      t[0U] = b0_;
+      Hacl_Bignum_Fproduct_copy_from_wide_(output, t); /* narrow each limb to 64 bits */
+      i0 = output[0U];
+      i1 = output[1U];
+      i0_ = i0 & (uint64_t)0x7ffffffffffffU;
+      i1_ = i1 + (i0 >> (uint32_t)51U); /* one more carry, limb 0 -> limb 1, after the fold above */
+      output[0U] = i0_;
+      output[1U] = i1_;
+    }
+  }
+}
+
+inline static void Hacl_Bignum_Fsquare_fsquare__(uint128_t *tmp, uint64_t *output)
+{ /* Writes the five unreduced 128-bit column sums of output squared into tmp; output is only read. */
+  uint64_t r0 = output[0U];
+  uint64_t r1 = output[1U];
+  uint64_t r2 = output[2U];
+  uint64_t r3 = output[3U];
+  uint64_t r4 = output[4U];
+  uint64_t d0 = r0 * (uint64_t)2U; /* doubled limbs cover the two symmetric cross terms r_i*r_j and r_j*r_i */
+  uint64_t d1 = r1 * (uint64_t)2U;
+  uint64_t d2 = r2 * (uint64_t)2U * (uint64_t)19U; /* doubled and pre-scaled by 19 for a wrapped-around term */
+  uint64_t d419 = r4 * (uint64_t)19U;
+  uint64_t d4 = d419 * (uint64_t)2U; /* 2 * 19 * r4 */
+  uint128_t s0 = (uint128_t)r0 * r0 + (uint128_t)d4 * r1 + (uint128_t)d2 * r3; /* products whose limb indices sum to 5 or more carry the factor 19 */
+  uint128_t s1 = (uint128_t)d0 * r1 + (uint128_t)d4 * r2 + (uint128_t)(r3 * (uint64_t)19U) * r3;
+  uint128_t s2 = (uint128_t)d0 * r2 + (uint128_t)r1 * r1 + (uint128_t)d4 * r3;
+  uint128_t s3 = (uint128_t)d0 * r3 + (uint128_t)d1 * r2 + (uint128_t)r4 * d419;
+  uint128_t s4 = (uint128_t)d0 * r4 + (uint128_t)d1 * r3 + (uint128_t)r2 * r2; /* top column has no wrapped terms */
+  tmp[0U] = s0;
+  tmp[1U] = s1;
+  tmp[2U] = s2;
+  tmp[3U] = s3;
+  tmp[4U] = s4;
+}
+
+inline static void Hacl_Bignum_Fsquare_fsquare_(uint128_t *tmp, uint64_t *output)
+{ /* Squares output in place, using tmp (five 128-bit limbs) as scratch. */
+  uint128_t b4;
+  uint128_t b0;
+  uint128_t b4_;
+  uint128_t b0_;
+  uint64_t i0;
+  uint64_t i1;
+  uint64_t i0_;
+  uint64_t i1_;
+  Hacl_Bignum_Fsquare_fsquare__(tmp, output); /* tmp = unreduced square of output */
+  Hacl_Bignum_Fproduct_carry_wide_(tmp); /* limbs 0..3 trimmed to 51 bits, excess collected in limb 4 */
+  b4 = tmp[4U];
+  b0 = tmp[0U];
+  b4_ = b4 & (uint128_t)(uint64_t)0x7ffffffffffffU; /* keep 51 bits of limb 4 */
+  b0_ = b0 + (uint128_t)(uint64_t)19U * (uint64_t)(b4 >> (uint32_t)51U); /* fold limb 4's excess into limb 0, scaled by 19 */
+  tmp[4U] = b4_;
+  tmp[0U] = b0_;
+  Hacl_Bignum_Fproduct_copy_from_wide_(output, tmp); /* narrow each limb to 64 bits */
+  i0 = output[0U];
+  i1 = output[1U];
+  i0_ = i0 & (uint64_t)0x7ffffffffffffU;
+  i1_ = i1 + (i0 >> (uint32_t)51U); /* one more carry, limb 0 -> limb 1, after the fold above */
+  output[0U] = i0_;
+  output[1U] = i1_;
+}
+
+static void
+Hacl_Bignum_Fsquare_fsquare_times_(uint64_t *input, uint128_t *tmp, uint32_t count1)
+{ /* Squares input in place count1 times, with tmp as scratch. */
+  uint32_t i;
+  Hacl_Bignum_Fsquare_fsquare_(tmp, input); /* unconditional first squaring: count1 == 0 still squares once */
+  for (i = (uint32_t)1U; i < count1; i = i + (uint32_t)1U)
+    Hacl_Bignum_Fsquare_fsquare_(tmp, input);
+}
+
+inline static void
+Hacl_Bignum_Fsquare_fsquare_times(uint64_t *output, uint64_t *input, uint32_t count1)
+{ /* Copies input to output, then squares output in place count1 times; input is left untouched. */
+  KRML_CHECK_SIZE(sizeof (uint128_t), (uint32_t)5U);
+  {
+    uint128_t t[5U]; /* scratch for the squaring routine */
+    {
+      uint32_t _i;
+      for (_i = 0U; _i < (uint32_t)5U; ++_i)
+        t[_i] = (uint128_t)(uint64_t)0U;
+    }
+    memcpy(output, input, (uint32_t)5U * sizeof input[0U]);
+    Hacl_Bignum_Fsquare_fsquare_times_(output, t, count1);
+  }
+}
+
+inline static void Hacl_Bignum_Fsquare_fsquare_times_inplace(uint64_t *output, uint32_t count1)
+{ /* Squares output in place count1 times, allocating the 128-bit scratch locally. */
+  KRML_CHECK_SIZE(sizeof (uint128_t), (uint32_t)5U);
+  {
+    uint128_t t[5U]; /* scratch for the squaring routine */
+    {
+      uint32_t _i;
+      for (_i = 0U; _i < (uint32_t)5U; ++_i)
+        t[_i] = (uint128_t)(uint64_t)0U;
+    }
+    Hacl_Bignum_Fsquare_fsquare_times_(output, t, count1);
+  }
+}
+
+inline static void Hacl_Bignum_Crecip_crecip(uint64_t *out, uint64_t *z)
+{ /* Raises z to the power 2^255 - 21 by a fixed chain of squarings and multiplications; exponents of z are noted per step. */
+  uint64_t buf[20U] = { 0U }; /* four 5-limb temporaries at offsets 0, 5, 10 and 15 */
+  uint64_t *a0 = buf;
+  uint64_t *t00 = buf + (uint32_t)5U;
+  uint64_t *b0 = buf + (uint32_t)10U;
+  uint64_t *t01; /* t01/b1/c0 and later a/t0/b/c are new names for the same four regions of buf */
+  uint64_t *b1;
+  uint64_t *c0;
+  uint64_t *a;
+  uint64_t *t0;
+  uint64_t *b;
+  uint64_t *c;
+  Hacl_Bignum_Fsquare_fsquare_times(a0, z, (uint32_t)1U); /* a0 = z^2 */
+  Hacl_Bignum_Fsquare_fsquare_times(t00, a0, (uint32_t)2U); /* t00 = z^8 */
+  Hacl_Bignum_Fmul_fmul(b0, t00, z); /* b0 = z^9 */
+  Hacl_Bignum_Fmul_fmul(a0, b0, a0); /* a0 = z^11 */
+  Hacl_Bignum_Fsquare_fsquare_times(t00, a0, (uint32_t)1U); /* t00 = z^22 */
+  Hacl_Bignum_Fmul_fmul(b0, t00, b0); /* b0 = z^31 = z^(2^5 - 1) */
+  Hacl_Bignum_Fsquare_fsquare_times(t00, b0, (uint32_t)5U); /* t00 = z^(2^10 - 2^5) */
+  t01 = buf + (uint32_t)5U;
+  b1 = buf + (uint32_t)10U;
+  c0 = buf + (uint32_t)15U;
+  Hacl_Bignum_Fmul_fmul(b1, t01, b1); /* b1 = z^(2^10 - 1) */
+  Hacl_Bignum_Fsquare_fsquare_times(t01, b1, (uint32_t)10U); /* t01 = z^(2^20 - 2^10) */
+  Hacl_Bignum_Fmul_fmul(c0, t01, b1); /* c0 = z^(2^20 - 1) */
+  Hacl_Bignum_Fsquare_fsquare_times(t01, c0, (uint32_t)20U); /* t01 = z^(2^40 - 2^20) */
+  Hacl_Bignum_Fmul_fmul(t01, t01, c0); /* t01 = z^(2^40 - 1) */
+  Hacl_Bignum_Fsquare_fsquare_times_inplace(t01, (uint32_t)10U); /* t01 = z^(2^50 - 2^10) */
+  Hacl_Bignum_Fmul_fmul(b1, t01, b1); /* b1 = z^(2^50 - 1) */
+  Hacl_Bignum_Fsquare_fsquare_times(t01, b1, (uint32_t)50U); /* t01 = z^(2^100 - 2^50) */
+  a = buf;
+  t0 = buf + (uint32_t)5U;
+  b = buf + (uint32_t)10U;
+  c = buf + (uint32_t)15U;
+  Hacl_Bignum_Fmul_fmul(c, t0, b); /* c = z^(2^100 - 1) */
+  Hacl_Bignum_Fsquare_fsquare_times(t0, c, (uint32_t)100U); /* t0 = z^(2^200 - 2^100) */
+  Hacl_Bignum_Fmul_fmul(t0, t0, c); /* t0 = z^(2^200 - 1) */
+  Hacl_Bignum_Fsquare_fsquare_times_inplace(t0, (uint32_t)50U); /* t0 = z^(2^250 - 2^50) */
+  Hacl_Bignum_Fmul_fmul(t0, t0, b); /* t0 = z^(2^250 - 1) */
+  Hacl_Bignum_Fsquare_fsquare_times_inplace(t0, (uint32_t)5U); /* t0 = z^(2^255 - 32) */
+  Hacl_Bignum_Fmul_fmul(out, t0, a); /* out = z^(2^255 - 21), i.e. z^(p - 2) for p = 2^255 - 19 (presumably the modular inverse of z) */
+}
+
+inline static void Hacl_Bignum_fsum(uint64_t *a, uint64_t *b)
+{ /* Limbwise in-place addition over five limbs: a[i] += b[i], with no carry propagation. */
+  uint32_t i;
+  for (i = (uint32_t)0U; i < (uint32_t)5U; i = i + (uint32_t)1U)
+  {
+    uint64_t xi = a[i];
+    uint64_t yi = b[i];
+    a[i] = xi + yi;
+  }
+}
+
+inline static void Hacl_Bignum_fdifference(uint64_t *a, uint64_t *b)
+{ /* Limbwise in-place subtraction with a bias: a[i] = (b[i] + bias[i]) - a[i]. Note the result is b - a, stored in a. */
+  uint64_t tmp[5U] = { 0U };
+  uint64_t b0;
+  uint64_t b1;
+  uint64_t b2;
+  uint64_t b3;
+  uint64_t b4;
+  memcpy(tmp, b, (uint32_t)5U * sizeof b[0U]); /* b itself is never written */
+  b0 = tmp[0U];
+  b1 = tmp[1U];
+  b2 = tmp[2U];
+  b3 = tmp[3U];
+  b4 = tmp[4U];
+  tmp[0U] = b0 + (uint64_t)0x3fffffffffff68U; /* 8 * (2^51 - 19) */
+  tmp[1U] = b1 + (uint64_t)0x3ffffffffffff8U; /* 8 * (2^51 - 1); together the five biases are 8 * (2^255 - 19) in 51-bit limbs */
+  tmp[2U] = b2 + (uint64_t)0x3ffffffffffff8U;
+  tmp[3U] = b3 + (uint64_t)0x3ffffffffffff8U;
+  tmp[4U] = b4 + (uint64_t)0x3ffffffffffff8U;
+  {
+    uint32_t i;
+    for (i = (uint32_t)0U; i < (uint32_t)5U; i = i + (uint32_t)1U)
+    {
+      uint64_t xi = a[i];
+      uint64_t yi = tmp[i];
+      a[i] = yi - xi; /* the bias presumably keeps this from wrapping below zero -- relies on a[i] being small enough */
+    }
+  }
+}
+
+inline static void Hacl_Bignum_fscalar(uint64_t *output, uint64_t *b, uint64_t s)
+{ /* Writes b times the scalar s to output, with carries over limbs 0..3 and the x19 top fold applied. */
+  KRML_CHECK_SIZE(sizeof (uint128_t), (uint32_t)5U);
+  {
+    uint128_t tmp[5U]; /* 128-bit products, one per limb */
+    {
+      uint32_t _i;
+      for (_i = 0U; _i < (uint32_t)5U; ++_i)
+        tmp[_i] = (uint128_t)(uint64_t)0U;
+    }
+    {
+      uint128_t b4;
+      uint128_t b0;
+      uint128_t b4_;
+      uint128_t b0_;
+      {
+        uint32_t i;
+        for (i = (uint32_t)0U; i < (uint32_t)5U; i = i + (uint32_t)1U)
+        {
+          uint64_t xi = b[i];
+          tmp[i] = (uint128_t)xi * s; /* full 64x64 -> 128-bit product */
+        }
+      }
+      Hacl_Bignum_Fproduct_carry_wide_(tmp); /* limbs 0..3 trimmed to 51 bits, excess collected in limb 4 */
+      b4 = tmp[4U];
+      b0 = tmp[0U];
+      b4_ = b4 & (uint128_t)(uint64_t)0x7ffffffffffffU; /* keep 51 bits of limb 4 */
+      b0_ = b0 + (uint128_t)(uint64_t)19U * (uint64_t)(b4 >> (uint32_t)51U); /* fold limb 4's excess into limb 0, scaled by 19 */
+      tmp[4U] = b4_;
+      tmp[0U] = b0_;
+      Hacl_Bignum_Fproduct_copy_from_wide_(output, tmp); /* unlike fmul, no extra limb 0 -> limb 1 carry follows */
+    }
+  }
+}
+
+inline static void Hacl_Bignum_fmul(uint64_t *output, uint64_t *a, uint64_t *b)
+{ /* Thin wrapper: forwards unchanged to Hacl_Bignum_Fmul_fmul. */
+  Hacl_Bignum_Fmul_fmul(output, a, b);
+}
+
+inline static void Hacl_Bignum_crecip(uint64_t *output, uint64_t *input)
+{ /* Thin wrapper: forwards unchanged to Hacl_Bignum_Crecip_crecip. */
+  Hacl_Bignum_Crecip_crecip(output, input);
+}
+
+static void
+Hacl_EC_Point_swap_conditional_step(uint64_t *a, uint64_t *b, uint64_t swap1, uint32_t ctr)
+{ /* Branch-free masked swap of one limb: a[ctr-1] and b[ctr-1] are exchanged when swap1 is all-ones, unchanged when it is 0. */
+  uint32_t i = ctr - (uint32_t)1U; /* ctr is one-based; callers must pass ctr >= 1 */
+  uint64_t ai = a[i];
+  uint64_t bi = b[i];
+  uint64_t x = swap1 & (ai ^ bi); /* the difference of the limbs, or 0 when the mask is 0 */
+  uint64_t ai1 = ai ^ x; /* becomes bi when x is the full difference */
+  uint64_t bi1 = bi ^ x; /* becomes ai when x is the full difference */
+  a[i] = ai1;
+  b[i] = bi1;
+}
+
+static void
+Hacl_EC_Point_swap_conditional_(uint64_t *a, uint64_t *b, uint64_t swap1, uint32_t ctr)
+{ /* Applies the masked swap to limbs ctr-1 down to 0 by recursing on ctr; does nothing when ctr is 0. */
+  if (!(ctr == (uint32_t)0U))
+  {
+    uint32_t i;
+    Hacl_EC_Point_swap_conditional_step(a, b, swap1, ctr); /* handles limb ctr-1 */
+    i = ctr - (uint32_t)1U;
+    Hacl_EC_Point_swap_conditional_(a, b, swap1, i); /* tail call for the remaining lower limbs */
+  }
+}
+
+static void Hacl_EC_Point_swap_conditional(uint64_t *a, uint64_t *b, uint64_t iswap)
+{ /* Masked swap of two 10-limb buffers, done as two 5-limb halves. */
+  uint64_t swap1 = (uint64_t)0U - iswap; /* iswap == 1 gives an all-ones mask, iswap == 0 gives 0; other values are presumably never passed */
+  Hacl_EC_Point_swap_conditional_(a, b, swap1, (uint32_t)5U); /* limbs 0..4 */
+  Hacl_EC_Point_swap_conditional_(a + (uint32_t)5U, b + (uint32_t)5U, swap1, (uint32_t)5U); /* limbs 5..9 */
+}
+
+static void Hacl_EC_Point_copy(uint64_t *output, uint64_t *input)
+{ /* Copies ten 64-bit limbs from input to output as two 5-limb halves. */
+  memcpy(output, input, (uint32_t)5U * sizeof input[0U]); /* limbs 0..4 */
+  memcpy(output + (uint32_t)5U, /* limbs 5..9 */
+    input + (uint32_t)5U,
+    (uint32_t)5U * sizeof (input + (uint32_t)5U)[0U]); /* element size is that of uint64_t, so this is also 5 limbs */
+}
+
+static void Hacl_EC_Format_fexpand(uint64_t *output, uint8_t *input)
+{ /* Unpacks input bytes into five 51-bit limbs; limb k starts at bit 51*k. The last load reads input[24..31], so 32 bytes are needed. */
+  uint64_t i0 = load64_le(input); /* load64_le expands to READ64LE, defined elsewhere; presumably a little-endian 64-bit load */
+  uint8_t *x00 = input + (uint32_t)6U;
+  uint64_t i1 = load64_le(x00); /* byte 6 = bit 48; shifted by 3 below to reach bit 51 */
+  uint8_t *x01 = input + (uint32_t)12U;
+  uint64_t i2 = load64_le(x01); /* byte 12 = bit 96; shifted by 6 below to reach bit 102 */
+  uint8_t *x02 = input + (uint32_t)19U;
+  uint64_t i3 = load64_le(x02); /* byte 19 = bit 152; shifted by 1 below to reach bit 153 */
+  uint8_t *x0 = input + (uint32_t)24U;
+  uint64_t i4 = load64_le(x0); /* byte 24 = bit 192; shifted by 12 below to reach bit 204 */
+  uint64_t output0 = i0 & (uint64_t)0x7ffffffffffffU;
+  uint64_t output1 = i1 >> (uint32_t)3U & (uint64_t)0x7ffffffffffffU;
+  uint64_t output2 = i2 >> (uint32_t)6U & (uint64_t)0x7ffffffffffffU;
+  uint64_t output3 = i3 >> (uint32_t)1U & (uint64_t)0x7ffffffffffffU;
+  uint64_t output4 = i4 >> (uint32_t)12U & (uint64_t)0x7ffffffffffffU; /* the mask drops bit 255 of the input */
+  output[0U] = output0;
+  output[1U] = output1;
+  output[2U] = output2;
+  output[3U] = output3;
+  output[4U] = output4;
+}
+
+static void Hacl_EC_Format_fcontract_first_carry_pass(uint64_t *input)
+{ /* One in-place carry sweep from limb 0 up to limb 4: limbs 0..3 end up below 2^51, limb 4 keeps any excess. */
+  uint64_t t0 = input[0U];
+  uint64_t t1 = input[1U];
+  uint64_t t2 = input[2U];
+  uint64_t t3 = input[3U];
+  uint64_t t4 = input[4U];
+  uint64_t t1_ = t1 + (t0 >> (uint32_t)51U); /* carry out of limb 0 */
+  uint64_t t0_ = t0 & (uint64_t)0x7ffffffffffffU;
+  uint64_t t2_ = t2 + (t1_ >> (uint32_t)51U); /* each carry uses the already-updated lower limb */
+  uint64_t t1__ = t1_ & (uint64_t)0x7ffffffffffffU;
+  uint64_t t3_ = t3 + (t2_ >> (uint32_t)51U);
+  uint64_t t2__ = t2_ & (uint64_t)0x7ffffffffffffU;
+  uint64_t t4_ = t4 + (t3_ >> (uint32_t)51U);
+  uint64_t t3__ = t3_ & (uint64_t)0x7ffffffffffffU;
+  input[0U] = t0_;
+  input[1U] = t1__;
+  input[2U] = t2__;
+  input[3U] = t3__;
+  input[4U] = t4_; /* not masked here */
+}
+
+static void Hacl_EC_Format_fcontract_first_carry_full(uint64_t *input)
+{ /* First carry sweep followed by the top-limb carry step, both in place. */
+  Hacl_EC_Format_fcontract_first_carry_pass(input);
+  Hacl_Bignum_Modulo_carry_top(input); /* defined elsewhere; presumably folds limb 4's overflow back into limb 0 -- confirm */
+}
+
+static void Hacl_EC_Format_fcontract_second_carry_pass(uint64_t *input)
+{ /* Same carry sweep as the first pass (the body is statement-for-statement identical). */
+  uint64_t t0 = input[0U];
+  uint64_t t1 = input[1U];
+  uint64_t t2 = input[2U];
+  uint64_t t3 = input[3U];
+  uint64_t t4 = input[4U];
+  uint64_t t1_ = t1 + (t0 >> (uint32_t)51U); /* carry out of limb 0 */
+  uint64_t t0_ = t0 & (uint64_t)0x7ffffffffffffU; /* keep the low 51 bits */
+  uint64_t t2_ = t2 + (t1_ >> (uint32_t)51U);
+  uint64_t t1__ = t1_ & (uint64_t)0x7ffffffffffffU;
+  uint64_t t3_ = t3 + (t2_ >> (uint32_t)51U);
+  uint64_t t2__ = t2_ & (uint64_t)0x7ffffffffffffU;
+  uint64_t t4_ = t4 + (t3_ >> (uint32_t)51U); /* limb 4 receives a carry but is not masked here */
+  uint64_t t3__ = t3_ & (uint64_t)0x7ffffffffffffU;
+  input[0U] = t0_;
+  input[1U] = t1__;
+  input[2U] = t2__;
+  input[3U] = t3__;
+  input[4U] = t4_;
+}
+
+static void Hacl_EC_Format_fcontract_second_carry_full(uint64_t *input)
+{ /* Second carry sweep, top-limb carry step, then one extra carry from limb 0 into limb 1. */
+  uint64_t i0;
+  uint64_t i1;
+  uint64_t i0_;
+  uint64_t i1_;
+  Hacl_EC_Format_fcontract_second_carry_pass(input);
+  Hacl_Bignum_Modulo_carry_top(input); /* defined elsewhere */
+  i0 = input[0U]; /* re-read: carry_top may have changed limb 0 */
+  i1 = input[1U];
+  i0_ = i0 & (uint64_t)0x7ffffffffffffU; /* limb 0 back to 51 bits */
+  i1_ = i1 + (i0 >> (uint32_t)51U); /* its overflow goes to limb 1 */
+  input[0U] = i0_;
+  input[1U] = i1_;
+}
+
+static void Hacl_EC_Format_fcontract_trim(uint64_t *input)
+{ /* Branch-free conditional subtraction of the limbs (2^51-19, 2^51-1 x4), which encode 2^255-19. */
+  uint64_t a0 = input[0U];
+  uint64_t a1 = input[1U];
+  uint64_t a2 = input[2U];
+  uint64_t a3 = input[3U];
+  uint64_t a4 = input[4U];
+  uint64_t mask0 = FStar_UInt64_gte_mask(a0, (uint64_t)0x7ffffffffffedU); /* presumably all-ones iff a0 >= 2^51-19 -- confirm */
+  uint64_t mask1 = FStar_UInt64_eq_mask(a1, (uint64_t)0x7ffffffffffffU); /* presumably all-ones iff a1 == 2^51-1 -- confirm */
+  uint64_t mask2 = FStar_UInt64_eq_mask(a2, (uint64_t)0x7ffffffffffffU);
+  uint64_t mask3 = FStar_UInt64_eq_mask(a3, (uint64_t)0x7ffffffffffffU);
+  uint64_t mask4 = FStar_UInt64_eq_mask(a4, (uint64_t)0x7ffffffffffffU);
+  uint64_t mask = (((mask0 & mask1) & mask2) & mask3) & mask4; /* set only if all five tests hold */
+  uint64_t a0_ = a0 - ((uint64_t)0x7ffffffffffedU & mask); /* subtracts the constant, or 0 when mask is 0 */
+  uint64_t a1_ = a1 - ((uint64_t)0x7ffffffffffffU & mask);
+  uint64_t a2_ = a2 - ((uint64_t)0x7ffffffffffffU & mask);
+  uint64_t a3_ = a3 - ((uint64_t)0x7ffffffffffffU & mask);
+  uint64_t a4_ = a4 - ((uint64_t)0x7ffffffffffffU & mask);
+  input[0U] = a0_;
+  input[1U] = a1_;
+  input[2U] = a2_;
+  input[3U] = a3_;
+  input[4U] = a4_;
+}
+
+static void Hacl_EC_Format_fcontract_store(uint8_t *output, uint64_t *input)
+{ /* Packs five 51-bit limbs into four 64-bit words and stores them at output offsets 0, 8, 16, 24. */
+  uint64_t t0 = input[0U];
+  uint64_t t1 = input[1U];
+  uint64_t t2 = input[2U];
+  uint64_t t3 = input[3U];
+  uint64_t t4 = input[4U];
+  uint64_t o0 = t1 << (uint32_t)51U | t0; /* t0 plus the low 13 bits of t1 */
+  uint64_t o1 = t2 << (uint32_t)38U | t1 >> (uint32_t)13U; /* remaining 38 bits of t1, low 26 of t2 */
+  uint64_t o2 = t3 << (uint32_t)25U | t2 >> (uint32_t)26U; /* remaining 25 bits of t2, low 39 of t3 */
+  uint64_t o3 = t4 << (uint32_t)12U | t3 >> (uint32_t)39U; /* remaining 12 bits of t3, then t4 */
+  uint8_t *b0 = output;
+  uint8_t *b1 = output + (uint32_t)8U;
+  uint8_t *b2 = output + (uint32_t)16U;
+  uint8_t *b3 = output + (uint32_t)24U;
+  store64_le(b0, o0); /* store64_le is defined elsewhere; by its name an 8-byte little-endian store */
+  store64_le(b1, o1);
+  store64_le(b2, o2);
+  store64_le(b3, o3);
+}
+
+static void Hacl_EC_Format_fcontract(uint8_t *output, uint64_t *input)
+{ /* Serializes a 5-limb element: two carry rounds, a conditional subtraction, then the byte store. */
+  Hacl_EC_Format_fcontract_first_carry_full(input); /* note: input is modified in place by each step */
+  Hacl_EC_Format_fcontract_second_carry_full(input);
+  Hacl_EC_Format_fcontract_trim(input);
+  Hacl_EC_Format_fcontract_store(output, input); /* writes 32 bytes to output */
+}
+
+static void Hacl_EC_Format_scalar_of_point(uint8_t *scalar, uint64_t *point)
+{ /* Writes the serialized product of point's first half and crecip(second half) to scalar. */
+  uint64_t *x = point; /* limbs 0..4 */
+  uint64_t *z = point + (uint32_t)5U; /* limbs 5..9 */
+  uint64_t buf[10U] = { 0U }; /* scratch: two 5-limb elements */
+  uint64_t *zmone = buf;
+  uint64_t *sc = buf + (uint32_t)5U;
+  Hacl_Bignum_crecip(zmone, z); /* defined elsewhere; presumably zmone = 1/z in the field -- confirm */
+  Hacl_Bignum_fmul(sc, x, zmone); /* sc = x * zmone */
+  Hacl_EC_Format_fcontract(scalar, sc);
+}
+
+static void /* Combined double-and-add step: writes new points to pp and ppq from p, pq and qmqp. */
+Hacl_EC_AddAndDouble_fmonty(
+  uint64_t *pp, /* out: 10 limbs (x2, z2) */
+  uint64_t *ppq, /* out: 10 limbs (x3, z3) */
+  uint64_t *p, /* in: 10 limbs (x, z); overwritten as scratch */
+  uint64_t *pq, /* in: 10 limbs (xprime, zprime); overwritten as scratch */
+  uint64_t *qmqp /* in: only the first 5 limbs (qx) are read */
+)
+{
+  uint64_t *qx = qmqp;
+  uint64_t *x2 = pp;
+  uint64_t *z2 = pp + (uint32_t)5U;
+  uint64_t *x3 = ppq;
+  uint64_t *z3 = ppq + (uint32_t)5U;
+  uint64_t *x = p;
+  uint64_t *z = p + (uint32_t)5U;
+  uint64_t *xprime = pq;
+  uint64_t *zprime = pq + (uint32_t)5U;
+  uint64_t buf[40U] = { 0U }; /* eight 5-limb scratch slots at offsets 0, 5, ..., 35 */
+  uint64_t *origx = buf;
+  uint64_t *origxprime0 = buf + (uint32_t)5U;
+  uint64_t *xxprime0 = buf + (uint32_t)25U;
+  uint64_t *zzprime0 = buf + (uint32_t)30U;
+  uint64_t *origxprime;
+  uint64_t *xx0;
+  uint64_t *zz0;
+  uint64_t *xxprime;
+  uint64_t *zzprime;
+  uint64_t *zzzprime;
+  uint64_t *zzz;
+  uint64_t *xx;
+  uint64_t *zz;
+  uint64_t scalar;
+  memcpy(origx, x, (uint32_t)5U * sizeof x[0U]); /* save x before fsum overwrites it */
+  Hacl_Bignum_fsum(x, z); /* fsum/fdifference are defined elsewhere; presumably x = x + z */
+  Hacl_Bignum_fdifference(z, origx); /* presumably z = origx - z -- confirm operand order */
+  memcpy(origxprime0, xprime, (uint32_t)5U * sizeof xprime[0U]);
+  Hacl_Bignum_fsum(xprime, zprime);
+  Hacl_Bignum_fdifference(zprime, origxprime0);
+  Hacl_Bignum_fmul(xxprime0, xprime, z);
+  Hacl_Bignum_fmul(zzprime0, x, zprime);
+  origxprime = buf + (uint32_t)5U; /* same slot as origxprime0, reused */
+  xx0 = buf + (uint32_t)15U;
+  zz0 = buf + (uint32_t)20U;
+  xxprime = buf + (uint32_t)25U; /* same slot as xxprime0 */
+  zzprime = buf + (uint32_t)30U; /* same slot as zzprime0 */
+  zzzprime = buf + (uint32_t)35U;
+  memcpy(origxprime, xxprime, (uint32_t)5U * sizeof xxprime[0U]);
+  Hacl_Bignum_fsum(xxprime, zzprime);
+  Hacl_Bignum_fdifference(zzprime, origxprime);
+  Hacl_Bignum_Fsquare_fsquare_times(x3, xxprime, (uint32_t)1U); /* count 1: presumably a single squaring */
+  Hacl_Bignum_Fsquare_fsquare_times(zzzprime, zzprime, (uint32_t)1U);
+  Hacl_Bignum_fmul(z3, zzzprime, qx);
+  Hacl_Bignum_Fsquare_fsquare_times(xx0, x, (uint32_t)1U);
+  Hacl_Bignum_Fsquare_fsquare_times(zz0, z, (uint32_t)1U);
+  zzz = buf + (uint32_t)10U;
+  xx = buf + (uint32_t)15U; /* same slot as xx0 */
+  zz = buf + (uint32_t)20U; /* same slot as zz0 */
+  Hacl_Bignum_fmul(x2, xx, zz);
+  Hacl_Bignum_fdifference(zz, xx);
+  scalar = (uint64_t)121665U; /* equals (486662 - 2) / 4, the a24 constant RFC 7748 gives for curve25519 */
+  Hacl_Bignum_fscalar(zzz, zz, scalar); /* presumably zzz = zz * scalar */
+  Hacl_Bignum_fsum(zzz, xx);
+  Hacl_Bignum_fmul(z2, zzz, zz);
+}
+
+static void /* One ladder step driven by the top bit of byt: swap, add-and-double, swap. */
+Hacl_EC_Ladder_SmallLoop_cmult_small_loop_step(
+  uint64_t *nq,
+  uint64_t *nqpq,
+  uint64_t *nq2, /* receives the first output of fmonty */
+  uint64_t *nqpq2, /* receives the second output of fmonty */
+  uint64_t *q,
+  uint8_t byt
+)
+{
+  uint64_t bit0 = (uint64_t)(byt >> (uint32_t)7U); /* most significant bit of byt, 0 or 1 */
+  uint64_t bit;
+  Hacl_EC_Point_swap_conditional(nq, nqpq, bit0);
+  Hacl_EC_AddAndDouble_fmonty(nq2, nqpq2, nq, nqpq, q);
+  bit = (uint64_t)(byt >> (uint32_t)7U); /* byt is unchanged, so this equals bit0 */
+  Hacl_EC_Point_swap_conditional(nq2, nqpq2, bit); /* swap the outputs on the same bit */
+}
+
+static void /* Two ladder steps for the top two bits of byt; the result ends up back in nq/nqpq. */
+Hacl_EC_Ladder_SmallLoop_cmult_small_loop_double_step(
+  uint64_t *nq,
+  uint64_t *nqpq,
+  uint64_t *nq2,
+  uint64_t *nqpq2,
+  uint64_t *q,
+  uint8_t byt
+)
+{
+  uint8_t byt1;
+  Hacl_EC_Ladder_SmallLoop_cmult_small_loop_step(nq, nqpq, nq2, nqpq2, q, byt); /* (nq, nqpq) -> (nq2, nqpq2) */
+  byt1 = byt << (uint32_t)1U; /* bring the next bit to the top */
+  Hacl_EC_Ladder_SmallLoop_cmult_small_loop_step(nq2, nqpq2, nq, nqpq, q, byt1); /* roles exchanged: back into (nq, nqpq) */
+}
+
+static void /* Runs i double steps recursively, consuming two bits of byt (from the top) per level. */
+Hacl_EC_Ladder_SmallLoop_cmult_small_loop(
+  uint64_t *nq,
+  uint64_t *nqpq,
+  uint64_t *nq2,
+  uint64_t *nqpq2,
+  uint64_t *q,
+  uint8_t byt,
+  uint32_t i /* remaining double steps; 0 ends the recursion */
+)
+{
+  if (!(i == (uint32_t)0U))
+  {
+    uint32_t i_ = i - (uint32_t)1U;
+    uint8_t byt_;
+    Hacl_EC_Ladder_SmallLoop_cmult_small_loop_double_step(nq, nqpq, nq2, nqpq2, q, byt);
+    byt_ = byt << (uint32_t)2U; /* drop the two bits just processed */
+    Hacl_EC_Ladder_SmallLoop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q, byt_, i_);
+  }
+}
+
+static void /* Processes scalar bytes n1[i-1] down to n1[0], one byte per recursion level. */
+Hacl_EC_Ladder_BigLoop_cmult_big_loop(
+  uint8_t *n1, /* scalar bytes; indices 0..i-1 are read */
+  uint64_t *nq,
+  uint64_t *nqpq,
+  uint64_t *nq2,
+  uint64_t *nqpq2,
+  uint64_t *q,
+  uint32_t i /* number of bytes still to process; 0 ends the recursion */
+)
+{
+  if (!(i == (uint32_t)0U))
+  {
+    uint32_t i1 = i - (uint32_t)1U;
+    uint8_t byte = n1[i1]; /* highest unprocessed byte */
+    Hacl_EC_Ladder_SmallLoop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q, byte, (uint32_t)4U); /* 4 double steps = 8 bits */
+    Hacl_EC_Ladder_BigLoop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, i1);
+  }
+}
+
+static void Hacl_EC_Ladder_cmult(uint64_t *result, uint8_t *n1, uint64_t *q)
+{ /* Runs the ladder over the 32 bytes of n1 starting from q; copies the final 10-limb nq to result. */
+  uint64_t point_buf[40U] = { 0U }; /* four 10-limb points, zero-initialized */
+  uint64_t *nq = point_buf;
+  uint64_t *nqpq = point_buf + (uint32_t)10U;
+  uint64_t *nq2 = point_buf + (uint32_t)20U;
+  uint64_t *nqpq2 = point_buf + (uint32_t)30U;
+  Hacl_EC_Point_copy(nqpq, q); /* nqpq starts as q */
+  nq[0U] = (uint64_t)1U; /* nq starts with limb 0 = 1 and every other limb 0 */
+  Hacl_EC_Ladder_BigLoop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, (uint32_t)32U); /* bytes 31 down to 0 */
+  Hacl_EC_Point_copy(result, nq);
+}
+
+void Hacl_Curve25519_crypto_scalarmult(uint8_t *mypublic, uint8_t *secret, uint8_t *basepoint)
+{ /* Writes 32 bytes to mypublic: the ladder result for the masked copy of secret and the point basepoint. */
+  uint64_t buf0[10U] = { 0U }; /* the input point as (x0, z) */
+  uint64_t *x0 = buf0;
+  uint64_t *z = buf0 + (uint32_t)5U;
+  uint64_t *q;
+  Hacl_EC_Format_fexpand(x0, basepoint); /* x0 = limbs unpacked from basepoint */
+  z[0U] = (uint64_t)1U; /* z = 1 */
+  q = buf0;
+  {
+    uint8_t e[32U] = { 0U }; /* local copy of the scalar; secret itself is not modified */
+    uint8_t e0;
+    uint8_t e31;
+    uint8_t e01;
+    uint8_t e311;
+    uint8_t e312;
+    uint8_t *scalar;
+    memcpy(e, secret, (uint32_t)32U * sizeof secret[0U]);
+    e0 = e[0U];
+    e31 = e[31U];
+    e01 = e0 & (uint8_t)248U; /* clear the low three bits of byte 0 */
+    e311 = e31 & (uint8_t)127U; /* clear the top bit of byte 31 */
+    e312 = e311 | (uint8_t)64U; /* set bit 6 of byte 31 */
+    e[0U] = e01;
+    e[31U] = e312;
+    scalar = e;
+    {
+      uint64_t buf[15U] = { 0U };
+      uint64_t *nq = buf;
+      uint64_t *x = nq;
+      x[0U] = (uint64_t)1U; /* the first 10 limbs are overwritten by cmult's final copy */
+      Hacl_EC_Ladder_cmult(nq, scalar, q);
+      Hacl_EC_Format_scalar_of_point(mypublic, nq); /* reads limbs 0..9 of nq */
+    }
+  }
+}
diff --git a/third_party/mbedtls/test/secp384r1_test.c b/third_party/mbedtls/test/secp384r1_test.c
new file mode 100644
index 000000000..68de75ce7
--- /dev/null
+++ b/third_party/mbedtls/test/secp384r1_test.c
@@ -0,0 +1,294 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8                                :vi│
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/rand/rand.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+#include "libc/testlib/ezbench.h"
+#include "libc/testlib/testlib.h"
+#include "third_party/mbedtls/bignum.h"
+#include "third_party/mbedtls/ecp.h"
+#include "third_party/mbedtls/ecp_internal.h"
+#include "third_party/mbedtls/math.h"
+#ifdef MBEDTLS_ECP_C
+
+int ecp_mod_p384_old(mbedtls_mpi *);
+
+int GetEntropy(void *c, unsigned char *p, size_t n) { /* RNG callback handed to mbedtls_mpi_fill_random(); c is unused */
+  rngset(p, n, rand64, -1); /* rngset is defined elsewhere; presumably fills p[0..n) from rand64 -- confirm */
+  return 0; /* always reports success */
+}
+
+TEST(secp384r1, testIsTheSame) { /* new and old P-384 reductions must agree on 1000 random inputs */
+  int i, j; /* i counts trials; j walks limbs when reporting, so it cannot clobber i */
+  mbedtls_mpi A, B;
+  mbedtls_mpi_init(&A);
+  mbedtls_mpi_init(&B);
+  for (i = 0; i < 1000; ++i) { /* draw a fresh 96-byte operand on every trial */
+    mbedtls_mpi_fill_random(&A, 12 * 8, GetEntropy, 0);
+    mbedtls_mpi_copy(&B, &A); /* both reductions start from the same value */
+    ecp_mod_p384(&A); /* implementation under test */
+    ecp_mod_p384_old(&B); /* previous implementation, used as the oracle */
+    if (memcmp(A.p, B.p, 12 * 8)) {
+      for (j = 0; j < 12; ++j) {
+        printf("0x%016lx vs. 0x%016lx %d\n", A.p[j], B.p[j], A.p[j] == B.p[j]);
+      }
+      exit(1);
+    }
+  }
+  mbedtls_mpi_free(&B);
+  mbedtls_mpi_free(&A);
+}
+
+static inline bool mbedtls_p384_gte(uint64_t p[7]) {
+  /* Reports whether the 7-limb little-endian value p is >= the modulus */
+  /* whose limbs are the constants below. Limb 6 is read as signed. */
+  if ((int64_t)p[6] > 0) return true;
+  /* The three top modulus limbs are all ones, so a limb there can never */
+  /* exceed them: anything short of all ones is already below. */
+  if (p[5] != 0xffffffffffffffff) return false;
+  if (p[4] != 0xffffffffffffffff) return false;
+  if (p[3] != 0xffffffffffffffff) return false;
+  /* Remaining limbs: the first one that differs decides, and full */
+  /* equality counts as "greater or equal". */
+  if (p[2] != 0xfffffffffffffffe) return p[2] > 0xfffffffffffffffe;
+  if (p[1] != 0xffffffff00000000) return p[1] > 0xffffffff00000000;
+  return p[0] >= 0x00000000ffffffff;
+}
+
+static inline void mbedtls_p384_gro(uint64_t p[7]) { /* p += modulus, carry rippled through all 7 limbs */
+#if defined(__x86_64__) && !defined(__STRICT_ANSI__)
+  asm("addq\t%1,%0\n\t" /* p[0] += 0x00000000ffffffff */
+      "adcq\t%2,8+%0\n\t" /* p[1] += 0xffffffff00000000 + carry */
+      "adcq\t%3,16+%0\n\t" /* p[2] += 0xfffffffffffffffe + carry */
+      "adcq\t%4,24+%0\n\t" /* p[3..5] += all-ones + carry */
+      "adcq\t%4,32+%0\n\t"
+      "adcq\t%4,40+%0\n\t"
+      "adcq\t$0,48+%0" /* p[6] += carry */
+      : "+o"(*p) /* offsettable memory operand; limbs 1..6 are covered by the "memory" clobber */
+      : "r"(0x00000000ffffffffl), "r"(0xffffffff00000000),
+        "i"(0xfffffffffffffffel), "i"(0xffffffffffffffff)
+      : "memory", "cc");
+#else
+  uint64_t c; /* carry between limbs; ADC is a macro defined elsewhere */
+  ADC(p[0], p[0], 0x00000000ffffffff, 0, c);
+  ADC(p[1], p[1], 0xffffffff00000000, c, c);
+  ADC(p[2], p[2], 0xfffffffffffffffe, c, c);
+  ADC(p[3], p[3], 0xffffffffffffffff, c, c);
+  ADC(p[4], p[4], 0xffffffffffffffff, c, c);
+  ADC(p[5], p[5], 0xffffffffffffffff, c, c);
+  ADC(p[6], p[6], 0, c, c);
+#endif
+}
+
+static inline void mbedtls_p384_red(uint64_t p[7]) { /* p -= modulus, borrow rippled through all 7 limbs */
+#if defined(__x86_64__) && !defined(__STRICT_ANSI__)
+  asm("subq\t%1,%0\n\t" /* p[0] -= 0x00000000ffffffff */
+      "sbbq\t%2,8+%0\n\t" /* p[1] -= 0xffffffff00000000 + borrow */
+      "sbbq\t%3,16+%0\n\t" /* p[2] -= 0xfffffffffffffffe + borrow */
+      "sbbq\t%4,24+%0\n\t" /* p[3..5] -= all-ones + borrow */
+      "sbbq\t%4,32+%0\n\t"
+      "sbbq\t%4,40+%0\n\t"
+      "sbbq\t$0,48+%0" /* p[6] -= borrow */
+      : "+o"(*p) /* offsettable memory operand; limbs 1..6 are covered by the "memory" clobber */
+      : "r"(0x00000000ffffffffl), "r"(0xffffffff00000000),
+        "i"(0xfffffffffffffffel), "i"(0xffffffffffffffff)
+      : "memory", "cc");
+#else
+  uint64_t c; /* borrow between limbs; SBB is a macro defined elsewhere */
+  SBB(p[0], p[0], 0x00000000ffffffff, 0, c);
+  SBB(p[1], p[1], 0xffffffff00000000, c, c);
+  SBB(p[2], p[2], 0xfffffffffffffffe, c, c);
+  SBB(p[3], p[3], 0xffffffffffffffff, c, c);
+  SBB(p[4], p[4], 0xffffffffffffffff, c, c);
+  SBB(p[5], p[5], 0xffffffffffffffff, c, c);
+  SBB(p[6], p[6], 0, c, c);
+#endif
+}
+
+static inline void mbedtls_p384_rum(uint64_t p[7]) { /* subtract the modulus until gte() reports p below it */
+  while (mbedtls_p384_gte(p)) mbedtls_p384_red(p); /* no handling of a negative p[6] here, unlike mbedtls_p384_mod */
+}
+
+static inline void mbedtls_p384_mod(uint64_t X[12]) {
+  /* secp384r1() (defined elsewhere) runs first; the code below then */
+  /* corrects limbs 0..6 of X depending on the sign of limb 6. */
+  secp384r1(X);
+  if ((int64_t)X[6] >= 0) {
+    /* not negative: take the modulus off while the value is still >= it */
+    while (mbedtls_p384_gte(X)) mbedtls_p384_red(X);
+    return;
+  }
+  /* limb 6 negative: add the modulus back until the sign bit clears */
+  while ((int64_t)X[6] < 0) mbedtls_p384_gro(X);
+}
+
+TEST(secp384r1, needsDownwardCorrection) { /* all-ones 768-bit input must reduce to the limbs in W */
+  int i;
+  uint64_t P[6] = { /* modulus limbs, least significant first; never read below (reference only) */
+      0x00000000ffffffff,  //
+      0xffffffff00000000,  //
+      0xfffffffffffffffe,  //
+      0xffffffffffffffff,  //
+      0xffffffffffffffff,  //
+      0xffffffffffffffff,  //
+  };
+  uint64_t X[12] = { /* input: 2^768 - 1 */
+      0xffffffffffffffff,  //
+      0xffffffffffffffff,  //
+      0xffffffffffffffff,  //
+      0xffffffffffffffff,  //
+      0xffffffffffffffff,  //
+      0xffffffffffffffff,  //
+      0xffffffffffffffff,  //
+      0xffffffffffffffff,  //
+      0xffffffffffffffff,  //
+      0xffffffffffffffff,  //
+      0xffffffffffffffff,  //
+      0xffffffffffffffff,  //
+  };
+  uint64_t W[12] /* == X mod P */ = { /* limbs 5..11 are implicitly zero */
+      0xfffffffe00000000,  //
+      0x0000000200000000,  //
+      0xfffffffe00000000,  //
+      0x0000000200000000,  //
+      0x0000000000000001,  //
+  };
+  mbedtls_p384_mod(X); /* reduces X in place */
+  if (memcmp(W, X, 12 * 8)) { /* compares all 12 limbs, so X[6..11] must come back zero */
+    for (i = 0; i < 12; ++i) {
+      printf("0x%016lx vs. 0x%016lx %d\n", W[i], X[i], W[i] == X[i]);
+    }
+    exit(1);
+  }
+}
+
+TEST(secp384r1, needsUpwardCorrection) { /* input with only limb 11 set must reduce to the limbs in W */
+  int i;
+  uint64_t P[6] = { /* modulus limbs, least significant first; never read below (reference only) */
+      0x00000000ffffffff,  //
+      0xffffffff00000000,  //
+      0xfffffffffffffffe,  //
+      0xffffffffffffffff,  //
+      0xffffffffffffffff,  //
+      0xffffffffffffffff,  //
+  };
+  uint64_t X[12] = { /* input: 0xffffffff * 2^704 */
+      0x0000000000000000,  //
+      0x0000000000000000,  //
+      0x0000000000000000,  //
+      0x0000000000000000,  //
+      0x0000000000000000,  //
+      0x0000000000000000,  //
+      0x0000000000000000,  //
+      0x0000000000000000,  //
+      0x0000000000000000,  //
+      0x0000000000000000,  //
+      0x0000000000000000,  //
+      0x00000000ffffffff,  //
+  };
+  uint64_t W[12] /* == X mod P */ = { /* limbs 6..11 are implicitly zero */
+      0xffffffffffffffff,  //
+      0x0000000000000000,  //
+      0xfffffffefffffffd,  //
+      0x0000000100000000,  //
+      0x0000000000000000,  //
+      0x00000001ffffffff,  //
+  };
+  mbedtls_p384_mod(X); /* reduces X in place */
+  if (memcmp(W, X, 12 * 8)) { /* compares all 12 limbs, so X[6..11] must come back zero */
+    for (i = 0; i < 12; ++i) {
+      printf("0x%016lx vs. 0x%016lx %d\n", W[i], X[i], W[i] == X[i]);
+    }
+    exit(1);
+  }
+}
+
+BENCH(secp384r1, bench) { /* times the three reduction routines on one random 96-byte operand */
+  mbedtls_mpi A;
+  mbedtls_mpi_init(&A);
+  mbedtls_mpi_fill_random(&A, 12 * 8, GetEntropy, 0);
+  EZBENCH2("secp384r1", donothing, secp384r1(A.p)); /* raw limb routine */
+  EZBENCH2("ecp_mod_p384", donothing, ecp_mod_p384(&A)); /* A is reused, so later runs see already-processed data */
+  EZBENCH2("ecp_mod_p384_old", donothing, ecp_mod_p384_old(&A));
+  mbedtls_mpi_free(&A);
+}
+
+void mbedtls_p384_shl_a(uint64_t p[7]) { /* asm variant: shift the 7-limb value left one bit, then reduce */
+  asm("shlq\t%0\n\t" /* p[0] <<= 1; the bit shifted out goes to the carry flag */
+      "rclq\t8+%0\n\t" /* each rcl shifts in the previous carry and shifts out the next */
+      "rclq\t16+%0\n\t"
+      "rclq\t24+%0\n\t"
+      "rclq\t32+%0\n\t"
+      "rclq\t40+%0\n\t"
+      "rclq\t48+%0\n\t" /* p[6] is rotated too, so its previous contents are shifted, not discarded */
+      : "+o"(*p)
+      : /* no inputs */
+      : "memory", "cc");
+  mbedtls_p384_rum(p);
+}
+
+void mbedtls_p384_shl_b(uint64_t p[7]) { /* C variant: shift limbs 0..5 left one bit, then reduce */
+  p[6] = p[5] >> 63; /* overwrites p[6] with the bit leaving p[5]; the incoming p[6] is discarded (shl_a rotates it) */
+  p[5] = p[5] << 1 | p[4] >> 63; /* top limb first so each lower limb is still unshifted when read */
+  p[4] = p[4] << 1 | p[3] >> 63;
+  p[3] = p[3] << 1 | p[2] >> 63;
+  p[2] = p[2] << 1 | p[1] >> 63;
+  p[1] = p[1] << 1 | p[0] >> 63;
+  p[0] = p[0] << 1;
+  mbedtls_p384_rum(p);
+}
+
+BENCH(shl, bench) { /* times the asm and C shift variants */
+  uint64_t A[7] = {0}; /* all-zero operand: shifting leaves it zero */
+  EZBENCH2("mbedtls_p384_shl_a", donothing, mbedtls_p384_shl_a(A));
+  EZBENCH2("mbedtls_p384_shl_b", donothing, mbedtls_p384_shl_b(A));
+}
+
+void mbedtls_p384_red_a(uint64_t p[7]) { /* asm-only copy of mbedtls_p384_red's x86-64 branch, for benchmarking */
+  asm("subq\t%1,%0\n\t" /* p[0] -= 0x00000000ffffffff */
+      "sbbq\t%2,8+%0\n\t" /* p[1] -= 0xffffffff00000000 + borrow */
+      "sbbq\t%3,16+%0\n\t" /* p[2] -= 0xfffffffffffffffe + borrow */
+      "sbbq\t%4,24+%0\n\t" /* p[3..5] -= all-ones + borrow */
+      "sbbq\t%4,32+%0\n\t"
+      "sbbq\t%4,40+%0\n\t"
+      "sbbq\t$0,48+%0" /* p[6] -= borrow */
+      : "+o"(*p)
+      : "r"(0x00000000ffffffffl), "r"(0xffffffff00000000),
+        "i"(0xfffffffffffffffel), "i"(0xffffffffffffffff)
+      : "memory", "cc"); /* NOTE(review): no __x86_64__ guard here, unlike mbedtls_p384_red */
+}
+
+void mbedtls_p384_red_b(uint64_t p[7]) { /* macro-only copy of mbedtls_p384_red's portable branch, for benchmarking */
+  uint64_t c; /* borrow between limbs; SBB is a macro defined elsewhere */
+  SBB(p[0], p[0], 0x00000000ffffffff, 0, c);
+  SBB(p[1], p[1], 0xffffffff00000000, c, c);
+  SBB(p[2], p[2], 0xfffffffffffffffe, c, c);
+  SBB(p[3], p[3], 0xffffffffffffffff, c, c);
+  SBB(p[4], p[4], 0xffffffffffffffff, c, c);
+  SBB(p[5], p[5], 0xffffffffffffffff, c, c);
+  SBB(p[6], p[6], 0, c, c); /* final borrow lands in p[6] */
+}
+
+BENCH(red, bench) { /* times the asm and macro subtraction variants */
+  uint64_t A[7] = {0}; /* starts at zero; each call subtracts the modulus again, so A changes across runs */
+  EZBENCH2("mbedtls_p384_red_a", donothing, mbedtls_p384_red_a(A));
+  EZBENCH2("mbedtls_p384_red_b", donothing, mbedtls_p384_red_b(A));
+}
+
+#endif /* MBEDTLS_ECP_C */
diff --git a/third_party/mbedtls/test/test.mk b/third_party/mbedtls/test/test.mk
index 51b64adc3..4e492e171 100644
--- a/third_party/mbedtls/test/test.mk
+++ b/third_party/mbedtls/test/test.mk
@@ -78,7 +78,9 @@ THIRD_PARTY_MBEDTLS_TEST_COMS =											\
 	o/$(MODE)/third_party/mbedtls/test/test_suite_timing.com						\
 	o/$(MODE)/third_party/mbedtls/test/test_suite_version.com						\
 	o/$(MODE)/third_party/mbedtls/test/test_suite_x509parse.com						\
-	o/$(MODE)/third_party/mbedtls/test/test_suite_x509write.com
+	o/$(MODE)/third_party/mbedtls/test/test_suite_x509write.com						\
+	o/$(MODE)/third_party/mbedtls/test/secp384r1_test.com							\
+	o/$(MODE)/third_party/mbedtls/test/everest_test.com
 
 THIRD_PARTY_MBEDTLS_TEST_TESTS =										\
 	$(THIRD_PARTY_MBEDTLS_TEST_COMS:%=%.ok)
@@ -1340,3 +1342,22 @@ o/$(MODE)/third_party/mbedtls/test/test_suite_x509write.com.dbg:						\
 		$(CRT)												\
 		$(APE)
 	@$(APELINK)
+
+o/$(MODE)/third_party/mbedtls/test/everest_test.com: o/$(MODE)/third_party/mbedtls/test/everest_test.com.dbg
+o/$(MODE)/third_party/mbedtls/test/everest_test.com.dbg:							\
+		$(THIRD_PARTY_MBEDTLS_TEST_DEPS)								\
+		o/$(MODE)/third_party/mbedtls/test/everest_test.o						\
+		o/$(MODE)/third_party/mbedtls/test/everest_unravaged.o						\
+		$(LIBC_TESTMAIN)										\
+		$(CRT)												\
+		$(APE)
+	@$(APELINK)
+
+o/$(MODE)/third_party/mbedtls/test/secp384r1_test.com: o/$(MODE)/third_party/mbedtls/test/secp384r1_test.com.dbg
+o/$(MODE)/third_party/mbedtls/test/secp384r1_test.com.dbg:							\
+		$(THIRD_PARTY_MBEDTLS_TEST_DEPS)								\
+		o/$(MODE)/third_party/mbedtls/test/secp384r1_test.o						\
+		$(LIBC_TESTMAIN)										\
+		$(CRT)												\
+		$(APE)
+	@$(APELINK)