Make numerous improvements

- Python static hello world now 1.8 MB
- Python static fully loaded now 10 MB
- Python HTTPS client now uses MbedTLS
- Python REPL now completes import stmts
- Increase stack size for Python for now
- Begin synthesizing posixpath and ntpath
- Restore Python \N{UNICODE NAME} support
- Restore Python NFKD symbol normalization
- Add optimized code path for Intel SHA-NI
- Get more Python unit tests passing faster
- Get Python help() pagination working on NT
- Python hashlib now supports MbedTLS PBKDF2
- Make memcpy/memmove/memcmp/bcmp/etc. faster
- Add Mersenne Twister and Vigna to LIBC_RAND
- Provide privileged __printf() for error code
- Fix zipos opendir() so that it reports ENOTDIR
- Add basic chmod() implementation for Windows NT
- Add Cosmo's best functions to Python cosmo module
- Pin function trace indent depth to that of caller
- Show memory diagram on invalid access in MODE=dbg
- Differentiate stack overflow on crash in MODE=dbg
- Add stb_truetype and tools for analyzing font files
- Upgrade to UNICODE 13 and reduce its binary footprint
- COMPILE.COM now logs resource usage of build commands
- Start implementing basic poll() support on bare metal
- Set getauxval(AT_EXECFN) to GetModuleFileName() on NT
- Add descriptions to strerror() in non-TINY build modes
- Add COUNTBRANCH() macro to help with micro-optimizations
- Make error / backtrace / asan / memory code more unbreakable
- Add fast, perfect C implementations of μ-Law and A-Law audio codecs
- Make strtol() functions consistent with other libc implementations (see the sketch after this list)
- Improve Linenoise implementation (see also github.com/jart/bestline)
- COMPILE.COM now suppresses stdout/stderr of successful build commands
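A quick illustration for the strtol() item above. This is a hedged, generic C sketch of the semantics most libc implementations agree on (endptr positioning, ERANGE clamping); the parse_port helper and its range check are illustrative only, not code from this commit.

    #include <errno.h>
    #include <limits.h>
    #include <stdlib.h>

    /* Illustrative helper, not from this commit: endptr lands on the
       first unconsumed character, and out-of-range input clamps to
       LONG_MIN/LONG_MAX with errno set to ERANGE. */
    int parse_port(const char *s, long *out) {
      char *end;
      long v;
      errno = 0;
      v = strtol(s, &end, 10);
      if (end == s || *end != '\0') return -1;   /* no digits, or trailing junk */
      if (errno == ERANGE || v < 0 || v > 65535) return -1;
      *out = v;
      return 0;
    }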
Justine Tunney 2021-09-27 22:58:51 -07:00
parent fa7b4f5bd1
commit 39bf41f4eb
806 changed files with 77494 additions and 63859 deletions

@ -14,6 +14,7 @@ struct RlDecode {
};
void rldecode(void *dest, const struct RlDecode *) hidden;
void rldecode2(void *dest, const struct RlDecode *) hidden;
const uint8_t *lz4check(const void *data) hidden;
void *lz4cpy(void *dest, const void *blockdata, size_t blocksize) hidden;
void *lz4decode(void *dest, const void *src) hidden;

@ -1,7 +1,7 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Copyright 2021 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
@ -18,36 +18,23 @@
*/
#include "libc/macros.internal.h"
// Copies memory.
//
// DEST and SRC may overlap.
//
// @param rdi is dest
// @param rsi is src
// @param rdx is number of bytes
// @return original rdi copied to rax
// @clob flags,rcx
// @asyncsignalsafe
memmove:
mov %rdi,%rax
// 𝑠𝑙𝑖𝑑𝑒
.endfn MemMove,globl,hidden
MemMove:
.leafprologue
.profilable
push %rdi
push %rsi
mov %rdx,%rcx
cmp %rsi,%rdi
jb 1f
lea -1(%rdi,%rcx),%rdi
lea -1(%rsi,%rcx),%rsi
std
1: rep movsb
cld
pop %rsi
pop %rdi
.leafepilogue
.endfn memmove,globl
.source __FILE__
.rodata
.align 64
kSha256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.endobj kSha256,globl
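memmove.S above picks its copy direction from the pointer order: it copies forward with rep movsb when the destination sits below the source, and backward (direction flag set) otherwise, which is what keeps overlapping moves safe. Below is a minimal C sketch of that strategy (the name memmove_sketch is illustrative, not the shipped routine); a strictly conforming version would compare the addresses as uintptr_t, the relational test here just mirrors the cmp %rsi,%rdi above.

    #include <stddef.h>

    void *memmove_sketch(void *dst, const void *src, size_t n) {
      unsigned char *d = dst;
      const unsigned char *s = src;
      size_t i;
      if (d < s) {                              /* dest below src: go forward */
        for (i = 0; i < n; ++i) d[i] = s[i];
      } else {                                  /* dest above src: go backward */
        for (i = n; i; --i) d[i - 1] = s[i - 1];
      }
      return dst;
    }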

@ -1,556 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
@fileoverview Cosmopolitan Memory Copying
Of all the functions in the technology industry, none are more
critical than the Kernighan & Ritchie Memory Copy API for the C
Language, 1972 model: more commonly known as memcpy(). It's the
world's most popular function, one all programmers love.
This implementation is the fastest and nearly the tiniest too.
It doesn't break when copying backwards or on misaligned data.
It's so easy that even a child could use it, and they do.
*/
#include "libc/nexgen32e/x86feature.h"
#include "libc/macros.internal.h"
// Copies memory.
//
// DEST and SRC must not overlap, unless DEST ≤ SRC.
//
// @param rdi is dest
// @param rsi is src
// @param rdx is number of bytes
// @return original rdi copied to rax
// @mode long
// @asyncsignalsafe
memcpy: mov %rdi,%rax
// 𝑠𝑙𝑖𝑑𝑒
.align 16
.endfn memcpy,globl
// Copies memory w/ minimal impact ABI.
//
// @param rdi is dest
// @param rsi is src
// @param rdx is number of bytes
// @clob flags,rcx,xmm3,xmm4
// @mode long
MemCpy: .leafprologue
.profilable
mov $.Lmemcpytab.ro.size,%ecx
cmp %rcx,%rdx
cmovb %rdx,%rcx
jmp *memcpytab(,%rcx,8)
.Lanchorpoint:
.L32r: cmp $1024,%rdx
jae .Lerms
.L32: vmovdqu -32(%rsi,%rdx),%ymm4
mov $32,%rcx
0: add $32,%rcx
vmovdqu -64(%rsi,%rcx),%ymm3
vmovdqu %ymm3,-64(%rdi,%rcx)
cmp %rcx,%rdx
ja 0b
vmovdqu %ymm4,-32(%rdi,%rdx)
vxorps %ymm4,%ymm4,%ymm4
vxorps %ymm3,%ymm3,%ymm3
jmp .L0
.L16r: cmp $1024,%rdx
jae .Lerms
.L16: movdqu -16(%rsi,%rdx),%xmm4
mov $16,%rcx
0: add $16,%rcx
movdqu -32(%rsi,%rcx),%xmm3
movdqu %xmm3,-32(%rdi,%rcx)
cmp %rcx,%rdx
ja 0b
movdqu %xmm4,-16(%rdi,%rdx)
pxor %xmm4,%xmm4
pxor %xmm3,%xmm3
jmp .L0
.L8: push %rbx
mov (%rsi),%rcx
mov -8(%rsi,%rdx),%rbx
mov %rcx,(%rdi)
mov %rbx,-8(%rdi,%rdx)
1: pop %rbx
.L0: .leafepilogue
.L4: push %rbx
mov (%rsi),%ecx
mov -4(%rsi,%rdx),%ebx
mov %ecx,(%rdi)
mov %ebx,-4(%rdi,%rdx)
jmp 1b
.L3: push %rbx
mov (%rsi),%cx
mov -2(%rsi,%rdx),%bx
mov %cx,(%rdi)
mov %bx,-2(%rdi,%rdx)
jmp 1b
.L2: mov (%rsi),%cx
mov %cx,(%rdi)
jmp .L0
.L1: mov (%rsi),%cl
mov %cl,(%rdi)
jmp .L0
.Lerms:
#ifdef TINY
cmp $1024*1024,%rdx
#else
cmp kHalfCache3(%rip),%rdx
#endif
ja .Lnts
push %rdi
push %rsi
mov %rdx,%rcx
rep movsb
pop %rsi
pop %rdi
jmp .L0
.Lnts: movdqu (%rsi),%xmm3
movdqu %xmm3,(%rdi)
lea 16(%rdi),%rcx
and $-16,%rcx
sub %rdi,%rcx
add %rcx,%rdi
add %rcx,%rsi
sub %rcx,%rdx
mov $16,%rcx
0: add $16,%rcx
movdqu -32(%rsi,%rcx),%xmm3
movntdq %xmm3,-32(%rdi,%rcx)
cmp %rcx,%rdx
ja 0b
sfence
movdqu -16(%rsi,%rdx),%xmm3
movdqu %xmm3,-16(%rdi,%rdx)
pxor %xmm3,%xmm3
jmp .L0
.endfn MemCpy,globl,hidden
.source __FILE__
.initro 300,_init_memcpy
memcpytab.ro:
.byte .L0-.Lanchorpoint
.byte .L1-.Lanchorpoint
.byte .L2-.Lanchorpoint
.byte .L3-.Lanchorpoint
.rept 4
.byte .L4-.Lanchorpoint
.endr
.rept 8
.byte .L8-.Lanchorpoint
.endr
.rept 16
.byte .L16-.Lanchorpoint
.endr
.equ .Lmemcpytab.ro.size,.-memcpytab.ro
.endobj memcpytab.ro
.if .Lmemcpytab.ro.size % 8
.error "moar jmptab"
.endif
.byte .L16-.Lanchorpoint # SSE2
.byte .L16r-.Lanchorpoint # SSE2 + ERMS
.byte .L32-.Lanchorpoint # AVX
.byte .L32r-.Lanchorpoint # AVX + ERMS
.byte 0,0,0,0
.previous
.initbss 300,_init_memcpy
memcpytab:
.rept .Lmemcpytab.ro.size
.quad 0
.endr
.quad 0
.endobj memcpytab
.previous
.init.start 300,_init_memcpy
pushpop .Lmemcpytab.ro.size,%rcx
ezlea .Lanchorpoint,dx
testb X86_HAVE(AVX)+kCpuids(%rip)
call memjmpinit
.init.end 300,_init_memcpy
/* your memcpy() 375 bytes
bionic memcpy() 1,429 bytes
glibc memcpy() 27,216 bytes
musl memcpy() 49 bytes
newlib memcpy() 300 bytes
benchmarks on intel core i7-6700 @ 3.40GHz (skylake)
includes function call overhead (unless marked otherwise)
your memcpy(𝑛) for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 297.000 35.125 35.203 92
1 35.000 35.625 35.016 93
2 27.500 17.438 17.555 185
3 21.000 11.875 12.057 270
4 16.250 8.719 8.809 369
7 5.000 4.946 5.069 641
8 7.375 4.422 4.365 745
15 4.067 2.342 2.336 1391
16 4.188 2.242 2.257 1440 «
31 8.032 1.157 1.147 2835
32 2.031 1.723 1.325 2454
63 1.000 0.589 0.589 5523
64 0.578 0.580 0.577 5630 «
127 0.638 0.377 0.320 10151
128 0.289 0.296 0.307 10605
255 0.404 0.202 0.194 16741
256 0.160 0.165 0.166 19574 «
511 0.159 0.123 0.110 29458
512 0.139 0.098 0.097 33571 «
1023 0.107 0.086 0.074 44111
1024 0.103 0.084 0.082 39489
2047 0.057 0.056 0.057 57450
2048 0.055 0.055 0.055 59269
4095 0.044 0.044 0.044 74051
4096 0.043 0.043 0.043 75300 «
8191 0.036 0.036 0.036 91301
8192 0.036 0.035 0.035 92411
16383 0.033 0.032 0.032 102163
16384 0.034 0.032 0.032 102145 « (L1)/2
32767 0.098 0.081 0.077 42271
32768 0.077 0.077 0.076 42781
65535 0.088 0.075 0.072 44973
65536 0.074 0.072 0.071 45520
131071 0.086 0.075 0.072 44869
131072 0.077 0.073 0.072 45076 « (L2)/2
262143 0.095 0.096 0.095 34116
262144 0.096 0.096 0.095 34160
524287 0.102 0.109 0.111 29359
524288 0.107 0.109 0.108 30033
1048575 0.102 0.103 0.104 31112
1048576 0.101 0.103 0.103 31605
2097151 0.104 0.103 0.109 29929
2097152 0.108 0.110 0.103 31652
4194303 0.192 0.172 0.172 18950
4194304 0.168 0.161 0.160 20311 « (L3)/2
8388607 0.339 0.329 0.344 9461 « RAM
8388608 0.384 0.369 0.341 9545
Bionic memcpy() for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 347.000 40.625 35.984 90
1 37.000 35.625 36.734 89
2 28.500 18.688 18.383 177
3 11.667 12.375 12.359 263
4 12.250 9.406 9.020 361
7 5.000 5.018 5.118 636
8 11.625 5.828 4.779 681
15 3.533 3.158 2.620 1243
16 4.688 2.742 2.884 1129 «
31 1.903 1.262 1.172 2778
32 1.344 1.113 1.125 2895
63 1.444 0.633 0.591 5513
64 0.766 0.580 0.581 5605 «
127 0.512 0.383 0.318 10229
128 0.461 0.315 0.311 10463
255 0.475 0.216 0.193 16840
256 0.371 0.236 0.199 16397 «
511 0.295 0.144 0.120 27223
512 0.240 0.151 0.126 25937 «
1023 0.142 0.101 0.088 36947
1024 0.126 0.108 0.091 35889
2047 0.088 0.074 0.072 45475
2048 0.089 0.077 0.073 44380
4095 0.081 0.065 0.064 50766
4096 0.068 0.066 0.065 50246 «
8191 0.063 0.061 0.060 54075
8192 0.065 0.061 0.061 53731
16383 0.082 0.066 0.061 53765
16384 0.067 0.063 0.062 52765 « (L1)/2
32767 0.102 0.085 0.085 38406
32768 0.086 0.085 0.085 38473
65535 0.098 0.085 0.085 38292
65536 0.086 0.085 0.085 38369
131071 0.438 0.177 0.089 36716
131072 0.092 0.090 0.093 34880 « (L2)/2
262143 0.306 0.146 0.127 25601
262144 0.126 0.168 0.127 25704
524287 0.213 0.152 0.136 23993
524288 0.132 0.159 0.133 24570
1048575 0.127 0.129 0.130 25117
1048576 0.128 0.129 0.130 25107
2097151 0.127 0.127 0.129 25199
2097152 0.127 0.136 0.134 24274
4194303 0.216 0.192 0.228 14237
4194304 0.351 0.351 0.356 9139 « (L3)/2
8388607 0.323 0.293 0.298 10903 « RAM
8388608 0.365 0.296 0.300 10844
GCC builtin (Inline REP MOVSB) for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 53.000 50.625 50.453 64
1 47.000 49.375 49.141 66
2 23.500 25.062 24.898 131
3 15.667 16.792 16.880 193
4 11.750 12.531 12.957 251
7 7.000 7.125 7.190 452
8 6.125 7.578 6.322 514
15 3.133 3.325 3.372 964
16 3.062 3.117 3.132 1038 «
31 1.645 1.601 1.620 2007
32 1.531 1.559 1.585 2051
63 0.778 0.796 0.802 4056
64 0.766 0.768 0.767 4238 «
127 0.480 0.446 0.448 7259
128 0.445 0.419 0.423 7693
255 0.239 0.239 0.236 13781
256 0.238 0.225 0.225 14466 «
511 0.127 0.133 0.132 24555
512 0.123 0.127 0.128 25377 «
1023 0.079 0.081 0.081 40346
1024 0.075 0.077 0.078 41714
2047 0.053 0.055 0.055 59575
2048 0.053 0.053 0.053 60795
4095 0.042 0.043 0.043 75843
4096 0.042 0.042 0.042 77153
8191 0.035 0.036 0.036 91518
8192 0.035 0.035 0.035 92603
16383 0.032 0.032 0.032 102407
16384 0.033 0.032 0.032 102864 « (L1)/2
32767 0.106 0.082 0.078 41486
32768 0.079 0.078 0.079 41290
65535 0.090 0.077 0.075 43565
65536 0.074 0.074 0.073 44299
131071 0.091 0.078 0.075 43196
131072 0.078 0.076 0.074 43673 « (L2)/2
262143 0.097 0.099 0.098 33192
262144 0.098 0.098 0.098 33193
524287 0.105 0.111 0.111 29212
524288 0.109 0.111 0.111 29211
1048575 0.107 0.108 0.108 30069
1048576 0.106 0.112 0.105 30886
2097151 0.105 0.103 0.103 31621
2097152 0.102 0.103 0.104 31280
4194303 0.180 0.158 0.176 18456
4194304 0.167 0.155 0.154 21098 « (L3)/2
8388607 0.538 0.576 0.557 5834 « RAM
8388608 0.750 0.579 0.552 5893
glibc memcpy() for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 139.000 90.125 84.891 38
1 83.000 82.125 84.359 39
2 61.500 46.438 45.164 72
3 41.667 32.458 31.245 104
4 32.750 26.156 24.410 133
7 20.143 16.732 16.033 203
8 13.375 8.328 6.908 471
15 8.200 6.408 5.753 565
16 4.438 3.570 3.466 938 «
31 3.258 2.891 2.786 1167
32 2.281 1.801 1.732 1878
63 1.635 1.431 1.374 2367
64 1.109 0.896 0.868 3747 «
127 0.921 0.792 0.779 4176
128 0.508 0.511 0.494 6589
255 0.451 0.407 0.402 8081
256 0.324 0.269 0.260 12498 «
511 0.249 0.218 0.212 15335
512 0.178 0.149 0.146 22297 «
1023 0.138 0.124 0.121 26947
1024 0.087 0.089 0.087 37238
2047 0.084 0.077 0.076 43046
2048 0.066 0.059 0.058 56120
4095 0.058 0.054 0.054 60706
4096 0.050 0.046 0.046 71092 «
8191 0.043 0.042 0.042 78259
8192 0.037 0.037 0.037 87409
16383 0.037 0.036 0.035 92065
16384 0.034 0.034 0.033 97942 « (L1)/2
32767 0.104 0.084 0.080 40572
32768 0.079 0.079 0.079 41055
65535 0.094 0.080 0.076 42885
65536 0.077 0.075 0.075 43423
131071 0.092 0.080 0.078 41498
131072 0.082 0.078 0.077 42350 « (L2)/2
262143 0.100 0.101 0.287 11342
262144 0.099 0.099 0.098 33177
524287 0.106 0.111 0.110 29609
524288 0.107 0.119 0.110 29608
1048575 0.104 0.105 0.106 30626
1048576 0.104 0.111 0.105 30878
2097151 0.103 0.103 0.103 31606
2097152 0.102 0.103 0.103 31644
4194303 0.174 0.160 0.165 19714
4194304 0.166 0.157 0.154 21110 « (L3)/2
8388607 0.537 0.554 0.565 5750 « RAM
8388608 0.701 0.537 0.552 5884
musl memcpy() for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 97.000 80.625 79.891 41
1 77.000 78.875 78.266 42
2 49.500 44.062 42.102 77
3 33.667 32.792 30.651 106
4 29.750 24.281 24.137 135
7 19.000 16.161 15.734 207
8 12.125 7.766 6.721 484
15 8.867 5.892 5.714 569
16 5.062 3.742 3.458 940
31 3.645 2.915 2.715 1198
32 2.156 1.723 1.663 1956
63 1.540 1.367 1.333 2440
64 1.078 0.873 0.833 3905
127 0.874 0.771 0.737 4415
128 0.617 0.487 0.481 6766
255 0.443 0.390 0.382 8504
256 0.316 0.259 0.259 12545
511 0.245 0.232 0.237 13742
512 0.174 0.159 0.208 15668
1023 0.181 0.193 0.182 17821
1024 0.155 0.123 0.114 28579
2047 0.102 0.092 0.085 38219
2048 0.064 0.073 0.070 46577
4095 0.058 0.067 0.065 50272
4096 0.049 0.055 0.055 59467
8191 0.057 0.052 0.049 66468
8192 0.053 0.050 0.051 63557
16383 0.082 0.065 0.064 50897
16384 0.066 0.065 0.061 53697 « (L1)/2
32767 0.121 0.100 0.114 28555
32768 0.093 0.091 0.114 28615
65535 0.118 0.102 0.142 22858
65536 0.108 0.274 0.097 33432
131071 0.117 0.109 0.109 29905
131072 0.110 0.195 0.113 28692 « (L2)/2
262143 0.283 0.166 0.122 26638
262144 0.130 0.144 0.123 26544
524287 0.210 0.153 0.130 25079
524288 0.126 0.128 0.123 26422
1048575 0.139 0.107 0.106 30803
1048576 0.104 0.105 0.106 30683
2097151 0.103 0.103 0.103 31564
2097152 0.102 0.103 0.103 31531
4194303 0.242 0.158 0.169 19238
4194304 0.166 0.161 0.154 21072 « (L3)/2
8388607 0.533 0.549 0.599 5422 « RAM
8388608 0.768 0.630 0.560 5801
newlib (aka. cygwin) memcpy() for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 61.000 52.875 53.141 61
1 49.000 49.875 50.328 65
2 24.500 24.812 26.727 122
3 15.667 20.125 16.943 192
4 12.750 15.281 13.090 248
7 7.000 7.375 7.431 438
8 5.875 6.422 6.377 510
15 3.267 3.375 3.447 943
16 10.062 6.945 6.386 509
31 2.548 2.488 2.545 1278
32 3.156 3.207 3.201 1016
63 1.190 1.220 1.229 2646
64 1.578 1.588 1.599 2033
127 0.717 0.690 0.685 4744
128 0.820 0.856 0.857 3795
255 0.357 0.359 0.358 9077
256 0.629 0.461 0.426 7630
511 0.260 0.219 0.204 15947
512 0.330 0.299 0.268 12113
1023 0.269 0.175 0.162 20042
1024 0.315 0.201 0.196 16633
2047 0.349 0.241 0.236 13790
2048 0.332 0.269 0.264 12295
4095 0.349 0.295 0.287 11348
4096 0.361 0.313 0.303 10748
8191 0.361 0.317 0.322 10110
8192 0.369 0.326 0.319 10201
16383 0.321 0.322 0.327 9940
16384 0.309 0.330 0.329 9878 « (L1)/2
32767 0.291 0.303 0.307 10599
32768 0.314 0.304 0.305 10667
65535 0.373 0.311 0.313 10396
65536 0.305 0.750 0.421 7729
131071 0.329 0.427 0.384 8470
131072 0.329 0.388 0.361 9020 « (L2)/2
262143 0.520 0.389 0.425 7646
262144 0.364 0.400 0.368 8843
524287 0.449 0.389 0.389 8353
524288 0.384 0.379 0.384 8466
1048575 0.436 0.397 0.401 8107
1048576 0.431 0.397 0.401 8112
2097151 0.417 0.567 0.434 7498
2097152 0.457 0.503 0.427 7621
4194303 0.328 0.348 0.368 8822
4194304 0.343 0.352 0.352 9221 « (L3)/2
8388607 0.313 0.319 0.326 9957 « RAM
8388608 0.366 0.320 0.328 9910
openbsd memcpy() for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 73.000 41.375 41.484 78
1 39.000 39.875 41.641 78
2 28.500 20.688 21.227 153
3 27.000 15.875 15.557 209
4 16.750 12.656 12.520 260
7 20.429 10.982 10.292 316
8 8.625 5.234 5.576 583
15 7.267 4.758 4.920 661
16 4.312 2.742 2.747 1183
31 4.613 2.891 2.555 1272
32 2.844 1.520 1.441 2256
63 2.397 1.268 1.328 2449
64 1.547 0.822 0.769 4226
127 1.189 0.782 0.671 4842
128 0.727 0.532 0.460 7066
255 0.631 0.463 0.414 7856
256 0.543 0.374 0.302 10775
511 0.542 0.316 0.276 11785
512 0.354 0.260 0.224 14494
1023 0.267 0.245 0.229 14201
1024 0.251 0.200 0.197 16496
2047 0.214 0.226 0.181 17941
2048 0.189 0.167 0.166 19575
4095 0.200 0.168 0.163 19957
4096 0.165 0.155 0.153 21219
8191 0.158 0.153 0.151 21578
8192 0.153 0.148 0.147 22138
16383 0.173 0.148 0.146 22319
16384 0.153 0.487 0.188 17298 « (L1)/2
32767 0.161 0.151 0.192 16893
32768 0.151 0.314 0.213 15275
65535 0.157 0.154 0.148 21969
65536 0.147 0.145 0.145 22493
131071 0.152 0.151 0.154 21145
131072 0.148 0.229 0.158 20564 « (L2)/2
262143 0.320 0.183 0.162 20031
262144 0.330 0.205 0.167 19503
524287 0.159 0.171 0.163 19913
524288 0.250 0.189 0.162 20120
1048575 0.157 0.164 0.161 20182
1048576 0.155 0.156 0.157 20672
2097151 0.161 0.158 0.157 20644
2097152 0.158 0.157 0.165 19727
4194303 0.327 0.256 0.238 13684
4194304 0.232 0.220 0.236 13749 « (L3)/2
8388607 0.721 0.689 0.586 5549 « RAM
8388608 0.943 0.569 0.593 5481 */
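One trick worth noting from the deleted memcpy.S: sizes up to 16 are dispatched through a jump table, and the 8..16 byte case (.L8) copies a possibly overlapping head and tail word instead of looping. A hedged C sketch of that idea, with an illustrative helper name:

    #include <stdint.h>
    #include <string.h>

    /* Copy 8..16 bytes using two 8-byte moves that may overlap in the
       middle, mirroring the .L8 path above. */
    static void copy8to16(void *dst, const void *src, size_t n) {
      uint64_t head, tail;
      memcpy(&head, src, 8);
      memcpy(&tail, (const char *)src + n - 8, 8);
      memcpy(dst, &head, 8);
      memcpy((char *)dst + n - 8, &tail, 8);
    }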

@ -1,33 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.source __FILE__
// Copies memory.
//
// DEST and SRC must not overlap unless DEST ≤ SRC.
//
// @param rdi is dest
// @param rsi is src
// @param rdx is number of bytes
// @return original rdi + rdx copied to rax
mempcpy:
lea (%rdi,%rdx),%rax
jmp MemCpy
.endfn mempcpy,globl
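mempcpy.S above is a two-instruction wrapper: it loads dest + n into the return register, then tail-calls the shared copy routine. Its C equivalent (illustrative name) is simply:

    #include <string.h>

    void *mempcpy_equivalent(void *dst, const void *src, size_t n) {
      return (char *)memcpy(dst, src, n) + n;
    }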

@ -1,406 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 sw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
@fileoverview Cosmopolitan Memory Setter
This sets one bit per picosecond on a $900 Skylake workstation,
which is about 110 GBps. */
#include "libc/nexgen32e/x86feature.h"
#include "libc/nexgen32e/macros.h"
#include "libc/macros.internal.h"
// Sets memory.
//
// @param rdi is dest
// @param esi is the byte to set
// @param edx is the number of bytes to set
// @return original rdi copied to rax
// @mode long
// @asyncsignalsafe
memset: mov %rdi,%rax
// 𝑠𝑙𝑖𝑑𝑒
.align 16
.endfn memset,globl
// Sets memory w/ minimal-impact ABI.
//
// @param rdi is dest
// @param esi is the byte to set
// @param edx is the number of bytes to set
// @clob flags,rcx,xmm3
// @mode long
MemSet: .leafprologue
.profilable
mov $.Lmemsettab.ro.size,%ecx
cmp %rcx,%rdx
cmovb %rdx,%rcx
jmp *memsettab(,%rcx,8)
.Lanchorpoint:
.L32r: cmp $1024,%rdx
jae .Lerms
.L32: vmovd %esi,%xmm3
vpbroadcastb %xmm3,%ymm3
mov $32,%ecx
1: lea 32(%rcx),%rcx
vmovdqu %ymm3,-64(%rdi,%rcx)
cmp %rcx,%rdx
ja 1b
vmovdqu %ymm3,-32(%rdi,%rdx)
vpxor %ymm3,%ymm3,%ymm3
jmp .L0
.L16r: cmp $1024,%rdx
jae .Lerms
.L16: movd %esi,%xmm3
pbroadcastb %xmm3
mov $16,%ecx
1: lea 16(%rcx),%rcx
movdqu %xmm3,-32(%rdi,%rcx)
cmp %rcx,%rdx
ja 1b
movdqu %xmm3,-16(%rdi,%rdx)
pxor %xmm3,%xmm3
.L0: .leafepilogue
.L8: movzbl %sil,%ecx
imul .Lb8(%rip),%rcx
mov %rcx,(%rdi)
mov %rcx,-8(%rdi,%rdx)
jmp .L0
.L4: movzbl %sil,%ecx
imul $0x01010101,%ecx,%ecx
mov %ecx,(%rdi)
mov %ecx,-4(%rdi,%rdx)
jmp .L0
.L3: mov %sil,2(%rdi)
.L2: mov %sil,1(%rdi)
.L1: mov %sil,(%rdi)
jmp .L0
.Lerms: push %rax
push %rdi
mov %esi,%eax
mov %rdx,%rcx
rep stosb
pop %rdi
pop %rax
jmp .L0
.endfn MemSet,globl,hidden
.source __FILE__
.rodata.cst8
.Lb8: .quad 0x0101010101010101
.previous
.initro 300,_init_memset
memsettab.ro:
.byte .L0 - .Lanchorpoint
.byte .L1 - .Lanchorpoint
.byte .L2 - .Lanchorpoint
.byte .L3 - .Lanchorpoint
.rept 4
.byte .L4 - .Lanchorpoint
.endr
.rept 8
.byte .L8 - .Lanchorpoint
.endr
.rept 16
.byte .L16 - .Lanchorpoint
.endr
.equ .Lmemsettab.ro.size,.-memsettab.ro
.endobj memsettab.ro
.if .Lmemsettab.ro.size % 8
.error "moar jmptab"
.endif
.byte .L16 - .Lanchorpoint # SSE2
.byte .L16r - .Lanchorpoint # SSE2 + ERMS
.byte .L32 - .Lanchorpoint # AVX2
.byte .L32r - .Lanchorpoint # AVX2 + ERMS
.byte 0,0,0,0
.previous
.initbss 300,_init_memset
memsettab:
.rept .Lmemsettab.ro.size
.quad 0
.endr
.quad 0
.endobj memsettab
.previous
.init.start 300,_init_memset
pushpop .Lmemsettab.ro.size,%rcx
ezlea .Lanchorpoint,dx
testb X86_HAVE(AVX2)+kCpuids(%rip)
call memjmpinit
.init.end 300,_init_memset
/* benchmarks on intel core i7-6700 @ 3.40GHz (skylake)
includes function call overhead (unless marked otherwise)
Your memset() for #c per n where c ≈ 0.273ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 73.000 35.125 36.141 97
1 35.000 36.375 35.984 97
2 28.500 19.938 18.820 185
3 19.000 12.458 12.651 276
4 15.750 10.719 9.566 365
7 5.000 5.411 5.730 609
8 8.375 4.953 4.697 743
15 4.200 2.408 2.407 1450
16 7.188 2.539 2.382 1465 «
31 1.129 1.206 1.183 2950
32 15.156 2.012 1.292 2702
63 4.016 0.986 0.663 5264
64 3.547 0.967 0.684 5104
127 2.087 0.562 0.338 10311
128 1.805 0.499 0.336 10393
255 0.412 0.180 0.183 19119
256 0.160 0.170 0.169 20650
511 0.162 0.134 0.108 32214
512 0.100 0.106 0.104 33507
1023 0.110 0.095 0.082 42574
1024 0.099 0.080 0.078 44944
2047 0.155 0.154 0.154 22624
2048 0.052 0.052 0.053 66266
4095 0.098 0.099 0.099 35142
4096 0.042 0.042 0.041 84250
8191 0.072 0.073 0.072 48157
8192 0.034 0.034 0.034 101332
16383 0.059 0.059 0.059 58997
16384 0.031 0.031 0.031 112972
32767 0.054 0.054 0.054 65053
32768 0.029 0.029 0.029 119433
65535 0.069 0.069 0.068 51690
65536 0.057 0.057 0.057 61434
131071 0.066 0.066 0.066 53001
131072 0.057 0.058 0.057 60716
262143 0.066 0.065 0.065 53462
262144 0.060 0.058 0.058 60104
524287 0.067 0.068 0.072 48784
524288 0.063 0.062 0.061 56957
1048575 0.068 0.068 0.069 50353
1048576 0.062 0.060 0.062 56661
2097151 0.066 0.066 0.067 52421
2097152 0.060 0.060 0.061 57672
4194303 0.072 0.067 0.067 51910
4194304 0.062 0.061 0.062 56327
8388607 0.129 0.111 0.111 31368
8388608 0.136 0.119 0.111 31519
glibc memset() for #c per n where c ≈ 0.273ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 121.000 39.125 35.547 98
1 33.000 35.875 35.172 99
2 17.500 18.312 18.070 193
3 16.333 14.542 12.411 281
4 12.250 9.344 9.215 379
7 7.571 5.732 5.453 640
8 4.625 4.641 4.623 755
15 4.467 3.158 2.478 1408
16 2.312 2.289 2.468 1414
31 2.290 1.367 1.278 2731
32 1.219 1.176 1.182 2952
63 0.905 0.696 0.656 5320
64 0.672 0.658 0.660 5285
127 1.299 0.723 0.673 5183
128 0.508 0.423 0.424 8227
255 0.490 0.428 0.417 8367
256 0.293 0.233 0.243 14349
511 0.284 0.232 0.234 14902
512 0.154 0.131 0.131 26626
1023 0.155 0.137 0.135 25839
1024 0.089 0.078 0.080 43875
2047 0.103 0.092 0.090 38672
2048 0.060 0.054 0.054 65116
4095 0.073 0.068 0.068 51405
4096 0.046 0.042 0.042 82162
8191 0.060 0.058 0.057 60739
8192 0.036 0.034 0.034 101467
16383 0.052 0.052 0.051 68594
16384 0.031 0.031 0.031 112603
32767 0.053 0.050 0.049 70850
32768 0.032 0.029 0.029 119617
65535 0.067 0.067 0.067 52015
65536 0.058 0.058 0.058 60440
131071 0.067 0.066 0.065 53518
131072 0.059 0.058 0.058 60281
262143 0.066 0.065 0.065 54005
262144 0.058 0.058 0.058 60121
524287 0.067 0.067 0.067 52349
524288 0.061 0.061 0.064 54699
1048575 0.068 0.067 0.067 51876
1048576 0.061 0.061 0.061 56775
2097151 0.068 0.068 0.068 51379
2097152 0.062 0.062 0.062 56513
4194303 0.069 0.068 0.069 50580
4194304 0.063 0.064 0.063 55751
8388607 0.120 0.118 0.120 28998
8388608 0.137 0.123 0.117 29936
GCC (Inline REP STOSB) for #c per n where c ≈ 0.273ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 413.000 434.125 441.453 8
1 431.000 436.125 438.953 8
2 223.500 224.438 224.836 16
3 149.000 150.042 623.786 6
4 108.750 109.531 110.559 32
7 62.714 63.196 63.266 55
8 56.375 56.641 56.838 61
15 30.467 30.708 30.761 113
16 24.062 24.023 24.038 145
31 14.548 14.859 14.876 235
32 9.719 9.691 9.730 359
63 7.286 7.312 7.339 476
64 3.609 3.705 3.721 938
127 1.976 2.058 2.067 1689
128 0.414 0.405 0.409 8532
255 0.890 0.907 0.911 3832
256 0.215 0.217 0.218 16039
511 0.476 0.481 0.480 7273
512 0.119 0.119 0.119 29270
1023 0.257 0.260 0.260 13409
1024 0.073 0.073 0.074 47442
2047 0.150 0.150 0.151 23189
2048 0.049 0.050 0.050 69424
4095 0.096 0.097 0.097 36142
4096 0.040 0.040 0.040 87842
8191 0.071 0.071 0.071 49061
8192 0.034 0.033 0.034 104099
16383 0.058 0.059 0.058 59697
16384 0.030 0.031 0.030 114585
32767 0.053 0.053 0.053 66161
32768 0.029 0.029 0.029 120750
65535 0.069 0.069 0.069 50520
65536 0.058 0.058 0.058 60100
131071 0.068 0.067 0.085 40964
131072 0.076 0.072 0.063 55514
262143 0.067 0.093 0.090 38681
262144 0.073 0.062 0.077 45384
524287 0.107 0.093 0.066 52689
524288 0.061 0.060 0.062 56294
1048575 0.066 0.066 0.066 52990
1048576 0.061 0.061 0.061 57248
2097151 0.067 0.075 0.067 51887
2097152 0.061 0.061 0.061 56878
4194303 0.068 0.100 0.069 50623
4194304 0.061 0.061 0.061 57195
8388607 0.117 0.121 0.119 29441
8388608 0.118 0.119 0.162 21587
Musl memset() for #c per n where c ≈ 0.273ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 49.000 35.625 35.172 99
1 33.000 34.625 35.109 99
2 17.500 17.562 18.023 194
3 20.333 14.042 12.411 281
4 11.250 9.219 9.301 375
7 11.857 6.018 5.417 644
8 4.125 4.516 4.592 760
15 4.200 2.692 2.480 1407
16 2.312 2.273 2.310 1511
31 2.097 1.786 1.342 2600
32 1.219 1.238 1.242 2811
63 0.841 0.815 0.686 5085
64 0.641 0.666 0.665 5246
127 1.000 0.718 0.690 5061
128 0.477 0.435 0.413 8451
255 0.459 0.418 0.403 8670
256 0.285 0.233 0.232 15051
511 0.256 0.230 0.228 15285
512 0.158 0.129 0.128 27170
1023 0.134 0.140 0.138 25296
1024 0.089 0.077 0.078 44891
2047 0.094 0.088 0.088 39837
2048 0.060 0.052 0.053 66075
4095 0.071 0.068 0.068 51359
4096 0.045 0.043 0.042 83178
8191 0.059 0.058 0.057 60868
8192 0.037 0.035 0.034 102662
16383 0.052 0.051 0.051 68658
16384 0.032 0.031 0.031 113568
32767 0.050 0.049 0.049 71296
32768 0.030 0.029 0.029 120029
65535 0.067 0.067 0.068 50983
65536 0.059 0.059 0.058 59665
131071 0.067 0.067 0.067 52014
131072 0.059 0.060 0.059 59211
262143 0.067 0.066 0.066 52877
262144 0.059 0.060 0.085 40900
524287 0.067 0.066 0.065 53688
524288 0.059 0.059 0.059 59112
1048575 0.066 0.066 0.066 53181
1048576 0.060 0.060 0.060 58300
2097151 0.066 0.066 0.067 52439
2097152 0.060 0.068 0.060 57924
4194303 0.069 0.067 0.080 43425
4194304 0.062 0.080 0.062 56085
8388607 0.126 0.118 0.133 26207
8388608 0.127 0.119 0.118 29643
Newlib memset() for #c per n where c ≈ 0.273ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 443.000 440.875 440.078 8
1 437.000 437.375 440.453 8
2 226.500 226.438 227.461 15
3 150.333 150.625 151.151 23
4 113.250 113.281 113.770 31
7 66.714 67.232 66.998 52
8 58.375 58.828 58.811 59
15 31.000 30.858 31.264 112
16 31.438 28.523 28.317 123
31 27.839 29.536 50.533 69
32 11.281 10.918 11.068 315
63 12.302 11.907 11.863 294
64 4.703 4.396 4.404 793
127 2.732 2.719 2.712 1287
128 0.852 0.729 0.736 4742
255 1.188 1.178 1.171 2981
256 0.652 0.416 0.381 9171
511 1.474 1.629 1.662 2099
512 0.287 0.264 0.246 14204
1023 0.873 0.934 0.947 3684
1024 0.196 0.179 0.178 19604
2047 0.544 0.545 0.626 5572
2048 0.257 0.257 0.253 13779
4095 0.426 0.427 0.430 8110
4096 0.282 0.296 0.293 11917
8191 0.374 0.370 0.371 9402
8192 0.297 0.310 0.400 8717
16383 0.346 0.345 0.433 8062
16384 0.313 0.312 0.311 11223
32767 0.334 0.332 0.332 10505
32768 0.313 0.313 0.358 9759
65535 0.335 0.327 0.330 10589
65536 0.330 0.312 0.337 10347
131071 0.350 0.339 0.355 9825
131072 0.334 0.329 0.359 9728
262143 0.346 0.352 0.357 9785
262144 0.350 0.375 0.482 7243
524287 0.348 0.346 0.360 9691
524288 0.347 0.346 0.385 9063
1048575 0.358 0.375 0.383 9114
1048576 0.355 0.382 0.388 8987
2097151 0.362 0.368 0.390 8956
2097152 0.363 0.375 0.387 9016
4194303 0.361 0.379 0.385 9073
4194304 0.366 0.376 0.385 9074
8388607 0.363 0.366 0.372 9391
8388608 0.419 0.374 0.370 9428 */
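The small-size paths in memset.S broadcast the fill byte into a machine word by multiplying with 0x0101010101010101 (the .Lb8 constant), then store an overlapping head and tail. A hedged C sketch of the 8..16 byte case, with an illustrative helper name:

    #include <stdint.h>
    #include <string.h>

    /* Fill 8..16 bytes: broadcast c to all eight byte lanes by
       multiplication, then write an overlapping head and tail word,
       as in the .L8 path above. */
    static void fill8to16(void *dst, int c, size_t n) {
      uint64_t w = (uint8_t)c * 0x0101010101010101ull;
      memcpy(dst, &w, 8);
      memcpy((char *)dst + n - 8, &w, 8);
    }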

@ -49,7 +49,8 @@ o/$(MODE)/libc/nexgen32e/tinystrncmp.ncabi.o: \
o/$(MODE)/libc/nexgen32e/errno.o: \
OVERRIDE_CFLAGS += \
$(NO_MAGIC)
$(NO_MAGIC) \
-fno-sanitize=all
LIBC_NEXGEN32E_LIBS = $(foreach x,$(LIBC_NEXGEN32E_ARTIFACTS),$($(x)))
LIBC_NEXGEN32E_SRCS = $(foreach x,$(LIBC_NEXGEN32E_ARTIFACTS),$($(x)_SRCS))

@ -1,6 +1,7 @@
#ifndef COSMOPOLITAN_LIBC_NEXGEN32E_RDTSCP_H_
#define COSMOPOLITAN_LIBC_NEXGEN32E_RDTSCP_H_
#include "libc/bits/bits.h"
#include "libc/nexgen32e/x86feature.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_

libc/nexgen32e/sha.h (new file, 13 lines)

@ -0,0 +1,13 @@
#ifndef COSMOPOLITAN_LIBC_NEXGEN32E_SHA_H_
#define COSMOPOLITAN_LIBC_NEXGEN32E_SHA_H_
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
void sha1_transform_avx2(uint32_t[hasatleast 5], const void *, unsigned);
void sha1_transform_ni(uint32_t[hasatleast 5], const void *, unsigned);
void sha256_transform_rorx(uint32_t[hasatleast 8], const void *, unsigned);
void sha256_transform_ni(uint32_t[hasatleast 8], const void *, unsigned);
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_NEXGEN32E_SHA_H_ */
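These transforms consume whole 64-byte blocks and leave padding and state initialization to the caller (the SHA-NI sources below say so explicitly). A hedged sketch of how a caller might choose between them, assuming the X86_HAVE() feature test from x86feature.h and that the AVX2/BMI2 rorx path is an acceptable fallback on the target CPU:

    #include "libc/nexgen32e/sha.h"
    #include "libc/nexgen32e/x86feature.h"

    /* Illustrative dispatcher, not part of this commit: state[] holds the
       running SHA-256 digest, data points to complete 64-byte blocks, and
       final-block padding must already have been done by the caller. */
    static void sha256_blocks(uint32_t state[8], const void *data,
                              unsigned blocks) {
      if (X86_HAVE(SHA)) {
        sha256_transform_ni(state, data, blocks);
      } else {
        sha256_transform_rorx(state, data, blocks);  /* needs AVX2+BMI2 */
      }
    }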

@ -1,49 +1,36 @@
/*
* BSD LICENSE
*
* Copyright(c) 2014 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
*
* This implementation is based on the previous SSSE3 release:
* Visit http://software.intel.com/en-us/articles/
* and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
*
* Updates 20-byte SHA-1 record at start of 'state', from 'input', for
* even number of 'blocks' consecutive 64-byte blocks.
*
* extern "C" void sha1_transform_avx2(
* struct sha1_state *state, const uint8_t *input, int blocks );
*/
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2014 Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, │
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY │
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "libc/macros.internal.h"
.ident "\n\
@ -71,7 +58,6 @@ Copyright 2014 Intel Corporation\n"
#define REG_RTB %rbx
#define REG_T1 %r11d
#define xmm_mov vmovups
#define avx2_zeroupper vzeroupper
#define RND_F1 1
#define RND_F2 2
#define RND_F3 3
@ -84,16 +70,13 @@ Copyright 2014 Intel Corporation\n"
.set E, REG_E
.set TB, REG_TB
.set TA, REG_TA
.set RA, REG_RA
.set RB, REG_RB
.set RC, REG_RC
.set RD, REG_RD
.set RE, REG_RE
.set RTA, REG_RTA
.set RTB, REG_RTB
.set T1, REG_T1
.endm
@ -177,7 +160,6 @@ Copyright 2014 Intel Corporation\n"
PRECALC_RESET_WY
PRECALC_ROTATE_WY
.endif
/* message scheduling pre-compute for rounds 0-15 */
.if ((i & 7) == 0)
/*
@ -194,7 +176,6 @@ Copyright 2014 Intel Corporation\n"
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
.elseif ((i & 7) == 7)
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
@ -236,7 +217,6 @@ Copyright 2014 Intel Corporation\n"
vpxor WY_TMP2, WY_TMP, WY
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
@ -250,7 +230,6 @@ Copyright 2014 Intel Corporation\n"
* allows more efficient vectorization
* since w[i]=>w[i-3] dependency is broken
*/
.if ((i & 7) == 0)
/*
* blended AVX2 and ALU instruction scheduling
@ -272,14 +251,12 @@ Copyright 2014 Intel Corporation\n"
.elseif ((i & 7) == 7)
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
.macro PRECALC r, s
.set i, \r
.if (i < 40)
.set K_XMM, 32*0
.elseif (i < 80)
@ -289,7 +266,6 @@ Copyright 2014 Intel Corporation\n"
.else
.set K_XMM, 32*3
.endif
.if (i<32)
PRECALC_00_15 \s
.elseif (i<64)
@ -307,7 +283,6 @@ Copyright 2014 Intel Corporation\n"
.set B, TB
.set TB, A
.set A, T_REG
.set T_REG, RE
.set RE, RD
.set RD, RC
@ -317,9 +292,8 @@ Copyright 2014 Intel Corporation\n"
.set RA, T_REG
.endm
/* Macro relies on saved ROUND_Fx */
.macro RND_FUN f, r
// Macro relies on saved ROUND_Fx
.macro RND_FUN f, r
.if (\f == RND_F1)
ROUND_F1 \r
.elseif (\f == RND_F2)
@ -332,11 +306,11 @@ Copyright 2014 Intel Corporation\n"
.macro RR r
.set round_id, (\r % 80)
.if (round_id == 0) /* Precalculate F for first round */
.if (round_id == 0) # Precalculate F for first round
.set ROUND_FUNC, RND_F1
mov B, TB
rorx $(32-30), B, B /* b>>>2 */
rorx $(32-30), B, B # b>>>2
andn D, TB, T1
and C, TB
xor T1, TB
@ -362,40 +336,38 @@ Copyright 2014 Intel Corporation\n"
.macro ROUND_F1 r
add WK(\r), E
andn C, A, T1 /* ~b&d */
lea (RE,RTB), E /* Add F from the previous round */
andn C, A, T1 # ~b&d
lea (RE,RTB), E # Add F from the previous round
rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-30),A, TB /* b>>>2 for next round */
rorx $(32-5), A, TA # T2 = A >>> 5
rorx $(32-30),A, TB # b>>>2 for next round
PRECALC (\r) /* msg scheduling for next 2 blocks */
PRECALC (\r) # msg scheduling for next 2 blocks
/*
* Calculate F for the next round
* (b & c) ^ andn[b, d]
*/
and B, A /* b&c */
xor T1, A /* F1 = (b&c) ^ (~b&d) */
// Calculate F for the next round
// (b & c) ^ andn[b, d]
and B, A # b&c
xor T1, A # F1 = (b&c) ^ (~b&d)
lea (RE,RTA), E /* E += A >>> 5 */
lea (RE,RTA), E # E += A >>> 5
.endm
.macro ROUND_F2 r
add WK(\r), E
lea (RE,RTB), E /* Add F from the previous round */
lea (RE,RTB), E # Add F from the previous round
/* Calculate F for the next round */
rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-5), A, TA # T2 = A >>> 5
.if ((round_id) < 79)
rorx $(32-30), A, TB /* b>>>2 for next round */
rorx $(32-30), A, TB # b>>>2 for next round
.endif
PRECALC (\r) /* msg scheduling for next 2 blocks */
PRECALC (\r) # msg scheduling for next 2 blocks
.if ((round_id) < 79)
xor B, A
.endif
add TA, E /* E += A >>> 5 */
add TA, E # E += A >>> 5
.if ((round_id) < 79)
xor C, A
@ -404,30 +376,28 @@ Copyright 2014 Intel Corporation\n"
.macro ROUND_F3 r
add WK(\r), E
PRECALC (\r) /* msg scheduling for next 2 blocks */
PRECALC (\r) # msg scheduling for next 2 blocks
lea (RE,RTB), E /* Add F from the previous round */
lea (RE,RTB), E # Add F from the previous round
mov B, T1
or A, T1
rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-30), A, TB /* b>>>2 for next round */
rorx $(32-5), A, TA # T2 = A >>> 5
rorx $(32-30), A, TB # b>>>2 for next round
/* Calculate F for the next round
* (b and c) or (d and (b or c))
*/
// Calculate F for the next round
// (b and c) or (d and (b or c))
and C, T1
and B, A
or T1, A
add TA, E /* E += A >>> 5 */
add TA, E # E += A >>> 5
.endm
/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
* %1 + %2 >= %3 ? %4 : 0
*/
// Add constant only if (%2 > %3) condition met (uses RTA as temp)
// %1 + %2 >= %3 ? %4 : 0
.macro ADD_IF_GE a, b, c, d
mov \a, RTA
add $\d, RTA
@ -435,9 +405,7 @@ Copyright 2014 Intel Corporation\n"
cmovge RTA, \a
.endm
/*
* macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
*/
// Performs 80 rounds of SHA-1 for multiple blocks with s/w pipelining
.macro SHA1_PIPELINED_MAIN_BODY
REGALLOC
@ -451,7 +419,7 @@ Copyright 2014 Intel Corporation\n"
mov %rsp, PRECALC_BUF
lea (2*4*80+32)(%rsp), WK_BUF
# Precalc WK for first 2 blocks
// Precalc WK for first 2 blocks
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
.set i, 0
.rept 160
@ -459,29 +427,27 @@ Copyright 2014 Intel Corporation\n"
.set i, i + 1
.endr
/* Go to next block if needed */
// Go to next block if needed
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
xchg WK_BUF, PRECALC_BUF
.align 32
.L_loop:
/*
* code loops through more than one block
* we use K_BASE value as a signal of a last block,
* it is set below by: cmovae BUFFER_PTR, K_BASE
*/
// code loops through more than one block
// we use K_BASE value as a signal of a last block,
// it is set below by: cmovae BUFFER_PTR, K_BASE
test BLOCKS_CTR, BLOCKS_CTR
jnz .L_begin
.align 32
jmp .L_end
.align 32
.L_begin:
/*
* Do first block
* rounds: 0,2,4,6,8
*/
// process first block
// rounds: 0,2,4,6,8
.set j, 0
.rept 5
RR j
@ -491,28 +457,26 @@ Copyright 2014 Intel Corporation\n"
jmp .L_loop0
.L_loop0:
/*
* rounds:
* 10,12,14,16,18
* 20,22,24,26,28
* 30,32,34,36,38
* 40,42,44,46,48
* 50,52,54,56,58
*/
// rounds
// 10,12,14,16,18
// 20,22,24,26,28
// 30,32,34,36,38
// 40,42,44,46,48
// 50,52,54,56,58
.rept 25
RR j
.set j, j+2
.endr
/* Update Counter */
// Update Counter
sub $1, BLOCKS_CTR
/* Move to the next block only if needed*/
// Move to the next block only if needed
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
/*
* rounds
* 60,62,64,66,68
* 70,72,74,76,78
*/
// rounds
// 60,62,64,66,68
// 70,72,74,76,78
.rept 10
RR j
.set j, j+2
@ -529,12 +493,9 @@ Copyright 2014 Intel Corporation\n"
mov TB, B
/* Process second block */
/*
* rounds
* 0+80, 2+80, 4+80, 6+80, 8+80
* 10+80,12+80,14+80,16+80,18+80
*/
// process second block
// 0+80, 2+80, 4+80, 6+80, 8+80
// 10+80,12+80,14+80,16+80,18+80
.set j, 0
.rept 10
@ -544,11 +505,10 @@ Copyright 2014 Intel Corporation\n"
jmp .L_loop1
.L_loop1:
/*
* rounds
* 20+80,22+80,24+80,26+80,28+80
* 30+80,32+80,34+80,36+80,38+80
*/
// rounds
// 20+80,22+80,24+80,26+80,28+80
// 30+80,32+80,34+80,36+80,38+80
.rept 10
RR j+80
.set j, j+2
@ -557,29 +517,26 @@ Copyright 2014 Intel Corporation\n"
jmp .L_loop2
.L_loop2:
/*
* rounds
* 40+80,42+80,44+80,46+80,48+80
* 50+80,52+80,54+80,56+80,58+80
*/
// rounds
// 40+80,42+80,44+80,46+80,48+80
// 50+80,52+80,54+80,56+80,58+80
.rept 10
RR j+80
.set j, j+2
.endr
/* update counter */
// update counter
sub $1, BLOCKS_CTR
/* Move to the next block only if needed*/
// Move to the next block only if needed
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
jmp .L_loop3
.L_loop3:
/*
* rounds
* 60+80,62+80,64+80,66+80,68+80
* 70+80,72+80,74+80,76+80,78+80
*/
// rounds
// 60+80,62+80,64+80,66+80,68+80
// 70+80,72+80,74+80,76+80,78+80
.rept 10
RR j+80
.set j, j+2
@ -619,14 +576,14 @@ Copyright 2014 Intel Corporation\n"
.align 128
K_XMM_AR:
.long K1, K1, K1, K1
.long K1, K1, K1, K1
.long K2, K2, K2, K2
.long K2, K2, K2, K2
.long K3, K3, K3, K3
.long K3, K3, K3, K3
.long K4, K4, K4, K4
.long K4, K4, K4, K4
.long K1,K1,K1,K1
.long K1,K1,K1,K1
.long K2,K2,K2,K2
.long K2,K2,K2,K2
.long K3,K3,K3,K3
.long K3,K3,K3,K3
.long K4,K4,K4,K4
.long K4,K4,K4,K4
BSWAP_SHUFB_CTL:
.long 0x00010203
@ -639,6 +596,23 @@ BSWAP_SHUFB_CTL:
.long 0x0c0d0e0f
.text
// Performs Intel® AVX2 optimized SHA-1 update.
//
// This implementation is based on the previous SSSE3 release:
// Visit http://software.intel.com/en-us/articles/ and refer
// to improving-the-performance-of-the-secure-hash-algorithm-1/
//
// Updates 20-byte SHA-1 record at start of 'state', from 'input',
// for even number of 'blocks' consecutive 64-byte blocks.
//
// void sha1_transform_avx2(struct sha1_state *state,
// const uint8_t *input,
// int blocks);
//
// @param %rdi points to output digest
// @param %rsi points to input data
// @param %rdx is number of 64-byte blocks to process
// @see X86_HAVE(SHA)
sha1_transform_avx2:
push %rbp
mov %rsp,%rbp
@ -648,33 +622,23 @@ sha1_transform_avx2:
push %r13
push %r14
push %r15
RESERVE_STACK = (W_SIZE*4 + 8+24)
/* Align stack */
mov %rsp, %rbx
and $~(0x20-1), %rsp
mov %rsp,%rbx
and $~(0x20-1),%rsp
push %rbx
sub $RESERVE_STACK, %rsp
avx2_zeroupper
sub $RESERVE_STACK,%rsp
vzeroupper
/* Setup initial values */
mov CTX, HASH_PTR
mov BUF, BUFFER_PTR
mov BUF, BUFFER_PTR2
mov CNT, BLOCKS_CTR
xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
mov CTX,HASH_PTR
mov BUF,BUFFER_PTR
mov BUF,BUFFER_PTR2
mov CNT,BLOCKS_CTR
xmm_mov BSWAP_SHUFB_CTL(%rip),YMM_SHUFB_BSWAP
SHA1_PIPELINED_MAIN_BODY
avx2_zeroupper
add $RESERVE_STACK, %rsp
vzeroupper
add $RESERVE_STACK,%rsp
pop %rsp
pop %r15
pop %r14
pop %r13

libc/nexgen32e/sha1ni.S (new file, 286 lines)

@ -0,0 +1,286 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2015 Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, │
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY │
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "libc/macros.internal.h"
.text
.align 32
.ident "\n\
Intel SHA-NI (BSD-3 License)\n\
Copyright 2015 Intel Corporation\n\
Sean Gulley <sean.m.gulley@intel.com>\n\
Tim Chen <tim.c.chen@linux.intel.com>\n"
.include "libc/disclaimer.inc"
#define FRAME_SIZE 32
#define DIGEST_PTR %rdi
#define DATA_PTR %rsi
#define NUM_BLKS %rdx
#define ABCD %xmm0
#define E0 %xmm1 /* Need two E's b/c they ping pong */
#define E1 %xmm2
#define MSG0 %xmm3
#define MSG1 %xmm4
#define MSG2 %xmm5
#define MSG3 %xmm6
#define SHUF_MASK %xmm7
// Performs Intel® SHA-NI optimized SHA-1 update.
//
// The function takes a pointer to the current hash values, a
// pointer to the input data, and a number of 64 byte blocks to
// process. Once all blocks have been processed, the digest pointer
// is updated with the resulting hash value. The function only
// processes complete blocks, there is no functionality to store
// partial blocks. All message padding and hash value
// initialization must be done outside the update function.
//
// The indented lines in the loop are instructions related to
// rounds processing. The non-indented lines are instructions
// related to the message schedule.
//
// void sha1_transform_ni(uint32_t digest[static 5],
// const void *data,
// uint32_t numBlocks);
//
// @param %rdi points to output digest
// @param %rsi points to input data
// @param %rdx is number of 64-byte blocks to process
// @see X86_HAVE(SHA)
sha1_transform_ni:
push %rbp
mov %rsp,%rbp
.profilable
sub $FRAME_SIZE,%rsp
shl $6,NUM_BLKS # convert to bytes
jz .Ldone_hash
add DATA_PTR,NUM_BLKS # pointer to end of data
// load initial hash values
movdqa UPPER_WORD_MASK(%rip),E1
pinsrd $3,1*16(DIGEST_PTR),E0
movdqu 0*16(DIGEST_PTR),ABCD
pand E1,E0
pshufd $0x1B,ABCD,ABCD
movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip),SHUF_MASK
.Lloop0:
// Save hash values for addition after rounds
movdqa E0,(0*16)(%rsp)
movdqa ABCD,(1*16)(%rsp)
// Rounds 0-3
movdqu 0*16(DATA_PTR),MSG0
pshufb SHUF_MASK,MSG0
paddd MSG0,E0
movdqa ABCD,E1
sha1rnds4 $0,E0,ABCD
// Rounds 4-7
movdqu 1*16(DATA_PTR),MSG1
pshufb SHUF_MASK,MSG1
sha1nexte MSG1,E1
movdqa ABCD,E0
sha1rnds4 $0,E1,ABCD
sha1msg1 MSG1,MSG0
// Rounds 8-11
movdqu 2*16(DATA_PTR),MSG2
pshufb SHUF_MASK,MSG2
sha1nexte MSG2,E0
movdqa ABCD,E1
sha1rnds4 $0,E0,ABCD
sha1msg1 MSG2,MSG1
pxor MSG2,MSG0
// Rounds 12-15
movdqu 3*16(DATA_PTR),MSG3
pshufb SHUF_MASK,MSG3
sha1nexte MSG3,E1
movdqa ABCD,E0
sha1msg2 MSG3,MSG0
sha1rnds4 $0,E1,ABCD
sha1msg1 MSG3,MSG2
pxor MSG3,MSG1
// Rounds 16-19
sha1nexte MSG0,E0
movdqa ABCD,E1
sha1msg2 MSG0,MSG1
sha1rnds4 $0,E0,ABCD
sha1msg1 MSG0,MSG3
pxor MSG0,MSG2
// Rounds 20-23
sha1nexte MSG1,E1
movdqa ABCD,E0
sha1msg2 MSG1,MSG2
sha1rnds4 $1,E1,ABCD
sha1msg1 MSG1,MSG0
pxor MSG1,MSG3
// Rounds 24-27
sha1nexte MSG2,E0
movdqa ABCD,E1
sha1msg2 MSG2,MSG3
sha1rnds4 $1,E0,ABCD
sha1msg1 MSG2,MSG1
pxor MSG2,MSG0
// Rounds 28-31
sha1nexte MSG3,E1
movdqa ABCD,E0
sha1msg2 MSG3,MSG0
sha1rnds4 $1,E1,ABCD
sha1msg1 MSG3,MSG2
pxor MSG3,MSG1
// Rounds 32-35
sha1nexte MSG0,E0
movdqa ABCD,E1
sha1msg2 MSG0,MSG1
sha1rnds4 $1,E0,ABCD
sha1msg1 MSG0,MSG3
pxor MSG0,MSG2
// Rounds 36-39
sha1nexte MSG1,E1
movdqa ABCD,E0
sha1msg2 MSG1,MSG2
sha1rnds4 $1,E1,ABCD
sha1msg1 MSG1,MSG0
pxor MSG1,MSG3
// Rounds 40-43
sha1nexte MSG2,E0
movdqa ABCD,E1
sha1msg2 MSG2,MSG3
sha1rnds4 $2,E0,ABCD
sha1msg1 MSG2,MSG1
pxor MSG2,MSG0
// Rounds 44-47
sha1nexte MSG3,E1
movdqa ABCD,E0
sha1msg2 MSG3,MSG0
sha1rnds4 $2,E1,ABCD
sha1msg1 MSG3,MSG2
pxor MSG3,MSG1
// Rounds 48-51
sha1nexte MSG0,E0
movdqa ABCD,E1
sha1msg2 MSG0,MSG1
sha1rnds4 $2,E0,ABCD
sha1msg1 MSG0,MSG3
pxor MSG0,MSG2
// Rounds 52-55
sha1nexte MSG1,E1
movdqa ABCD,E0
sha1msg2 MSG1,MSG2
sha1rnds4 $2,E1,ABCD
sha1msg1 MSG1,MSG0
pxor MSG1,MSG3
// Rounds 56-59
sha1nexte MSG2,E0
movdqa ABCD,E1
sha1msg2 MSG2,MSG3
sha1rnds4 $2,E0,ABCD
sha1msg1 MSG2,MSG1
pxor MSG2,MSG0
// Rounds 60-63
sha1nexte MSG3,E1
movdqa ABCD,E0
sha1msg2 MSG3,MSG0
sha1rnds4 $3,E1,ABCD
sha1msg1 MSG3,MSG2
pxor MSG3,MSG1
// Rounds 64-67
sha1nexte MSG0,E0
movdqa ABCD,E1
sha1msg2 MSG0,MSG1
sha1rnds4 $3,E0,ABCD
sha1msg1 MSG0,MSG3
pxor MSG0,MSG2
// Rounds 68-71
sha1nexte MSG1,E1
movdqa ABCD,E0
sha1msg2 MSG1,MSG2
sha1rnds4 $3,E1,ABCD
pxor MSG1,MSG3
// Rounds 72-75
sha1nexte MSG2,E0
movdqa ABCD,E1
sha1msg2 MSG2,MSG3
sha1rnds4 $3,E0,ABCD
// Rounds 76-79
sha1nexte MSG3,E1
movdqa ABCD,E0
sha1rnds4 $3,E1,ABCD
// Add current hash values with previously saved
sha1nexte (0*16)(%rsp),E0
paddd (1*16)(%rsp),ABCD
// Increment data pointer and loop if more to process
add $64,DATA_PTR
cmp NUM_BLKS,DATA_PTR
jne .Lloop0
// Write hash values back in the correct order
pshufd $0x1B,ABCD,ABCD
movdqu ABCD,0*16(DIGEST_PTR)
pextrd $3,E0,1*16(DIGEST_PTR)
.Ldone_hash:
leave
ret
.endfn sha1_transform_ni,globl
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x000102030405060708090a0b0c0d0e0f
.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16
.align 16
UPPER_WORD_MASK:
.octa 0xFFFFFFFF000000000000000000000000
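Translating the contract documented above into C: the caller seeds the five-word state with the standard SHA-1 constants, feeds only complete 64-byte blocks, and performs the final padding itself. A hedged single-shot sketch (illustrative name, not code from this commit):

    #include "libc/nexgen32e/sha.h"

    /* FIPS 180-4 SHA-1 initial state; data must be `blocks` complete,
       already-padded 64-byte blocks. */
    static void sha1_oneshot_ni(uint32_t state[5], const void *data,
                                unsigned blocks) {
      state[0] = 0x67452301;
      state[1] = 0xEFCDAB89;
      state[2] = 0x98BADCFE;
      state[3] = 0x10325476;
      state[4] = 0xC3D2E1F0;
      sha1_transform_ni(state, data, blocks);
    }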

@ -50,7 +50,7 @@
#include "libc/macros.internal.h"
.ident "\n\
AVX2 SHA-256 (BSD-2 License)\n\
AVX2 SHA2 (BSD-2 License)\n\
Copyright 2013 Intel Corporation\n"
.include "libc/disclaimer.inc"
@ -598,19 +598,19 @@ sha256_transform_rorx:
.align 16
.Loop1:
vpaddd K256+0*32(SRND), X0, XFER
vpaddd kSha256x2+0*32(SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 0*32
vpaddd K256+1*32(SRND), X0, XFER
vpaddd kSha256x2+1*32(SRND), X0, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 1*32
vpaddd K256+2*32(SRND), X0, XFER
vpaddd kSha256x2+2*32(SRND), X0, XFER
vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 2*32
vpaddd K256+3*32(SRND), X0, XFER
vpaddd kSha256x2+3*32(SRND), X0, XFER
vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 3*32
@ -620,11 +620,11 @@ sha256_transform_rorx:
.Loop2:
## Do last 16 rounds with no scheduling
vpaddd K256+0*32(SRND), X0, XFER
vpaddd kSha256x2+0*32(SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 0*32
vpaddd K256+1*32(SRND), X1, XFER
vpaddd kSha256x2+1*32(SRND), X1, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 1*32
add $2*32, SRND
@ -712,7 +712,6 @@ sha256_transform_rorx:
.Ldone_hash:
mov _RSP(%rsp), %rsp
popq %r15
popq %r14
popq %r13
@ -722,52 +721,38 @@ sha256_transform_rorx:
ret
.endfn sha256_transform_rorx,globl
.section .rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.rodata.cst32
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
.octa 0x0c0d0e0f08090a0b0405060700010203
.octa 0x0c0d0e0f08090a0b0405060700010203
# shuffle xBxA -> 00BA
.rodata.cst32
_SHUF_00BA:
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
# shuffle xDxC -> DC00
.rodata.cst32
_SHUF_DC00:
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
.bss
.align 64
kSha256x2:
.zero 512
.endobj kSha256x2,globl
.previous
.init.start 201,_init_kSha256x2
push $64
pop %rcx
ezlea kSha256,dx
ezlea kSha256x2,ax
0: movaps -16(%rdx,%rcx,4),%xmm0
movaps %xmm0,-16(%rax,%rcx,8)
movaps %xmm0,-32(%rax,%rcx,8)
sub $4,%ecx
jnz 0b
.init.end 201,_init_kSha256x2

libc/nexgen32e/sha256ni.S (new file, 318 lines)

@ -0,0 +1,318 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2015 Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "libc/macros.internal.h"
.text
.align 32
.ident "\n\
Intel SHA-NI (BSD-3 License)\n\
Copyright 2015 Intel Corporation\n\
Sean Gulley <sean.m.gulley@intel.com>\n\
Tim Chen <tim.c.chen@linux.intel.com>\n"
.include "libc/disclaimer.inc"
#define DIGEST_PTR %rdi /* 1st arg */
#define DATA_PTR %rsi /* 2nd arg */
#define NUM_BLKS %rdx /* 3rd arg */
#define SHA256CONSTANTS %rax
#define MSG %xmm0
#define STATE0 %xmm1
#define STATE1 %xmm2
#define MSGTMP0 %xmm3
#define MSGTMP1 %xmm4
#define MSGTMP2 %xmm5
#define MSGTMP3 %xmm6
#define MSGTMP4 %xmm7
#define SHUF_MASK %xmm8
#define ABEF_SAVE %xmm9
#define CDGH_SAVE %xmm10
// Performs Intel® SHA-NI optimized SHA-256 update.
//
// The function takes a pointer to the current hash values, a
// pointer to the input data, and a number of 64 byte blocks to
// process. Once all blocks have been processed, the digest pointer
// is updated with the resulting hash value. The function only
// processes complete blocks; there is no functionality to store
// partial blocks. All message padding and hash value
// initialization must be done outside the update function.
//
// The indented lines in the loop are instructions related to
// rounds processing. The non-indented lines are instructions
// related to the message schedule.
//
// void sha256_transform_ni(uint32_t digest[static 8],
// const void *data,
// int32_t numBlocks);
//
// @param %rdi points to output digest
// @param %rsi points to input data
// @param %rdx is number of blocks to process
// @see X86_HAVE(SHA)
sha256_transform_ni:
.leafprologue
.profilable
shl $6,NUM_BLKS # convert to bytes
jz .Ldone_hash
add DATA_PTR,NUM_BLKS # pointer to end of data
// Load initial hash values
// Need to reorder these appropriately
// DCBA, HGFE -> ABEF, CDGH
movdqu 0*16(DIGEST_PTR),STATE0
movdqu 1*16(DIGEST_PTR),STATE1
pshufd $0xB1,STATE0,STATE0 # CDAB
pshufd $0x1B,STATE1,STATE1 # EFGH
movdqa STATE0,MSGTMP4
palignr $8,STATE1,STATE0 # ABEF
pblendw $0xF0,MSGTMP4,STATE1 # CDGH
movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip),SHUF_MASK
lea kSha256(%rip),SHA256CONSTANTS
.Lloop0:
// Save hash values for addition after rounds
movdqa STATE0,ABEF_SAVE
movdqa STATE1,CDGH_SAVE
// Rounds 0-3
movdqu 0*16(DATA_PTR),MSG
pshufb SHUF_MASK,MSG
movdqa MSG,MSGTMP0
paddd 0*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
// Rounds 4-7
movdqu 1*16(DATA_PTR),MSG
pshufb SHUF_MASK,MSG
movdqa MSG,MSGTMP1
paddd 1*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP1,MSGTMP0
// Rounds 8-11
movdqu 2*16(DATA_PTR),MSG
pshufb SHUF_MASK,MSG
movdqa MSG,MSGTMP2
paddd 2*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP2,MSGTMP1
// Rounds 12-15
movdqu 3*16(DATA_PTR),MSG
pshufb SHUF_MASK,MSG
movdqa MSG,MSGTMP3
paddd 3*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP3,MSGTMP4
palignr $4,MSGTMP2,MSGTMP4
paddd MSGTMP4,MSGTMP0
sha256msg2 MSGTMP3,MSGTMP0
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP3,MSGTMP2
// Rounds 16-19
movdqa MSGTMP0,MSG
paddd 4*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP0,MSGTMP4
palignr $4,MSGTMP3,MSGTMP4
paddd MSGTMP4,MSGTMP1
sha256msg2 MSGTMP0,MSGTMP1
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP0,MSGTMP3
// Rounds 20-23
movdqa MSGTMP1,MSG
paddd 5*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP1,MSGTMP4
palignr $4,MSGTMP0,MSGTMP4
paddd MSGTMP4,MSGTMP2
sha256msg2 MSGTMP1,MSGTMP2
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP1,MSGTMP0
// Rounds 24-27
movdqa MSGTMP2,MSG
paddd 6*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP2,MSGTMP4
palignr $4,MSGTMP1,MSGTMP4
paddd MSGTMP4,MSGTMP3
sha256msg2 MSGTMP2,MSGTMP3
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP2,MSGTMP1
// Rounds 28-31
movdqa MSGTMP3,MSG
paddd 7*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP3,MSGTMP4
palignr $4,MSGTMP2,MSGTMP4
paddd MSGTMP4,MSGTMP0
sha256msg2 MSGTMP3,MSGTMP0
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP3,MSGTMP2
// Rounds 32-35
movdqa MSGTMP0,MSG
paddd 8*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP0,MSGTMP4
palignr $4,MSGTMP3,MSGTMP4
paddd MSGTMP4,MSGTMP1
sha256msg2 MSGTMP0,MSGTMP1
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP0,MSGTMP3
// Rounds 36-39
movdqa MSGTMP1,MSG
paddd 9*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP1,MSGTMP4
palignr $4,MSGTMP0,MSGTMP4
paddd MSGTMP4,MSGTMP2
sha256msg2 MSGTMP1,MSGTMP2
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP1,MSGTMP0
// Rounds 40-43
movdqa MSGTMP2,MSG
paddd 10*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP2,MSGTMP4
palignr $4,MSGTMP1,MSGTMP4
paddd MSGTMP4,MSGTMP3
sha256msg2 MSGTMP2,MSGTMP3
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP2,MSGTMP1
// Rounds 44-47
movdqa MSGTMP3,MSG
paddd 11*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP3,MSGTMP4
palignr $4,MSGTMP2,MSGTMP4
paddd MSGTMP4,MSGTMP0
sha256msg2 MSGTMP3,MSGTMP0
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP3,MSGTMP2
// Rounds 48-51
movdqa MSGTMP0,MSG
paddd 12*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP0,MSGTMP4
palignr $4,MSGTMP3,MSGTMP4
paddd MSGTMP4,MSGTMP1
sha256msg2 MSGTMP0,MSGTMP1
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP0,MSGTMP3
// Rounds 52-55
movdqa MSGTMP1,MSG
paddd 13*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP1,MSGTMP4
palignr $4,MSGTMP0,MSGTMP4
paddd MSGTMP4,MSGTMP2
sha256msg2 MSGTMP1,MSGTMP2
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
// Rounds 56-59
movdqa MSGTMP2,MSG
paddd 14*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP2,MSGTMP4
palignr $4,MSGTMP1,MSGTMP4
paddd MSGTMP4,MSGTMP3
sha256msg2 MSGTMP2,MSGTMP3
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
// Rounds 60-63
movdqa MSGTMP3,MSG
paddd 15*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
// Add current hash values with previously saved
paddd ABEF_SAVE,STATE0
paddd CDGH_SAVE,STATE1
// Increment data pointer and loop if more to process
add $64,DATA_PTR
cmp NUM_BLKS,DATA_PTR
jne .Lloop0
// Write hash values back in the correct order
pshufd $0x1B,STATE0,STATE0 # FEBA
pshufd $0xB1,STATE1,STATE1 # DCHG
movdqa STATE0,MSGTMP4
pblendw $0xF0,STATE1,STATE0 # DCBA
palignr $8,MSGTMP4,STATE1 # HGFE
movdqu STATE0,0*16(DIGEST_PTR)
movdqu STATE1,1*16(DIGEST_PTR)
.Ldone_hash:
.leafepilogue
.endfn sha256_transform_ni,globl
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK,"aM",@progbits,16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203
.endobj PSHUFFLE_BYTE_FLIP_MASK
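Because sha256_transform_ni() only consumes whole 64-byte blocks, the caller supplies the initial hash values and does the padding itself, then serializes the eight state words big-endian to obtain the usual digest bytes. A hedged usage sketch for a short message that fits in one padded block; Sha256Tiny is a hypothetical helper, not part of the library, and the call must be guarded by the X86_HAVE(SHA) check noted above:

```c
#include <stdint.h>
#include <string.h>

/* provided by sha256ni.S; only valid when the CPU supports SHA-NI */
void sha256_transform_ni(uint32_t digest[8], const void *data,
                         int32_t numBlocks);

/* standard SHA-256 initial hash values */
static const uint32_t kSha256Iv[8] = {
    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
};

/* hypothetical one-shot helper for n <= 55 bytes, so the message,
   the 0x80 terminator, and the 64-bit bit count fit in one block */
static void Sha256Tiny(const void *msg, size_t n, uint32_t digest[8]) {
  uint8_t block[64] = {0};
  memcpy(block, msg, n);
  block[n] = 0x80;                 /* append the single 1 bit          */
  uint64_t bits = (uint64_t)n * 8; /* big-endian bit count at the end  */
  for (int i = 0; i < 8; ++i) block[63 - i] = bits >> (i * 8);
  memcpy(digest, kSha256Iv, sizeof(kSha256Iv));
  sha256_transform_ni(digest, block, 1);
}
```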

View file

@@ -51,7 +51,7 @@
#include "libc/macros.internal.h"
.ident "\n\
AVX2 SHA-512 (BSD-2 License)\n\
AVX2 SHA2 (BSD-2 License)\n\
Copyright 2013 Intel Corporation\n"
.include "libc/disclaimer.inc"

View file

@@ -1,74 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
// Returns prefix length, consisting of chars not in reject.
//
// @param rdi is string
// @param rsi is reject nul-terminated character set
// @return rax is index of first byte in charset
// @see strspn(), strtok_r()
// @asyncsignalsafe
strcspn:
push %rbp
mov %rsp,%rbp
.profilable
sub $16,%rsp
push %rdi
mov %rsi,%rdi
call strlen
pop %rdi
cmp $15,%rax
ja 4f
push %rdi
mov %rax,%rdx
pxor %xmm0,%xmm0
lea -16(%rbp),%rdi
movdqa %xmm0,(%rdi)
call MemCpy
movdqa (%rdi),%xmm1
pop %rdi
or $-1,%rax
0: inc %rax
movzbl (%rdi,%rax),%ecx
movd %ecx,%xmm0
punpcklbw %xmm0,%xmm0
punpcklwd %xmm0,%xmm0
pshufd $0,%xmm0,%xmm0
pcmpeqb %xmm1,%xmm0
pmovmskb %xmm0,%ecx
test %ecx,%ecx
jz 0b
9: leave
ret
1: cmp %ch,%cl
je 9b
inc %edx
2: mov (%rsi,%rdx),%ch
test %ch,%ch
jne 1b
inc %rax
3: mov (%rdi,%rax),%cl
test %cl,%cl
je 9b
xor %edx,%edx
jmp 2b
4: xor %eax,%eax
jmp 3b
.endfn strcspn,globl
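For reference, here is what the deleted listing computes, as a plain C sketch (strcspn_sketch is an illustrative name, not the code that replaces it in this commit): the length of the initial segment of s that contains no byte from reject.

```c
#include <stddef.h>

/* portable sketch of the strcspn() contract */
size_t strcspn_sketch(const char *s, const char *reject) {
  size_t i;
  for (i = 0; s[i]; ++i) {
    for (const char *r = reject; *r; ++r) {
      if (s[i] == *r) return i; /* s[i] is the first byte in the set */
    }
  }
  return i; /* no byte of s is in reject */
}
```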

View file

@@ -1,51 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
// Returns length of NUL-terminated string.
//
// @param rdi is non-null NUL-terminated string pointer
// @return rax is number of bytes (excluding NUL)
// @clob ax,dx,cx,xmm3,xmm4
// @note h/t agner fog
// @asyncsignalsafe
strlen: .leafprologue
.profilable
mov %rdi,%rax
mov %edi,%ecx
and $15,%ecx
and $-16,%rax
pxor %xmm4,%xmm4
movdqa (%rax),%xmm3
pcmpeqb %xmm4,%xmm3
pmovmskb %xmm3,%edx
shr %cl,%edx
shl %cl,%edx
bsf %edx,%edx
jnz 2f
1: lea 16(%rax),%rax
movdqa (%rax),%xmm3
pcmpeqb %xmm4,%xmm3
pmovmskb %xmm3,%edx
bsf %edx,%edx
jz 1b
2: add %rdx,%rax
sub %rdi,%rax
.leafepilogue
.endfn strlen,globl
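The deleted listing finds the NUL sixteen bytes at a time: it aligns the pointer down to 16, compares an aligned load against zero with pcmpeqb, extracts a bit mask with pmovmskb, and shifts away the bits for bytes that precede the string so the first (possibly overlapping) load cannot give a false hit. A sketch of the same idea with SSE2 intrinsics; strlen_sse2_sketch is an illustrative stand-in, not cosmopolitan's strlen():

```c
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

size_t strlen_sse2_sketch(const char *s) {
  const __m128i zero = _mm_setzero_si128();
  uintptr_t start = (uintptr_t)s;
  uintptr_t p = start & ~(uintptr_t)15;   /* align down to 16 bytes    */
  unsigned skip = start & 15;             /* bytes before the string   */
  unsigned mask =
      _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i *)p, zero));
  mask = (mask >> skip) << skip;          /* drop false NULs before s  */
  while (!mask) {                         /* scan 16 bytes per step    */
    p += 16;
    mask = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i *)p, zero));
  }
  return p + __builtin_ctz(mask) - start; /* offset of the first NUL   */
}
```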