Make numerous improvements

- Python static hello world now 1.8mb
- Python static fully loaded now 10mb
- Python HTTPS client now uses MbedTLS
- Python REPL now completes import stmts
- Increase stack size for Python for now
- Begin synthesizing posixpath and ntpath
- Restore Python \N{UNICODE NAME} support
- Restore Python NFKD symbol normalization
- Add optimized code path for Intel SHA-NI
- Get more Python unit tests passing faster
- Get Python help() pagination working on NT
- Python hashlib now supports MbedTLS PBKDF2
- Make memcpy/memmove/memcmp/bcmp/etc. faster
- Add Mersenne Twister and Vigna to LIBC_RAND
- Provide privileged __printf() for error code
- Fix zipos opendir() so that it reports ENOTDIR
- Add basic chmod() implementation for Windows NT
- Add Cosmo's best functions to Python cosmo module
- Pin function trace indent depth to that of caller
- Show memory diagram on invalid access in MODE=dbg
- Differentiate stack overflow on crash in MODE=dbg
- Add stb_truetype and tools for analyzing font files
- Upgrade to UNICODE 13 and reduce its binary footprint
- COMPILE.COM now logs resource usage of build commands
- Start implementing basic poll() support on bare metal
- Set getauxval(AT_EXECFN) to GetModuleFileName() on NT
- Add descriptions to strerror() in non-TINY build modes
- Add COUNTBRANCH() macro to help with micro-optimizations
- Make error / backtrace / asan / memory code more unbreakable
- Add fast perfect C implementation of μ-Law and a-Law audio codecs (sketch below)
- Make strtol() functions consistent with other libc implementations
- Improve Linenoise implementation (see also github.com/jart/bestline)
- COMPILE.COM now suppresses stdout/stderr of successful build commands
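
For context on the μ-Law/a-Law item, here is a reference (non-optimized) G.711 μ-law encoder in C. It is only a sketch of what the codec computes, in the textbook bias/segment/mantissa form; it is not the fast implementation added by this commit, and the function name is illustrative.

#include <stdint.h>

// Encode one 16-bit linear PCM sample as an 8-bit mu-law byte (G.711).
// Textbook form: clip, add the 0x84 bias, locate the segment (exponent),
// keep four mantissa bits, then complement the result.
uint8_t mulaw_encode(int16_t pcm) {
  int x = pcm;
  int sign = 0;
  if (x < 0) {
    x = -x;
    sign = 0x80;
  }
  if (x > 32635) x = 32635;  // clip so adding the bias cannot overflow
  x += 0x84;                 // bias makes the segment search uniform
  int exponent = 7;
  for (int mask = 0x4000; (x & mask) == 0 && exponent > 0; mask >>= 1) {
    exponent--;
  }
  int mantissa = (x >> (exponent + 3)) & 0x0f;
  return (uint8_t)~(sign | (exponent << 4) | mantissa);
}

Encoding a zero sample yields 0xFF, the standard μ-law code for silence; a-Law uses a different segment table and XORs the output with 0x55 instead of complementing it.
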
Justine Tunney 2021-09-27 22:58:51 -07:00
parent fa7b4f5bd1
commit 39bf41f4eb
806 changed files with 77494 additions and 63859 deletions

@@ -1,49 +1,36 @@
/*
* BSD LICENSE
*
* Copyright(c) 2014 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
*
* This implementation is based on the previous SSSE3 release:
* Visit http://software.intel.com/en-us/articles/
* and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
*
* Updates 20-byte SHA-1 record at start of 'state', from 'input', for
* even number of 'blocks' consecutive 64-byte blocks.
*
* extern "C" void sha1_transform_avx2(
* struct sha1_state *state, const uint8_t *input, int blocks );
*/
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2014 Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, │
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY │
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "libc/macros.internal.h"
.ident "\n\
@@ -71,7 +58,6 @@ Copyright 2014 Intel Corporation\n"
#define REG_RTB %rbx
#define REG_T1 %r11d
#define xmm_mov vmovups
#define avx2_zeroupper vzeroupper
#define RND_F1 1
#define RND_F2 2
#define RND_F3 3
@@ -84,16 +70,13 @@ Copyright 2014 Intel Corporation\n"
.set E, REG_E
.set TB, REG_TB
.set TA, REG_TA
.set RA, REG_RA
.set RB, REG_RB
.set RC, REG_RC
.set RD, REG_RD
.set RE, REG_RE
.set RTA, REG_RTA
.set RTB, REG_RTB
.set T1, REG_T1
.endm
@@ -177,7 +160,6 @@ Copyright 2014 Intel Corporation\n"
PRECALC_RESET_WY
PRECALC_ROTATE_WY
.endif
/* message scheduling pre-compute for rounds 0-15 */
.if ((i & 7) == 0)
/*
@@ -194,7 +176,6 @@ Copyright 2014 Intel Corporation\n"
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
.elseif ((i & 7) == 7)
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
@@ -236,7 +217,6 @@ Copyright 2014 Intel Corporation\n"
vpxor WY_TMP2, WY_TMP, WY
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
@@ -250,7 +230,6 @@ Copyright 2014 Intel Corporation\n"
* allows more efficient vectorization
* since w[i]=>w[i-3] dependency is broken
*/
.if ((i & 7) == 0)
/*
* blended AVX2 and ALU instruction scheduling
@@ -272,14 +251,12 @@ Copyright 2014 Intel Corporation\n"
.elseif ((i & 7) == 7)
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
.macro PRECALC r, s
.set i, \r
.if (i < 40)
.set K_XMM, 32*0
.elseif (i < 80)
@@ -289,7 +266,6 @@ Copyright 2014 Intel Corporation\n"
.else
.set K_XMM, 32*3
.endif
.if (i<32)
PRECALC_00_15 \s
.elseif (i<64)
@@ -307,7 +283,6 @@ Copyright 2014 Intel Corporation\n"
.set B, TB
.set TB, A
.set A, T_REG
.set T_REG, RE
.set RE, RD
.set RD, RC
@@ -317,9 +292,8 @@ Copyright 2014 Intel Corporation\n"
.set RA, T_REG
.endm
/* Macro relies on saved ROUND_Fx */
.macro RND_FUN f, r
// Macro relies on saved ROUND_Fx
.macro RND_FUN f, r
.if (\f == RND_F1)
ROUND_F1 \r
.elseif (\f == RND_F2)
@@ -332,11 +306,11 @@ Copyright 2014 Intel Corporation\n"
.macro RR r
.set round_id, (\r % 80)
.if (round_id == 0) /* Precalculate F for first round */
.if (round_id == 0) # Precalculate F for first round
.set ROUND_FUNC, RND_F1
mov B, TB
rorx $(32-30), B, B /* b>>>2 */
rorx $(32-30), B, B # b>>>2
andn D, TB, T1
and C, TB
xor T1, TB
@@ -362,40 +336,38 @@ Copyright 2014 Intel Corporation\n"
.macro ROUND_F1 r
add WK(\r), E
andn C, A, T1 /* ~b&d */
lea (RE,RTB), E /* Add F from the previous round */
andn C, A, T1 # ~b&d
lea (RE,RTB), E # Add F from the previous round
rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-30),A, TB /* b>>>2 for next round */
rorx $(32-5), A, TA # T2 = A >>> 5
rorx $(32-30),A, TB # b>>>2 for next round
PRECALC (\r) /* msg scheduling for next 2 blocks */
PRECALC (\r) # msg scheduling for next 2 blocks
/*
* Calculate F for the next round
* (b & c) ^ andn[b, d]
*/
and B, A /* b&c */
xor T1, A /* F1 = (b&c) ^ (~b&d) */
// Calculate F for the next round
// (b & c) ^ andn[b, d]
and B, A # b&c
xor T1, A # F1 = (b&c) ^ (~b&d)
lea (RE,RTA), E /* E += A >>> 5 */
lea (RE,RTA), E # E += A >>> 5
.endm
.macro ROUND_F2 r
add WK(\r), E
lea (RE,RTB), E /* Add F from the previous round */
lea (RE,RTB), E # Add F from the previous round
/* Calculate F for the next round */
rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-5), A, TA # T2 = A >>> 5
.if ((round_id) < 79)
rorx $(32-30), A, TB /* b>>>2 for next round */
rorx $(32-30), A, TB # b>>>2 for next round
.endif
PRECALC (\r) /* msg scheduling for next 2 blocks */
PRECALC (\r) # msg scheduling for next 2 blocks
.if ((round_id) < 79)
xor B, A
.endif
add TA, E /* E += A >>> 5 */
add TA, E # E += A >>> 5
.if ((round_id) < 79)
xor C, A
@@ -404,30 +376,28 @@ Copyright 2014 Intel Corporation\n"
.macro ROUND_F3 r
add WK(\r), E
PRECALC (\r) /* msg scheduling for next 2 blocks */
PRECALC (\r) # msg scheduling for next 2 blocks
lea (RE,RTB), E /* Add F from the previous round */
lea (RE,RTB), E # Add F from the previous round
mov B, T1
or A, T1
rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-30), A, TB /* b>>>2 for next round */
rorx $(32-5), A, TA # T2 = A >>> 5
rorx $(32-30), A, TB # b>>>2 for next round
/* Calculate F for the next round
* (b and c) or (d and (b or c))
*/
// Calculate F for the next round
// (b and c) or (d and (b or c))
and C, T1
and B, A
or T1, A
add TA, E /* E += A >>> 5 */
add TA, E # E += A >>> 5
.endm
/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
* %1 + %2 >= %3 ? %4 : 0
*/
// Add constant only if (%2 > %3) condition met (uses RTA as temp)
// %1 + %2 >= %3 ? %4 : 0
.macro ADD_IF_GE a, b, c, d
mov \a, RTA
add $\d, RTA
@@ -435,9 +405,7 @@ Copyright 2014 Intel Corporation\n"
cmovge RTA, \a
.endm
/*
* macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
*/
// Performs 80 rounds of SHA-1 for multiple blocks with s/w pipelining
.macro SHA1_PIPELINED_MAIN_BODY
REGALLOC
@@ -451,7 +419,7 @@ Copyright 2014 Intel Corporation\n"
mov %rsp, PRECALC_BUF
lea (2*4*80+32)(%rsp), WK_BUF
# Precalc WK for first 2 blocks
// Precalc WK for first 2 blocks
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
.set i, 0
.rept 160
@@ -459,29 +427,27 @@ Copyright 2014 Intel Corporation\n"
.set i, i + 1
.endr
/* Go to next block if needed */
// Go to next block if needed
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
xchg WK_BUF, PRECALC_BUF
.align 32
.L_loop:
/*
* code loops through more than one block
* we use K_BASE value as a signal of a last block,
* it is set below by: cmovae BUFFER_PTR, K_BASE
*/
// code loops through more than one block
// we use K_BASE value as a signal of a last block,
// it is set below by: cmovae BUFFER_PTR, K_BASE
test BLOCKS_CTR, BLOCKS_CTR
jnz .L_begin
.align 32
jmp .L_end
.align 32
.L_begin:
/*
* Do first block
* rounds: 0,2,4,6,8
*/
// process first block
// rounds: 0,2,4,6,8
.set j, 0
.rept 5
RR j
@@ -491,28 +457,26 @@ Copyright 2014 Intel Corporation\n"
jmp .L_loop0
.L_loop0:
/*
* rounds:
* 10,12,14,16,18
* 20,22,24,26,28
* 30,32,34,36,38
* 40,42,44,46,48
* 50,52,54,56,58
*/
// rounds
// 10,12,14,16,18
// 20,22,24,26,28
// 30,32,34,36,38
// 40,42,44,46,48
// 50,52,54,56,58
.rept 25
RR j
.set j, j+2
.endr
/* Update Counter */
// Update Counter */
sub $1, BLOCKS_CTR
/* Move to the next block only if needed*/
// Move to the next block only if needed*/
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
/*
* rounds
* 60,62,64,66,68
* 70,72,74,76,78
*/
// rounds
// 60,62,64,66,68
// 70,72,74,76,78
.rept 10
RR j
.set j, j+2
@@ -529,12 +493,9 @@ Copyright 2014 Intel Corporation\n"
mov TB, B
/* Process second block */
/*
* rounds
* 0+80, 2+80, 4+80, 6+80, 8+80
* 10+80,12+80,14+80,16+80,18+80
*/
// process second block
// 0+80, 2+80, 4+80, 6+80, 8+80
// 10+80,12+80,14+80,16+80,18+80
.set j, 0
.rept 10
@@ -544,11 +505,10 @@ Copyright 2014 Intel Corporation\n"
jmp .L_loop1
.L_loop1:
/*
* rounds
* 20+80,22+80,24+80,26+80,28+80
* 30+80,32+80,34+80,36+80,38+80
*/
// rounds
// 20+80,22+80,24+80,26+80,28+80
// 30+80,32+80,34+80,36+80,38+80
.rept 10
RR j+80
.set j, j+2
@@ -557,29 +517,26 @@ Copyright 2014 Intel Corporation\n"
jmp .L_loop2
.L_loop2:
/*
* rounds
* 40+80,42+80,44+80,46+80,48+80
* 50+80,52+80,54+80,56+80,58+80
*/
// rounds
// 40+80,42+80,44+80,46+80,48+80
// 50+80,52+80,54+80,56+80,58+80
.rept 10
RR j+80
.set j, j+2
.endr
/* update counter */
// update counter
sub $1, BLOCKS_CTR
/* Move to the next block only if needed*/
// Move to the next block only if needed
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
jmp .L_loop3
.L_loop3:
/*
* rounds
* 60+80,62+80,64+80,66+80,68+80
* 70+80,72+80,74+80,76+80,78+80
*/
// rounds
// 60+80,62+80,64+80,66+80,68+80
// 70+80,72+80,74+80,76+80,78+80
.rept 10
RR j+80
.set j, j+2
@@ -619,14 +576,14 @@ Copyright 2014 Intel Corporation\n"
.align 128
K_XMM_AR:
.long K1, K1, K1, K1
.long K1, K1, K1, K1
.long K2, K2, K2, K2
.long K2, K2, K2, K2
.long K3, K3, K3, K3
.long K3, K3, K3, K3
.long K4, K4, K4, K4
.long K4, K4, K4, K4
.long K1,K1,K1,K1
.long K1,K1,K1,K1
.long K2,K2,K2,K2
.long K2,K2,K2,K2
.long K3,K3,K3,K3
.long K3,K3,K3,K3
.long K4,K4,K4,K4
.long K4,K4,K4,K4
BSWAP_SHUFB_CTL:
.long 0x00010203
@@ -639,6 +596,23 @@ BSWAP_SHUFB_CTL:
.long 0x0c0d0e0f
.text
// Performs Intel® AVX2 optimized SHA-1 update.
//
// This implementation is based on the previous SSSE3 release:
// Visit http://software.intel.com/en-us/articles/ and refer
// to improving-the-performance-of-the-secure-hash-algorithm-1/
//
// Updates 20-byte SHA-1 record at start of 'state', from 'input',
// for even number of 'blocks' consecutive 64-byte blocks.
//
// void sha1_transform_avx2(struct sha1_state *state,
// const uint8_t *input,
// int blocks);
//
// @param %rdi points to output digest
// @param %rsi points to input data
// @param %rdx is number of 64-byte blocks to process
// @see X86_HAVE(SHA)
sha1_transform_avx2:
push %rbp
mov %rsp,%rbp
@@ -648,33 +622,23 @@ sha1_transform_avx2:
push %r13
push %r14
push %r15
RESERVE_STACK = (W_SIZE*4 + 8+24)
/* Align stack */
mov %rsp, %rbx
and $~(0x20-1), %rsp
mov %rsp,%rbx
and $~(0x20-1),%rsp
push %rbx
sub $RESERVE_STACK, %rsp
avx2_zeroupper
sub $RESERVE_STACK,%rsp
vzeroupper
/* Setup initial values */
mov CTX, HASH_PTR
mov BUF, BUFFER_PTR
mov BUF, BUFFER_PTR2
mov CNT, BLOCKS_CTR
xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
mov CTX,HASH_PTR
mov BUF,BUFFER_PTR
mov BUF,BUFFER_PTR2
mov CNT,BLOCKS_CTR
xmm_mov BSWAP_SHUFB_CTL(%rip),YMM_SHUFB_BSWAP
SHA1_PIPELINED_MAIN_BODY
avx2_zeroupper
add $RESERVE_STACK, %rsp
vzeroupper
add $RESERVE_STACK,%rsp
pop %rsp
pop %r15
pop %r14
pop %r13
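
For readers who want to call the routine documented above from C, a minimal sketch follows. It assumes only what the interface comment states: the first 20 bytes of 'state' hold the five 32-bit SHA-1 chaining values, the input is a whole number of 64-byte blocks, and 'blocks' is even. The struct layout and wrapper name are illustrative rather than taken from the repository, and the input must already carry SHA-1 padding if a final digest is wanted.

#include <stdint.h>

// Illustrative layout: only the leading 20 bytes are read and updated by
// the assembly routine, per the comment above; the real struct in the
// tree may carry additional bookkeeping fields after them.
struct sha1_state {
  uint32_t h[5];
};

void sha1_transform_avx2(struct sha1_state *state, const uint8_t *input,
                         int blocks);

// Compress two caller-padded 64-byte blocks and serialize the digest.
// The caller is responsible for checking CPU support (AVX2/BMI) first.
static void sha1_two_blocks(uint8_t digest[20], const uint8_t data[128]) {
  struct sha1_state st = {{0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476,
                           0xc3d2e1f0}};  // standard SHA-1 initial values
  sha1_transform_avx2(&st, data, 2);      // even number of blocks, as documented
  for (int i = 0; i < 5; ++i) {           // digest words are serialized big-endian
    digest[4 * i + 0] = (uint8_t)(st.h[i] >> 24);
    digest[4 * i + 1] = (uint8_t)(st.h[i] >> 16);
    digest[4 * i + 2] = (uint8_t)(st.h[i] >> 8);
    digest[4 * i + 3] = (uint8_t)(st.h[i]);
  }
}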