Make numerous improvements

- Python static hello world now 1.8mb
- Python static fully loaded now 10mb
- Python HTTPS client now uses MbedTLS
- Python REPL now completes import stmts
- Increase stack size for Python for now
- Begin synthesizing posixpath and ntpath
- Restore Python \N{UNICODE NAME} support
- Restore Python NFKD symbol normalization
- Add optimized code path for Intel SHA-NI
- Get more Python unit tests passing faster
- Get Python help() pagination working on NT
- Python hashlib now supports MbedTLS PBKDF2
- Make memcpy/memmove/memcmp/bcmp/etc. faster
- Add Mersenne Twister and Vigna to LIBC_RAND
- Provide privileged __printf() for error code
- Fix zipos opendir() so that it reports ENOTDIR
- Add basic chmod() implementation for Windows NT
- Add Cosmo's best functions to Python cosmo module
- Pin function trace indent depth to that of caller
- Show memory diagram on invalid access in MODE=dbg
- Differentiate stack overflow on crash in MODE=dbg
- Add stb_truetype and tools for analyzing font files
- Upgrade to UNICODE 13 and reduce its binary footprint
- COMPILE.COM now logs resource usage of build commands
- Start implementing basic poll() support on bare metal
- Set getauxval(AT_EXECFN) to GetModuleFileName() on NT
- Add descriptions to strerror() in non-TINY build modes
- Add COUNTBRANCH() macro to help with micro-optimizations
- Make error / backtrace / asan / memory code more unbreakable
- Add fast perfect C implementation of μ-Law and a-Law audio codecs (sketch below)
- Make strtol() functions consistent with other libc implementations
- Improve Linenoise implementation (see also github.com/jart/bestline)
- COMPILE.COM now suppresses stdout/stderr of successful build commands
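
For context on the μ-Law/a-Law item, here is a reference (non-optimized) G.711 μ-law encoder in C. It is only a sketch of what the codec computes, in the textbook bias/segment/mantissa form; it is not the fast implementation added by this commit, and the function name is illustrative.

#include <stdint.h>

// Encode one 16-bit linear PCM sample as an 8-bit mu-law byte (G.711).
// Textbook form: clip, add the 0x84 bias, locate the segment (exponent),
// keep four mantissa bits, then complement the result.
uint8_t mulaw_encode(int16_t pcm) {
  int x = pcm;
  int sign = 0;
  if (x < 0) {
    x = -x;
    sign = 0x80;
  }
  if (x > 32635) x = 32635;  // clip so adding the bias cannot overflow
  x += 0x84;                 // bias makes the segment search uniform
  int exponent = 7;
  for (int mask = 0x4000; (x & mask) == 0 && exponent > 0; mask >>= 1) {
    exponent--;
  }
  int mantissa = (x >> (exponent + 3)) & 0x0f;
  return (uint8_t)~(sign | (exponent << 4) | mantissa);
}

Encoding a zero sample yields 0xFF, the standard μ-law code for silence; a-Law uses a different segment table and XORs the output with 0x55 instead of complementing it.
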
Justine Tunney 2021-09-27 22:58:51 -07:00
parent fa7b4f5bd1
commit 39bf41f4eb
806 changed files with 77494 additions and 63859 deletions

@@ -1,49 +1,36 @@
/*
* BSD LICENSE
*
* Copyright(c) 2014 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
*
* This implementation is based on the previous SSSE3 release:
* Visit http://software.intel.com/en-us/articles/
* and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
*
* Updates 20-byte SHA-1 record at start of 'state', from 'input', for
* even number of 'blocks' consecutive 64-byte blocks.
*
* extern "C" void sha1_transform_avx2(
* struct sha1_state *state, const uint8_t *input, int blocks );
*/
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2014 Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, │
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY │
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "libc/macros.internal.h"
.ident "\n\
@@ -71,7 +58,6 @@ Copyright 2014 Intel Corporation\n"
#define REG_RTB %rbx
#define REG_T1 %r11d
#define xmm_mov vmovups
#define avx2_zeroupper vzeroupper
#define RND_F1 1
#define RND_F2 2
#define RND_F3 3
@@ -84,16 +70,13 @@ Copyright 2014 Intel Corporation\n"
.set E, REG_E
.set TB, REG_TB
.set TA, REG_TA
.set RA, REG_RA
.set RB, REG_RB
.set RC, REG_RC
.set RD, REG_RD
.set RE, REG_RE
.set RTA, REG_RTA
.set RTB, REG_RTB
.set T1, REG_T1
.endm
@@ -177,7 +160,6 @@ Copyright 2014 Intel Corporation\n"
PRECALC_RESET_WY
PRECALC_ROTATE_WY
.endif
/* message scheduling pre-compute for rounds 0-15 */
.if ((i & 7) == 0)
/*
@@ -194,7 +176,6 @@ Copyright 2014 Intel Corporation\n"
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
.elseif ((i & 7) == 7)
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
@@ -236,7 +217,6 @@ Copyright 2014 Intel Corporation\n"
vpxor WY_TMP2, WY_TMP, WY
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
@@ -250,7 +230,6 @@ Copyright 2014 Intel Corporation\n"
* allows more efficient vectorization
* since w[i]=>w[i-3] dependency is broken
*/
.if ((i & 7) == 0)
/*
* blended AVX2 and ALU instruction scheduling
@@ -272,14 +251,12 @@ Copyright 2014 Intel Corporation\n"
.elseif ((i & 7) == 7)
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
.macro PRECALC r, s
.set i, \r
.if (i < 40)
.set K_XMM, 32*0
.elseif (i < 80)
@@ -289,7 +266,6 @@ Copyright 2014 Intel Corporation\n"
.else
.set K_XMM, 32*3
.endif
.if (i<32)
PRECALC_00_15 \s
.elseif (i<64)
@@ -307,7 +283,6 @@ Copyright 2014 Intel Corporation\n"
.set B, TB
.set TB, A
.set A, T_REG
.set T_REG, RE
.set RE, RD
.set RD, RC
@@ -317,9 +292,8 @@ Copyright 2014 Intel Corporation\n"
.set RA, T_REG
.endm
/* Macro relies on saved ROUND_Fx */
.macro RND_FUN f, r
// Macro relies on saved ROUND_Fx
.macro RND_FUN f, r
.if (\f == RND_F1)
ROUND_F1 \r
.elseif (\f == RND_F2)
@@ -332,11 +306,11 @@ Copyright 2014 Intel Corporation\n"
.macro RR r
.set round_id, (\r % 80)
.if (round_id == 0) /* Precalculate F for first round */
.if (round_id == 0) # Precalculate F for first round
.set ROUND_FUNC, RND_F1
mov B, TB
rorx $(32-30), B, B /* b>>>2 */
rorx $(32-30), B, B # b>>>2
andn D, TB, T1
and C, TB
xor T1, TB
@@ -362,40 +336,38 @@ Copyright 2014 Intel Corporation\n"
.macro ROUND_F1 r
add WK(\r), E
andn C, A, T1 /* ~b&d */
lea (RE,RTB), E /* Add F from the previous round */
andn C, A, T1 # ~b&d
lea (RE,RTB), E # Add F from the previous round
rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-30),A, TB /* b>>>2 for next round */
rorx $(32-5), A, TA # T2 = A >>> 5
rorx $(32-30),A, TB # b>>>2 for next round
PRECALC (\r) /* msg scheduling for next 2 blocks */
PRECALC (\r) # msg scheduling for next 2 blocks
/*
* Calculate F for the next round
* (b & c) ^ andn[b, d]
*/
and B, A /* b&c */
xor T1, A /* F1 = (b&c) ^ (~b&d) */
// Calculate F for the next round
// (b & c) ^ andn[b, d]
and B, A # b&c
xor T1, A # F1 = (b&c) ^ (~b&d)
lea (RE,RTA), E /* E += A >>> 5 */
lea (RE,RTA), E # E += A >>> 5
.endm
.macro ROUND_F2 r
add WK(\r), E
lea (RE,RTB), E /* Add F from the previous round */
lea (RE,RTB), E # Add F from the previous round
/* Calculate F for the next round */
rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-5), A, TA # T2 = A >>> 5
.if ((round_id) < 79)
rorx $(32-30), A, TB /* b>>>2 for next round */
rorx $(32-30), A, TB # b>>>2 for next round
.endif
PRECALC (\r) /* msg scheduling for next 2 blocks */
PRECALC (\r) # msg scheduling for next 2 blocks
.if ((round_id) < 79)
xor B, A
.endif
add TA, E /* E += A >>> 5 */
add TA, E # E += A >>> 5
.if ((round_id) < 79)
xor C, A
@@ -404,30 +376,28 @@ Copyright 2014 Intel Corporation\n"
.macro ROUND_F3 r
add WK(\r), E
PRECALC (\r) /* msg scheduling for next 2 blocks */
PRECALC (\r) # msg scheduling for next 2 blocks
lea (RE,RTB), E /* Add F from the previous round */
lea (RE,RTB), E # Add F from the previous round
mov B, T1
or A, T1
rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-30), A, TB /* b>>>2 for next round */
rorx $(32-5), A, TA # T2 = A >>> 5
rorx $(32-30), A, TB # b>>>2 for next round
/* Calculate F for the next round
* (b and c) or (d and (b or c))
*/
// Calculate F for the next round
// (b and c) or (d and (b or c))
and C, T1
and B, A
or T1, A
add TA, E /* E += A >>> 5 */
add TA, E # E += A >>> 5
.endm
/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
* %1 + %2 >= %3 ? %4 : 0
*/
// Add constant only if (%2 > %3) condition met (uses RTA as temp)
// %1 + %2 >= %3 ? %4 : 0
.macro ADD_IF_GE a, b, c, d
mov \a, RTA
add $\d, RTA
@@ -435,9 +405,7 @@ Copyright 2014 Intel Corporation\n"
cmovge RTA, \a
.endm
/*
* macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
*/
// Performs 80 rounds of SHA-1 for multiple blocks with s/w pipelining
.macro SHA1_PIPELINED_MAIN_BODY
REGALLOC
@@ -451,7 +419,7 @@ Copyright 2014 Intel Corporation\n"
mov %rsp, PRECALC_BUF
lea (2*4*80+32)(%rsp), WK_BUF
# Precalc WK for first 2 blocks
// Precalc WK for first 2 blocks
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
.set i, 0
.rept 160
@@ -459,29 +427,27 @@ Copyright 2014 Intel Corporation\n"
.set i, i + 1
.endr
/* Go to next block if needed */
// Go to next block if needed
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
xchg WK_BUF, PRECALC_BUF
.align 32
.L_loop:
/*
* code loops through more than one block
* we use K_BASE value as a signal of a last block,
* it is set below by: cmovae BUFFER_PTR, K_BASE
*/
// code loops through more than one block
// we use K_BASE value as a signal of a last block,
// it is set below by: cmovae BUFFER_PTR, K_BASE
test BLOCKS_CTR, BLOCKS_CTR
jnz .L_begin
.align 32
jmp .L_end
.align 32
.L_begin:
/*
* Do first block
* rounds: 0,2,4,6,8
*/
// process first block
// rounds: 0,2,4,6,8
.set j, 0
.rept 5
RR j
@@ -491,28 +457,26 @@ Copyright 2014 Intel Corporation\n"
jmp .L_loop0
.L_loop0:
/*
* rounds:
* 10,12,14,16,18
* 20,22,24,26,28
* 30,32,34,36,38
* 40,42,44,46,48
* 50,52,54,56,58
*/
// rounds
// 10,12,14,16,18
// 20,22,24,26,28
// 30,32,34,36,38
// 40,42,44,46,48
// 50,52,54,56,58
.rept 25
RR j
.set j, j+2
.endr
/* Update Counter */
// Update Counter */
sub $1, BLOCKS_CTR
/* Move to the next block only if needed*/
// Move to the next block only if needed*/
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
/*
* rounds
* 60,62,64,66,68
* 70,72,74,76,78
*/
// rounds
// 60,62,64,66,68
// 70,72,74,76,78
.rept 10
RR j
.set j, j+2
@@ -529,12 +493,9 @@ Copyright 2014 Intel Corporation\n"
mov TB, B
/* Process second block */
/*
* rounds
* 0+80, 2+80, 4+80, 6+80, 8+80
* 10+80,12+80,14+80,16+80,18+80
*/
// process second block
// 0+80, 2+80, 4+80, 6+80, 8+80
// 10+80,12+80,14+80,16+80,18+80
.set j, 0
.rept 10
@@ -544,11 +505,10 @@ Copyright 2014 Intel Corporation\n"
jmp .L_loop1
.L_loop1:
/*
* rounds
* 20+80,22+80,24+80,26+80,28+80
* 30+80,32+80,34+80,36+80,38+80
*/
// rounds
// 20+80,22+80,24+80,26+80,28+80
// 30+80,32+80,34+80,36+80,38+80
.rept 10
RR j+80
.set j, j+2
@@ -557,29 +517,26 @@ Copyright 2014 Intel Corporation\n"
jmp .L_loop2
.L_loop2:
/*
* rounds
* 40+80,42+80,44+80,46+80,48+80
* 50+80,52+80,54+80,56+80,58+80
*/
// rounds
// 40+80,42+80,44+80,46+80,48+80
// 50+80,52+80,54+80,56+80,58+80
.rept 10
RR j+80
.set j, j+2
.endr
/* update counter */
// update counter
sub $1, BLOCKS_CTR
/* Move to the next block only if needed*/
// Move to the next block only if needed
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
jmp .L_loop3
.L_loop3:
/*
* rounds
* 60+80,62+80,64+80,66+80,68+80
* 70+80,72+80,74+80,76+80,78+80
*/
// rounds
// 60+80,62+80,64+80,66+80,68+80
// 70+80,72+80,74+80,76+80,78+80
.rept 10
RR j+80
.set j, j+2
@@ -619,14 +576,14 @@ Copyright 2014 Intel Corporation\n"
.align 128
K_XMM_AR:
.long K1, K1, K1, K1
.long K1, K1, K1, K1
.long K2, K2, K2, K2
.long K2, K2, K2, K2
.long K3, K3, K3, K3
.long K3, K3, K3, K3
.long K4, K4, K4, K4
.long K4, K4, K4, K4
.long K1,K1,K1,K1
.long K1,K1,K1,K1
.long K2,K2,K2,K2
.long K2,K2,K2,K2
.long K3,K3,K3,K3
.long K3,K3,K3,K3
.long K4,K4,K4,K4
.long K4,K4,K4,K4
BSWAP_SHUFB_CTL:
.long 0x00010203
@@ -639,6 +596,23 @@ BSWAP_SHUFB_CTL:
.long 0x0c0d0e0f
.text
// Performs Intel® AVX2 optimized SHA-1 update.
//
// This implementation is based on the previous SSSE3 release:
// Visit http://software.intel.com/en-us/articles/ and refer
// to improving-the-performance-of-the-secure-hash-algorithm-1/
//
// Updates 20-byte SHA-1 record at start of 'state', from 'input',
// for even number of 'blocks' consecutive 64-byte blocks.
//
// void sha1_transform_avx2(struct sha1_state *state,
// const uint8_t *input,
// int blocks);
//
// @param %rdi points to output digest
// @param %rsi points to input data
// @param %rdx is number of 64-byte blocks to process
// @see X86_HAVE(SHA)
sha1_transform_avx2:
push %rbp
mov %rsp,%rbp
@@ -648,33 +622,23 @@ sha1_transform_avx2:
push %r13
push %r14
push %r15
RESERVE_STACK = (W_SIZE*4 + 8+24)
/* Align stack */
mov %rsp, %rbx
and $~(0x20-1), %rsp
mov %rsp,%rbx
and $~(0x20-1),%rsp
push %rbx
sub $RESERVE_STACK, %rsp
avx2_zeroupper
sub $RESERVE_STACK,%rsp
vzeroupper
/* Setup initial values */
mov CTX, HASH_PTR
mov BUF, BUFFER_PTR
mov BUF, BUFFER_PTR2
mov CNT, BLOCKS_CTR
xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
mov CTX,HASH_PTR
mov BUF,BUFFER_PTR
mov BUF,BUFFER_PTR2
mov CNT,BLOCKS_CTR
xmm_mov BSWAP_SHUFB_CTL(%rip),YMM_SHUFB_BSWAP
SHA1_PIPELINED_MAIN_BODY
avx2_zeroupper
add $RESERVE_STACK, %rsp
vzeroupper
add $RESERVE_STACK,%rsp
pop %rsp
pop %r15
pop %r14
pop %r13
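
For readers who want to call the routine documented above from C, a minimal sketch follows. It assumes only what the interface comment states: the first 20 bytes of 'state' hold the five 32-bit SHA-1 chaining values, the input is a whole number of 64-byte blocks, and 'blocks' is even. The struct layout and wrapper name are illustrative rather than taken from the repository, and the input must already carry SHA-1 padding if a final digest is wanted.

#include <stdint.h>

// Illustrative layout: only the leading 20 bytes are read and updated by
// the assembly routine, per the comment above; the real struct in the
// tree may carry additional bookkeeping fields after them.
struct sha1_state {
  uint32_t h[5];
};

void sha1_transform_avx2(struct sha1_state *state, const uint8_t *input,
                         int blocks);

// Compress two caller-padded 64-byte blocks and serialize the digest.
// The caller is responsible for checking CPU support (AVX2/BMI) first.
static void sha1_two_blocks(uint8_t digest[20], const uint8_t data[128]) {
  struct sha1_state st = {{0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476,
                           0xc3d2e1f0}};  // standard SHA-1 initial values
  sha1_transform_avx2(&st, data, 2);      // even number of blocks, as documented
  for (int i = 0; i < 5; ++i) {           // digest words are serialized big-endian
    digest[4 * i + 0] = (uint8_t)(st.h[i] >> 24);
    digest[4 * i + 1] = (uint8_t)(st.h[i] >> 16);
    digest[4 * i + 2] = (uint8_t)(st.h[i] >> 8);
    digest[4 * i + 3] = (uint8_t)(st.h[i]);
  }
}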