mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-07-31 23:10:27 +00:00
Make numerous improvements
- Python static hello world now 1.8mb - Python static fully loaded now 10mb - Python HTTPS client now uses MbedTLS - Python REPL now completes import stmts - Increase stack size for Python for now - Begin synthesizing posixpath and ntpath - Restore Python \N{UNICODE NAME} support - Restore Python NFKD symbol normalization - Add optimized code path for Intel SHA-NI - Get more Python unit tests passing faster - Get Python help() pagination working on NT - Python hashlib now supports MbedTLS PBKDF2 - Make memcpy/memmove/memcmp/bcmp/etc. faster - Add Mersenne Twister and Vigna to LIBC_RAND - Provide privileged __printf() for error code - Fix zipos opendir() so that it reports ENOTDIR - Add basic chmod() implementation for Windows NT - Add Cosmo's best functions to Python cosmo module - Pin function trace indent depth to that of caller - Show memory diagram on invalid access in MODE=dbg - Differentiate stack overflow on crash in MODE=dbg - Add stb_truetype and tools for analyzing font files - Upgrade to UNICODE 13 and reduce its binary footprint - COMPILE.COM now logs resource usage of build commands - Start implementing basic poll() support on bare metal - Set getauxval(AT_EXECFN) to GetModuleFileName() on NT - Add descriptions to strerror() in non-TINY build modes - Add COUNTBRANCH() macro to help with micro-optimizations - Make error / backtrace / asan / memory code more unbreakable - Add fast perfect C implementation of μ-Law and a-Law audio codecs - Make strtol() functions consistent with other libc implementations - Improve Linenoise implementation (see also github.com/jart/bestline) - COMPILE.COM now suppresses stdout/stderr of successful build commands
This commit is contained in:
parent
fa7b4f5bd1
commit
39bf41f4eb
806 changed files with 77494 additions and 63859 deletions
|
@ -1,49 +1,36 @@
|
|||
/*
|
||||
* BSD LICENSE
|
||||
*
|
||||
* Copyright(c) 2014 Intel Corporation.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* - Neither the name of Intel Corporation nor the names of its
|
||||
* contributors may be used to endorse or promote products derived
|
||||
* from this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
/*
|
||||
* SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
|
||||
*
|
||||
* This implementation is based on the previous SSSE3 release:
|
||||
* Visit http://software.intel.com/en-us/articles/
|
||||
* and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
|
||||
*
|
||||
* Updates 20-byte SHA-1 record at start of 'state', from 'input', for
|
||||
* even number of 'blocks' consecutive 64-byte blocks.
|
||||
*
|
||||
* extern "C" void sha1_transform_avx2(
|
||||
* struct sha1_state *state, const uint8_t *input, int blocks );
|
||||
*/
|
||||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ │
|
||||
│ Copyright 2014 Intel Corporation │
|
||||
│ │
|
||||
│ Redistribution and use in source and binary forms, with or without │
|
||||
│ modification, are permitted provided that the following conditions │
|
||||
│ are met: │
|
||||
│ │
|
||||
│ * Redistributions of source code must retain the above copyright │
|
||||
│ notice, this list of conditions and the following disclaimer. │
|
||||
│ * Redistributions in binary form must reproduce the above copyright │
|
||||
│ notice, this list of conditions and the following disclaimer in │
|
||||
│ the documentation and/or other materials provided with the │
|
||||
│ distribution. │
|
||||
│ * Neither the name of Intel Corporation nor the names of its │
|
||||
│ contributors may be used to endorse or promote products derived │
|
||||
│ from this software without specific prior written permission. │
|
||||
│ │
|
||||
│ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS │
|
||||
│ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT │
|
||||
│ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR │
|
||||
│ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT │
|
||||
│ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, │
|
||||
│ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT │
|
||||
│ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, │
|
||||
│ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY │
|
||||
│ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT │
|
||||
│ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE │
|
||||
│ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
|
||||
.ident "\n\
|
||||
|
@ -71,7 +58,6 @@ Copyright 2014 Intel Corporation\n"
|
|||
#define REG_RTB %rbx
|
||||
#define REG_T1 %r11d
|
||||
#define xmm_mov vmovups
|
||||
#define avx2_zeroupper vzeroupper
|
||||
#define RND_F1 1
|
||||
#define RND_F2 2
|
||||
#define RND_F3 3
|
||||
|
@ -84,16 +70,13 @@ Copyright 2014 Intel Corporation\n"
|
|||
.set E, REG_E
|
||||
.set TB, REG_TB
|
||||
.set TA, REG_TA
|
||||
|
||||
.set RA, REG_RA
|
||||
.set RB, REG_RB
|
||||
.set RC, REG_RC
|
||||
.set RD, REG_RD
|
||||
.set RE, REG_RE
|
||||
|
||||
.set RTA, REG_RTA
|
||||
.set RTB, REG_RTB
|
||||
|
||||
.set T1, REG_T1
|
||||
.endm
|
||||
|
||||
|
@ -177,7 +160,6 @@ Copyright 2014 Intel Corporation\n"
|
|||
PRECALC_RESET_WY
|
||||
PRECALC_ROTATE_WY
|
||||
.endif
|
||||
|
||||
/* message scheduling pre-compute for rounds 0-15 */
|
||||
.if ((i & 7) == 0)
|
||||
/*
|
||||
|
@ -194,7 +176,6 @@ Copyright 2014 Intel Corporation\n"
|
|||
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
|
||||
.elseif ((i & 7) == 7)
|
||||
vmovdqu WY_TMP, PRECALC_WK(i&~7)
|
||||
|
||||
PRECALC_ROTATE_WY
|
||||
.endif
|
||||
.endm
|
||||
|
@ -236,7 +217,6 @@ Copyright 2014 Intel Corporation\n"
|
|||
vpxor WY_TMP2, WY_TMP, WY
|
||||
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
|
||||
vmovdqu WY_TMP, PRECALC_WK(i&~7)
|
||||
|
||||
PRECALC_ROTATE_WY
|
||||
.endif
|
||||
.endm
|
||||
|
@ -250,7 +230,6 @@ Copyright 2014 Intel Corporation\n"
|
|||
* allows more efficient vectorization
|
||||
* since w[i]=>w[i-3] dependency is broken
|
||||
*/
|
||||
|
||||
.if ((i & 7) == 0)
|
||||
/*
|
||||
* blended AVX2 and ALU instruction scheduling
|
||||
|
@ -272,14 +251,12 @@ Copyright 2014 Intel Corporation\n"
|
|||
.elseif ((i & 7) == 7)
|
||||
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
|
||||
vmovdqu WY_TMP, PRECALC_WK(i&~7)
|
||||
|
||||
PRECALC_ROTATE_WY
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro PRECALC r, s
|
||||
.set i, \r
|
||||
|
||||
.if (i < 40)
|
||||
.set K_XMM, 32*0
|
||||
.elseif (i < 80)
|
||||
|
@ -289,7 +266,6 @@ Copyright 2014 Intel Corporation\n"
|
|||
.else
|
||||
.set K_XMM, 32*3
|
||||
.endif
|
||||
|
||||
.if (i<32)
|
||||
PRECALC_00_15 \s
|
||||
.elseif (i<64)
|
||||
|
@ -307,7 +283,6 @@ Copyright 2014 Intel Corporation\n"
|
|||
.set B, TB
|
||||
.set TB, A
|
||||
.set A, T_REG
|
||||
|
||||
.set T_REG, RE
|
||||
.set RE, RD
|
||||
.set RD, RC
|
||||
|
@ -317,9 +292,8 @@ Copyright 2014 Intel Corporation\n"
|
|||
.set RA, T_REG
|
||||
.endm
|
||||
|
||||
/* Macro relies on saved ROUND_Fx */
|
||||
|
||||
.macro RND_FUN f, r
|
||||
// Macro relies on saved ROUND_Fx
|
||||
.macro RND_FUN f, r
|
||||
.if (\f == RND_F1)
|
||||
ROUND_F1 \r
|
||||
.elseif (\f == RND_F2)
|
||||
|
@ -332,11 +306,11 @@ Copyright 2014 Intel Corporation\n"
|
|||
.macro RR r
|
||||
.set round_id, (\r % 80)
|
||||
|
||||
.if (round_id == 0) /* Precalculate F for first round */
|
||||
.if (round_id == 0) # Precalculate F for first round
|
||||
.set ROUND_FUNC, RND_F1
|
||||
mov B, TB
|
||||
|
||||
rorx $(32-30), B, B /* b>>>2 */
|
||||
rorx $(32-30), B, B # b>>>2
|
||||
andn D, TB, T1
|
||||
and C, TB
|
||||
xor T1, TB
|
||||
|
@ -362,40 +336,38 @@ Copyright 2014 Intel Corporation\n"
|
|||
.macro ROUND_F1 r
|
||||
add WK(\r), E
|
||||
|
||||
andn C, A, T1 /* ~b&d */
|
||||
lea (RE,RTB), E /* Add F from the previous round */
|
||||
andn C, A, T1 # ~b&d
|
||||
lea (RE,RTB), E # Add F from the previous round
|
||||
|
||||
rorx $(32-5), A, TA /* T2 = A >>> 5 */
|
||||
rorx $(32-30),A, TB /* b>>>2 for next round */
|
||||
rorx $(32-5), A, TA # T2 = A >>> 5
|
||||
rorx $(32-30),A, TB # b>>>2 for next round
|
||||
|
||||
PRECALC (\r) /* msg scheduling for next 2 blocks */
|
||||
PRECALC (\r) # msg scheduling for next 2 blocks
|
||||
|
||||
/*
|
||||
* Calculate F for the next round
|
||||
* (b & c) ^ andn[b, d]
|
||||
*/
|
||||
and B, A /* b&c */
|
||||
xor T1, A /* F1 = (b&c) ^ (~b&d) */
|
||||
// Calculate F for the next round
|
||||
// (b & c) ^ andn[b, d]
|
||||
and B, A # b&c
|
||||
xor T1, A # F1 = (b&c) ^ (~b&d)
|
||||
|
||||
lea (RE,RTA), E /* E += A >>> 5 */
|
||||
lea (RE,RTA), E # E += A >>> 5
|
||||
.endm
|
||||
|
||||
.macro ROUND_F2 r
|
||||
add WK(\r), E
|
||||
lea (RE,RTB), E /* Add F from the previous round */
|
||||
lea (RE,RTB), E # Add F from the previous round
|
||||
|
||||
/* Calculate F for the next round */
|
||||
rorx $(32-5), A, TA /* T2 = A >>> 5 */
|
||||
rorx $(32-5), A, TA # T2 = A >>> 5
|
||||
.if ((round_id) < 79)
|
||||
rorx $(32-30), A, TB /* b>>>2 for next round */
|
||||
rorx $(32-30), A, TB # b>>>2 for next round
|
||||
.endif
|
||||
PRECALC (\r) /* msg scheduling for next 2 blocks */
|
||||
PRECALC (\r) # msg scheduling for next 2 blocks
|
||||
|
||||
.if ((round_id) < 79)
|
||||
xor B, A
|
||||
.endif
|
||||
|
||||
add TA, E /* E += A >>> 5 */
|
||||
add TA, E # E += A >>> 5
|
||||
|
||||
.if ((round_id) < 79)
|
||||
xor C, A
|
||||
|
@ -404,30 +376,28 @@ Copyright 2014 Intel Corporation\n"
|
|||
|
||||
.macro ROUND_F3 r
|
||||
add WK(\r), E
|
||||
PRECALC (\r) /* msg scheduling for next 2 blocks */
|
||||
PRECALC (\r) # msg scheduling for next 2 blocks
|
||||
|
||||
lea (RE,RTB), E /* Add F from the previous round */
|
||||
lea (RE,RTB), E # Add F from the previous round
|
||||
|
||||
mov B, T1
|
||||
or A, T1
|
||||
|
||||
rorx $(32-5), A, TA /* T2 = A >>> 5 */
|
||||
rorx $(32-30), A, TB /* b>>>2 for next round */
|
||||
rorx $(32-5), A, TA # T2 = A >>> 5
|
||||
rorx $(32-30), A, TB # b>>>2 for next round
|
||||
|
||||
/* Calculate F for the next round
|
||||
* (b and c) or (d and (b or c))
|
||||
*/
|
||||
// Calculate F for the next round
|
||||
// (b and c) or (d and (b or c))
|
||||
and C, T1
|
||||
and B, A
|
||||
or T1, A
|
||||
|
||||
add TA, E /* E += A >>> 5 */
|
||||
add TA, E # E += A >>> 5
|
||||
|
||||
.endm
|
||||
|
||||
/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
|
||||
* %1 + %2 >= %3 ? %4 : 0
|
||||
*/
|
||||
// Add constant only if (%2 >= %3) condition met (uses RTA as temp)
|
||||
// %1 += (%2 >= %3) ? %4 : 0
|
||||
.macro ADD_IF_GE a, b, c, d
|
||||
mov \a, RTA
|
||||
add $\d, RTA
|
||||
|
@ -435,9 +405,7 @@ Copyright 2014 Intel Corporation\n"
|
|||
cmovge RTA, \a
|
||||
.endm
|
||||
|
||||
/*
|
||||
* macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
|
||||
*/
|
||||
// Performs 80 rounds of SHA-1 for multiple blocks with s/w pipelining
|
||||
.macro SHA1_PIPELINED_MAIN_BODY
|
||||
|
||||
REGALLOC
|
||||
|
@ -451,7 +419,7 @@ Copyright 2014 Intel Corporation\n"
|
|||
mov %rsp, PRECALC_BUF
|
||||
lea (2*4*80+32)(%rsp), WK_BUF
|
||||
|
||||
# Precalc WK for first 2 blocks
|
||||
// Precalc WK for first 2 blocks
|
||||
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
|
||||
.set i, 0
|
||||
.rept 160
|
||||
|
@ -459,29 +427,27 @@ Copyright 2014 Intel Corporation\n"
|
|||
.set i, i + 1
|
||||
.endr
|
||||
|
||||
/* Go to next block if needed */
|
||||
// Go to next block if needed
|
||||
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
|
||||
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
|
||||
xchg WK_BUF, PRECALC_BUF
|
||||
|
||||
.align 32
|
||||
.L_loop:
|
||||
/*
|
||||
* code loops through more than one block
|
||||
* we use K_BASE value as a signal of a last block,
|
||||
* it is set below by: cmovae BUFFER_PTR, K_BASE
|
||||
*/
|
||||
|
||||
// code loops through more than one block
|
||||
// we use K_BASE value as a signal of a last block,
|
||||
// it is set below by: cmovae BUFFER_PTR, K_BASE
|
||||
test BLOCKS_CTR, BLOCKS_CTR
|
||||
jnz .L_begin
|
||||
.align 32
|
||||
jmp .L_end
|
||||
|
||||
.align 32
|
||||
.L_begin:
|
||||
|
||||
/*
|
||||
* Do first block
|
||||
* rounds: 0,2,4,6,8
|
||||
*/
|
||||
// process first block
|
||||
// rounds: 0,2,4,6,8
|
||||
.set j, 0
|
||||
.rept 5
|
||||
RR j
|
||||
|
@ -491,28 +457,26 @@ Copyright 2014 Intel Corporation\n"
|
|||
jmp .L_loop0
|
||||
.L_loop0:
|
||||
|
||||
/*
|
||||
* rounds:
|
||||
* 10,12,14,16,18
|
||||
* 20,22,24,26,28
|
||||
* 30,32,34,36,38
|
||||
* 40,42,44,46,48
|
||||
* 50,52,54,56,58
|
||||
*/
|
||||
// rounds
|
||||
// 10,12,14,16,18
|
||||
// 20,22,24,26,28
|
||||
// 30,32,34,36,38
|
||||
// 40,42,44,46,48
|
||||
// 50,52,54,56,58
|
||||
.rept 25
|
||||
RR j
|
||||
.set j, j+2
|
||||
.endr
|
||||
|
||||
/* Update Counter */
|
||||
// Update Counter
|
||||
sub $1, BLOCKS_CTR
|
||||
/* Move to the next block only if needed*/
|
||||
|
||||
// Move to the next block only if needed
|
||||
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
|
||||
/*
|
||||
* rounds
|
||||
* 60,62,64,66,68
|
||||
* 70,72,74,76,78
|
||||
*/
|
||||
|
||||
// rounds
|
||||
// 60,62,64,66,68
|
||||
// 70,72,74,76,78
|
||||
.rept 10
|
||||
RR j
|
||||
.set j, j+2
|
||||
|
@ -529,12 +493,9 @@ Copyright 2014 Intel Corporation\n"
|
|||
|
||||
mov TB, B
|
||||
|
||||
/* Process second block */
|
||||
/*
|
||||
* rounds
|
||||
* 0+80, 2+80, 4+80, 6+80, 8+80
|
||||
* 10+80,12+80,14+80,16+80,18+80
|
||||
*/
|
||||
// process second block
|
||||
// 0+80, 2+80, 4+80, 6+80, 8+80
|
||||
// 10+80,12+80,14+80,16+80,18+80
|
||||
|
||||
.set j, 0
|
||||
.rept 10
|
||||
|
@ -544,11 +505,10 @@ Copyright 2014 Intel Corporation\n"
|
|||
|
||||
jmp .L_loop1
|
||||
.L_loop1:
|
||||
/*
|
||||
* rounds
|
||||
* 20+80,22+80,24+80,26+80,28+80
|
||||
* 30+80,32+80,34+80,36+80,38+80
|
||||
*/
|
||||
|
||||
// rounds
|
||||
// 20+80,22+80,24+80,26+80,28+80
|
||||
// 30+80,32+80,34+80,36+80,38+80
|
||||
.rept 10
|
||||
RR j+80
|
||||
.set j, j+2
|
||||
|
@ -557,29 +517,26 @@ Copyright 2014 Intel Corporation\n"
|
|||
jmp .L_loop2
|
||||
.L_loop2:
|
||||
|
||||
/*
|
||||
* rounds
|
||||
* 40+80,42+80,44+80,46+80,48+80
|
||||
* 50+80,52+80,54+80,56+80,58+80
|
||||
*/
|
||||
// rounds
|
||||
// 40+80,42+80,44+80,46+80,48+80
|
||||
// 50+80,52+80,54+80,56+80,58+80
|
||||
.rept 10
|
||||
RR j+80
|
||||
.set j, j+2
|
||||
.endr
|
||||
|
||||
/* update counter */
|
||||
// update counter
|
||||
sub $1, BLOCKS_CTR
|
||||
/* Move to the next block only if needed*/
|
||||
|
||||
// Move to the next block only if needed
|
||||
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
|
||||
|
||||
jmp .L_loop3
|
||||
.L_loop3:
|
||||
|
||||
/*
|
||||
* rounds
|
||||
* 60+80,62+80,64+80,66+80,68+80
|
||||
* 70+80,72+80,74+80,76+80,78+80
|
||||
*/
|
||||
// rounds
|
||||
// 60+80,62+80,64+80,66+80,68+80
|
||||
// 70+80,72+80,74+80,76+80,78+80
|
||||
.rept 10
|
||||
RR j+80
|
||||
.set j, j+2
|
||||
|
@ -619,14 +576,14 @@ Copyright 2014 Intel Corporation\n"
|
|||
|
||||
.align 128
|
||||
K_XMM_AR:
|
||||
.long K1, K1, K1, K1
|
||||
.long K1, K1, K1, K1
|
||||
.long K2, K2, K2, K2
|
||||
.long K2, K2, K2, K2
|
||||
.long K3, K3, K3, K3
|
||||
.long K3, K3, K3, K3
|
||||
.long K4, K4, K4, K4
|
||||
.long K4, K4, K4, K4
|
||||
.long K1,K1,K1,K1
|
||||
.long K1,K1,K1,K1
|
||||
.long K2,K2,K2,K2
|
||||
.long K2,K2,K2,K2
|
||||
.long K3,K3,K3,K3
|
||||
.long K3,K3,K3,K3
|
||||
.long K4,K4,K4,K4
|
||||
.long K4,K4,K4,K4
|
||||
|
||||
BSWAP_SHUFB_CTL:
|
||||
.long 0x00010203
|
||||
|
@ -639,6 +596,23 @@ BSWAP_SHUFB_CTL:
|
|||
.long 0x0c0d0e0f
|
||||
.text
|
||||
|
||||
// Performs Intel® AVX2™ optimized SHA-1 update.
|
||||
//
|
||||
// This implementation is based on the previous SSSE3 release:
|
||||
// Visit http://software.intel.com/en-us/articles/ and refer
|
||||
// to improving-the-performance-of-the-secure-hash-algorithm-1/
|
||||
//
|
||||
// Updates 20-byte SHA-1 record at start of 'state', from 'input',
|
||||
// for even number of 'blocks' consecutive 64-byte blocks.
|
||||
//
|
||||
// void sha1_transform_avx2(struct sha1_state *state,
|
||||
// const uint8_t *input,
|
||||
// int blocks);
|
||||
//
|
||||
// @param %rdi points to output digest
|
||||
// @param %rsi points to input data
|
||||
// @param %rdx is number of 64-byte blocks to process
|
||||
// @see X86_HAVE(SHA)
|
||||
sha1_transform_avx2:
|
||||
push %rbp
|
||||
mov %rsp,%rbp
|
||||
|
@ -648,33 +622,23 @@ sha1_transform_avx2:
|
|||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
|
||||
RESERVE_STACK = (W_SIZE*4 + 8+24)
|
||||
|
||||
/* Align stack */
|
||||
mov %rsp, %rbx
|
||||
and $~(0x20-1), %rsp
|
||||
mov %rsp,%rbx
|
||||
and $~(0x20-1),%rsp
|
||||
push %rbx
|
||||
sub $RESERVE_STACK, %rsp
|
||||
|
||||
avx2_zeroupper
|
||||
|
||||
sub $RESERVE_STACK,%rsp
|
||||
vzeroupper
|
||||
/* Setup initial values */
|
||||
mov CTX, HASH_PTR
|
||||
mov BUF, BUFFER_PTR
|
||||
|
||||
mov BUF, BUFFER_PTR2
|
||||
mov CNT, BLOCKS_CTR
|
||||
|
||||
xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
|
||||
|
||||
mov CTX,HASH_PTR
|
||||
mov BUF,BUFFER_PTR
|
||||
mov BUF,BUFFER_PTR2
|
||||
mov CNT,BLOCKS_CTR
|
||||
xmm_mov BSWAP_SHUFB_CTL(%rip),YMM_SHUFB_BSWAP
|
||||
SHA1_PIPELINED_MAIN_BODY
|
||||
|
||||
avx2_zeroupper
|
||||
|
||||
add $RESERVE_STACK, %rsp
|
||||
vzeroupper
|
||||
add $RESERVE_STACK,%rsp
|
||||
pop %rsp
|
||||
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue