Make sha1 / sha256 / sha512 go faster

This commit is contained in:
Justine Tunney 2021-06-26 00:11:12 -07:00
parent 5144c22189
commit 2d79ab6c15
14 changed files with 2299 additions and 93 deletions

View file

@ -1,33 +0,0 @@
#if 0
/*
To the extent possible under law, Justine Tunney has waived
all copyright and related or neighboring rights to this file,
as it is written in the following disclaimers:
http://unlicense.org/
http://creativecommons.org/publicdomain/zero/1.0/
*/
#endif
#include "libc/macros.internal.h"
// kSha256Tab: table of 64 32-bit words (16 rows of 4), matching the
// SHA-256 round constants, emitted on a 16-byte boundary.
// NOTE(review): .rodata.cst16, .endobj and .source are macros from
// libc/macros.internal.h whose definitions are not visible here;
// presumably they select the section, close the object (exporting it
// as a hidden global) and record the source file name.
.rodata.cst16
.align 16
kSha256Tab:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.endobj kSha256Tab,globl,hidden
.previous
.source __FILE__

681
libc/nexgen32e/sha1.S Normal file
View file

@ -0,0 +1,681 @@
/*
* BSD LICENSE
*
* Copyright(c) 2014 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
*
* This implementation is based on the previous SSSE3 release:
* Visit http://software.intel.com/en-us/articles/
* and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
*
* Updates 20-byte SHA-1 record at start of 'state', from 'input', for
* even number of 'blocks' consecutive 64-byte blocks.
*
* extern "C" void sha1_transform_avx2(
* struct sha1_state *state, const uint8_t *input, int blocks );
*/
#include "libc/macros.internal.h"
.ident "\n\
AVX2 SHA-1 (BSD-3 License)\n\
Copyright 2014 Intel Corporation\n"
.include "libc/disclaimer.inc"
/*
 * Register assignments.
 *
 * CTX, BUF and CNT name the three incoming argument registers. The
 * working registers below reuse %rsi, %rdi and %rdx for state words,
 * so the arguments must be copied elsewhere before any round runs.
 */
#define CTX %rdi /* arg1 */
#define BUF %rsi /* arg2 */
#define CNT %rdx /* arg3 */
/* 32-bit registers initially bound to the state words A..E and to the temporaries TB, TA */
#define REG_A %ecx
#define REG_B %esi
#define REG_C %edi
#define REG_D %eax
#define REG_E %edx
#define REG_TB %ebx
#define REG_TA %r12d
/* 64-bit names of the same registers (used as lea operands) */
#define REG_RA %rcx
#define REG_RB %rsi
#define REG_RC %rdi
#define REG_RD %rax
#define REG_RE %rdx
#define REG_RTA %r12
#define REG_RTB %rbx
/* Additional 32-bit scratch register used by the round macros */
#define REG_T1 %r11d
/* Instruction aliases */
#define xmm_mov vmovups
#define avx2_zeroupper vzeroupper
/* Selector values held in ROUND_FUNC and dispatched on by RND_FUN */
#define RND_F1 1
#define RND_F2 2
#define RND_F3 3
/*
 * REGALLOC: bind the assembler symbols A..E, TA, TB, T1 (32-bit) and
 * RA..RE, RTA, RTB (64-bit) to their initial physical registers.
 * Emits no instructions. ROTATE_STATE re-binds most of these symbols,
 * so the macro is invoked again whenever the initial mapping has to be
 * restored.
 */
.macro REGALLOC
.set A, REG_A
.set B, REG_B
.set C, REG_C
.set D, REG_D
.set E, REG_E
.set TB, REG_TB
.set TA, REG_TA
.set RA, REG_RA
.set RB, REG_RB
.set RC, REG_RC
.set RD, REG_RD
.set RE, REG_RE
.set RTA, REG_RTA
.set RTB, REG_RTB
.set T1, REG_T1
.endm
/*
 * Long-lived pointers and counters. The entry code loads HASH_PTR from
 * CTX, BLOCKS_CTR from CNT and both BUFFER_PTR and BUFFER_PTR2 from
 * BUF. BUFFER_PTR feeds the low 128-bit lane of each precalc load and
 * BUFFER_PTR2 the high lane. PRECALC_BUF and WK_BUF point at the two
 * stack areas holding K+W values (one being written, one being read).
 */
#define HASH_PTR %r9
#define BLOCKS_CTR %r8
#define BUFFER_PTR %r10
#define BUFFER_PTR2 %r13
#define PRECALC_BUF %r14
#define WK_BUF %r15
/* Vector temporaries; W_TMP is the low 128 bits of WY_TMP */
#define W_TMP %xmm0
#define WY_TMP %ymm0
#define WY_TMP2 %ymm9
# AVX2 variables
/* Eight registers forming the sliding window of message-schedule words */
#define WY0 %ymm3
#define WY4 %ymm5
#define WY08 %ymm7
#define WY12 %ymm8
#define WY16 %ymm12
#define WY20 %ymm13
#define WY24 %ymm14
#define WY28 %ymm15
/* Holds the byte-swap shuffle control loaded from BSWAP_SHUFB_CTL */
#define YMM_SHUFB_BSWAP %ymm10
/*
 * Keep 2 iterations precalculated at a time:
 * - 80 DWORDs per iteration * 2
 */
#define W_SIZE (80*2*2 +16)
/*
 * WK(t): address, relative to WK_BUF, of the precomputed K+W dword
 * consumed by round t (0..79 for the first block, 80..159 for the
 * second). Each 32-byte row holds four consecutive rounds, first block
 * in the low 16 bytes and second block in the high 16 bytes.
 *
 * PRECALC_WK(t): address, relative to PRECALC_BUF, at byte offset 4*t.
 * The call sites pass i&~7, so successive stores are 32 bytes apart.
 */
#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)
/*
 * UPDATE_HASH hash, val: val += hash, then store val back to hash.
 * Afterwards both the memory operand and the register hold the sum.
 */
.macro UPDATE_HASH hash, val
add \hash, \val
mov \val, \hash
.endm
/*
 * PRECALC_RESET_WY: bind the window symbols WY_00..WY_28 to their
 * initial ymm registers and make WY_32 alias WY_00. Emits no
 * instructions.
 */
.macro PRECALC_RESET_WY
.set WY_00, WY0
.set WY_04, WY4
.set WY_08, WY08
.set WY_12, WY12
.set WY_16, WY16
.set WY_20, WY20
.set WY_24, WY24
.set WY_28, WY28
.set WY_32, WY_00
.endm
/*
 * PRECALC_ROTATE_WY: advance the sliding window by one vector. Every
 * WY_nn symbol takes over the register of its predecessor and the
 * register previously bound to WY_28 is recycled as the new WY_00.
 * The WY / WY_minus_nn aliases are then refreshed from the rotated
 * symbols. Emits no instructions.
 */
.macro PRECALC_ROTATE_WY
/* Rotate macros */
.set WY_32, WY_28
.set WY_28, WY_24
.set WY_24, WY_20
.set WY_20, WY_16
.set WY_16, WY_12
.set WY_12, WY_08
.set WY_08, WY_04
.set WY_04, WY_00
.set WY_00, WY_32
/* Define register aliases */
.set WY, WY_00
.set WY_minus_04, WY_04
.set WY_minus_08, WY_08
.set WY_minus_12, WY_12
.set WY_minus_16, WY_16
.set WY_minus_20, WY_20
.set WY_minus_24, WY_24
.set WY_minus_28, WY_28
/* WY doubles as W[i-32]: it still holds that value until overwritten */
.set WY_minus_32, WY
.endm
/*
 * PRECALC_00_15: one step of the K+W precomputation for rounds 0-15.
 * Reads the assembler symbol i (set by PRECALC) and emits at most one
 * vector instruction per step; eight consecutive steps load 16 bytes
 * from each of the two input blocks, byte-swap them, add the round
 * constant and store one 32-byte row.
 */
.macro PRECALC_00_15
.if (i == 0) # Initialize and rotate registers
PRECALC_RESET_WY
PRECALC_ROTATE_WY
.endif
/* message scheduling pre-compute for rounds 0-15 */
.if ((i & 7) == 0)
/*
 * blended AVX2 and ALU instruction scheduling
 * 1 vector iteration per 8 rounds
 */
/* low lane: 16 bytes of the block addressed by BUFFER_PTR */
vmovdqu (i * 2)(BUFFER_PTR), W_TMP
.elseif ((i & 7) == 1)
/* high lane: the same 16-byte slice of the block at BUFFER_PTR2 */
vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
WY_TMP, WY_TMP
.elseif ((i & 7) == 2)
/* reverse the bytes of every dword */
vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
.elseif ((i & 7) == 4)
/* add the round constant selected by K_XMM */
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
.elseif ((i & 7) == 7)
/* store the K+W row, then slide the register window */
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
/*
 * PRECALC_16_31: one step of the K+W precomputation for rounds 16-31.
 * Reads the assembler symbol i (set by PRECALC); the work for one row
 * is spread over eight consecutive steps and finished when (i & 7) == 7.
 */
.macro PRECALC_16_31
/*
 * message scheduling pre-compute for rounds 16-31
 * calculating last 32 w[i] values in 8 XMM registers
 * pre-calculate K+w[i] values and store to mem
 * for later load by ALU add instruction
 *
 * "brute force" vectorization for rounds 16-31 only
 * due to w[i]->w[i-3] dependency
 */
.if ((i & 7) == 0)
/*
 * blended AVX2 and ALU instruction scheduling
 * 1 vector iteration per 8 rounds
 */
/* w[i-14] */
vpalignr $8, WY_minus_16, WY_minus_12, WY
vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */
.elseif ((i & 7) == 1)
vpxor WY_minus_08, WY, WY
vpxor WY_minus_16, WY_TMP, WY_TMP
.elseif ((i & 7) == 2)
vpxor WY_TMP, WY, WY
vpslldq $12, WY, WY_TMP2
.elseif ((i & 7) == 3)
/* the shift pair below and the vpor in the next step form a rotate left by 1 */
vpslld $1, WY, WY_TMP
vpsrld $31, WY, WY
.elseif ((i & 7) == 4)
vpor WY, WY_TMP, WY_TMP
vpslld $2, WY_TMP2, WY
.elseif ((i & 7) == 5)
vpsrld $30, WY_TMP2, WY_TMP2
vpxor WY, WY_TMP, WY_TMP
.elseif ((i & 7) == 7)
vpxor WY_TMP2, WY_TMP, WY
/* add the round constant, store the K+W row, slide the window */
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
/*
 * PRECALC_32_79: one step of the K+W precomputation for rounds 32-79.
 * Reads the assembler symbol i (set by PRECALC); the work for one row
 * is spread over eight consecutive steps and finished when (i & 7) == 7.
 */
.macro PRECALC_32_79
/*
 * in SHA-1 specification:
 * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * instead we do equal:
 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * allows more efficient vectorization
 * since w[i]=>w[i-3] dependency is broken
 */
.if ((i & 7) == 0)
/*
 * blended AVX2 and ALU instruction scheduling
 * 1 vector iteration per 8 rounds
 */
vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
.elseif ((i & 7) == 1)
/* W is W_minus_32 before xor */
vpxor WY_minus_28, WY, WY
.elseif ((i & 7) == 2)
vpxor WY_minus_16, WY_TMP, WY_TMP
.elseif ((i & 7) == 3)
vpxor WY_TMP, WY, WY
.elseif ((i & 7) == 4)
/* this shift and the shift/or pair in the next step form a rotate left by 2 */
vpslld $2, WY, WY_TMP
.elseif ((i & 7) == 5)
vpsrld $30, WY, WY
vpor WY, WY_TMP, WY
.elseif ((i & 7) == 7)
/* add the round constant, store the K+W row, slide the window */
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
/*
 * PRECALC r, s: emit precalc step r (0..159) of the K+W computation.
 * Sets the assembler symbol i to r, picks the byte offset K_XMM of the
 * matching 32-byte constant row inside K_XMM_AR, and dispatches to the
 * stage macro covering that step. Two steps correspond to one round,
 * hence the thresholds 32/64/160 for rounds 16/32/80.
 * The s argument is forwarded textually to the stage macros, which
 * declare no parameters; the call sites visible here leave it empty.
 */
.macro PRECALC r, s
.set i, \r
.if (i < 40)
.set K_XMM, 32*0
.elseif (i < 80)
.set K_XMM, 32*1
.elseif (i < 120)
.set K_XMM, 32*2
.else
.set K_XMM, 32*3
.endif
.if (i<32)
PRECALC_00_15 \s
.elseif (i<64)
PRECALC_16_31 \s
.elseif (i < 160)
PRECALC_32_79 \s
.endif
.endm
/*
 * ROTATE_STATE: rename the state symbols after a round instead of
 * moving data. E, D and C take the registers of D, C and B; B takes
 * the register of TB; TB takes the register of A; A takes the old
 * register of E. The 64-bit names are rotated the same way. TA/RTA
 * and T1 are not part of the rotation. Emits no instructions.
 */
.macro ROTATE_STATE
.set T_REG, E
.set E, D
.set D, C
.set C, B
.set B, TB
.set TB, A
.set A, T_REG
.set T_REG, RE
.set RE, RD
.set RD, RC
.set RC, RB
.set RB, RTB
.set RTB, RA
.set RA, T_REG
.endm
/*
 * RND_FUN f, r: expand the round macro selected by f (RND_F1, RND_F2
 * or RND_F3) for round r. Callers pass the current ROUND_FUNC symbol
 * as f. Any other selector value expands to nothing.
 */
.macro RND_FUN f, r
.if (\f == RND_F1)
ROUND_F1 \r
.elseif (\f == RND_F2)
ROUND_F2 \r
.elseif (\f == RND_F3)
ROUND_F3 \r
.endif
.endm
/*
 * RR r: emit two consecutive rounds, r and r+1 (r is even, 0..158).
 * Every round macro computes the F value consumed by the round after
 * it, so round 0 of a block needs F computed up front, and ROUND_FUNC
 * is switched after rounds 18, 38 and 58, one round ahead of the
 * 20/40/60 boundaries.
 */
.macro RR r
.set round_id, (\r % 80)
.if (round_id == 0) /* Precalculate F for first round */
.set ROUND_FUNC, RND_F1
mov B, TB
rorx $(32-30), B, B /* b>>>2 */
andn D, TB, T1
and C, TB
xor T1, TB
.endif
RND_FUN ROUND_FUNC, \r
ROTATE_STATE
.if (round_id == 18)
.set ROUND_FUNC, RND_F2
.elseif (round_id == 38)
.set ROUND_FUNC, RND_F3
.elseif (round_id == 58)
.set ROUND_FUNC, RND_F2
.endif
/* round_id is read by ROUND_F2 to detect the last round of a block */
.set round_id, ( (\r+1) % 80)
RND_FUN ROUND_FUNC, (\r+1)
ROTATE_STATE
.endm
/*
 * ROUND_F1 r: one round whose next-round function is
 * F1 = (b & c) ^ (~b & d).
 * E accumulates K+W for round r, the F value left in TB by the
 * previous round, and A rotated left by 5. The macro leaves F for the
 * next round in A and A rotated left by 30 in TB; ROTATE_STATE then
 * renames those registers to TB and B. One precalc step for the next
 * pair of blocks is interleaved.
 */
.macro ROUND_F1 r
add WK(\r), E
andn C, A, T1 /* ~b&d */
lea (RE,RTB), E /* Add F from the previous round */
rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-30),A, TB /* b>>>2 for next round */
PRECALC (\r) /* msg scheduling for next 2 blocks */
/*
 * Calculate F for the next round
 * (b & c) ^ andn[b, d]
 */
and B, A /* b&c */
xor T1, A /* F1 = (b&c) ^ (~b&d) */
lea (RE,RTA), E /* E += A >>> 5 */
.endm
/*
 * ROUND_F2 r: one round whose next-round function is the xor of the
 * three state words (A ^= B, then A ^= C in this round's naming).
 * E accumulates K+W for round r, the F value left in TB by the
 * previous round, and A rotated left by 5. When round_id is 79 (last
 * round of a block) the preparation for a following round is skipped.
 * One precalc step for the next pair of blocks is interleaved.
 */
.macro ROUND_F2 r
add WK(\r), E
lea (RE,RTB), E /* Add F from the previous round */
/* Calculate F for the next round */
rorx $(32-5), A, TA /* T2 = A >>> 5 */
.if ((round_id) < 79)
rorx $(32-30), A, TB /* b>>>2 for next round */
.endif
PRECALC (\r) /* msg scheduling for next 2 blocks */
.if ((round_id) < 79)
xor B, A
.endif
add TA, E /* E += A >>> 5 */
.if ((round_id) < 79)
xor C, A
.endif
.endm
/*
 * ROUND_F3 r: one round whose next-round function is the majority
 * function (b & c) | (d & (b | c)).
 * E accumulates K+W for round r, the F value left in TB by the
 * previous round, and A rotated left by 5. The macro leaves F for the
 * next round in A and A rotated left by 30 in TB. One precalc step for
 * the next pair of blocks is interleaved.
 */
.macro ROUND_F3 r
add WK(\r), E
PRECALC (\r) /* msg scheduling for next 2 blocks */
lea (RE,RTB), E /* Add F from the previous round */
mov B, T1
or A, T1
rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-30), A, TB /* b>>>2 for next round */
/* Calculate F for the next round
 * (b and c) or (d and (b or c))
 */
and C, T1
and B, A
or T1, A
add TA, E /* E += A >>> 5 */
.endm
/*
 * ADD_IF_GE a, b, c, d: a += d when b >= c (signed comparison),
 * otherwise a is left unchanged.
 *   a, b: registers; c, d: immediate values.
 * Uses RTA as scratch and modifies the flags. Being branch-free, the
 * macro can be dropped between rounds without disturbing control flow.
 */
.macro ADD_IF_GE a, b, c, d
mov \a, RTA
add $\d, RTA
cmp $\c, \b
cmovge RTA, \a
.endm
/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 *
 * Expects HASH_PTR, BLOCKS_CTR, BUFFER_PTR, BUFFER_PTR2 and
 * YMM_SHUFB_BSWAP to be initialised and %rsp to point at the scratch
 * area. Blocks are processed in pairs: while the rounds of the current
 * pair consume K+W values from WK_BUF, the interleaved PRECALC steps
 * fill PRECALC_BUF for the next pair; the two buffers are swapped after
 * each pair. The macro defines local labels, so it can be expanded only
 * once per object file.
 */
.macro SHA1_PIPELINED_MAIN_BODY
REGALLOC
/* Load the five state words */
mov (HASH_PTR), A
mov 4(HASH_PTR), B
mov 8(HASH_PTR), C
mov 12(HASH_PTR), D
mov 16(HASH_PTR), E
/* First K+W buffer at %rsp, second one 2*4*80+32 bytes above it */
mov %rsp, PRECALC_BUF
lea (2*4*80+32)(%rsp), WK_BUF
# Precalc WK for first 2 blocks
/* BUFFER_PTR2 moves on to the second block only if one exists */
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
.set i, 0
.rept 160
PRECALC i
.set i, i + 1
.endr
/* Go to next block if needed */
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
/* The values just computed become the ones the rounds read */
xchg WK_BUF, PRECALC_BUF
.align 32
.L_loop:
/*
 * code loops through more than one block
 * BLOCKS_CTR counts the blocks still to be hashed; the loop is left
 * once it reaches zero
 */
test BLOCKS_CTR, BLOCKS_CTR
jnz .L_begin
.align 32
jmp .L_end
.align 32
.L_begin:
/*
 * Do first block
 * rounds: 0,2,4,6,8
 */
.set j, 0
.rept 5
RR j
.set j, j+2
.endr
jmp .L_loop0
.L_loop0:
/*
 * rounds:
 * 10,12,14,16,18
 * 20,22,24,26,28
 * 30,32,34,36,38
 * 40,42,44,46,48
 * 50,52,54,56,58
 */
.rept 25
RR j
.set j, j+2
.endr
/* Update Counter */
sub $1, BLOCKS_CTR
/* Move to the next block only if needed*/
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
/*
 * rounds
 * 60,62,64,66,68
 * 70,72,74,76,78
 */
.rept 10
RR j
.set j, j+2
.endr
/* Fold the first block into the hash state */
UPDATE_HASH (HASH_PTR), A
UPDATE_HASH 4(HASH_PTR), TB
UPDATE_HASH 8(HASH_PTR), C
UPDATE_HASH 12(HASH_PTR), D
UPDATE_HASH 16(HASH_PTR), E
/* No second block left: go back to the loop head, which exits */
test BLOCKS_CTR, BLOCKS_CTR
jz .L_loop
mov TB, B
/* Process second block */
/*
 * rounds
 * 0+80, 2+80, 4+80, 6+80, 8+80
 * 10+80,12+80,14+80,16+80,18+80
 */
.set j, 0
.rept 10
RR j+80
.set j, j+2
.endr
jmp .L_loop1
.L_loop1:
/*
 * rounds
 * 20+80,22+80,24+80,26+80,28+80
 * 30+80,32+80,34+80,36+80,38+80
 */
.rept 10
RR j+80
.set j, j+2
.endr
jmp .L_loop2
.L_loop2:
/*
 * rounds
 * 40+80,42+80,44+80,46+80,48+80
 * 50+80,52+80,54+80,56+80,58+80
 */
.rept 10
RR j+80
.set j, j+2
.endr
/* update counter */
sub $1, BLOCKS_CTR
/* Move to the next block only if needed*/
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
jmp .L_loop3
.L_loop3:
/*
 * rounds
 * 60+80,62+80,64+80,66+80,68+80
 * 70+80,72+80,74+80,76+80,78+80
 */
.rept 10
RR j+80
.set j, j+2
.endr
/* Fold the second block into the hash state */
UPDATE_HASH (HASH_PTR), A
UPDATE_HASH 4(HASH_PTR), TB
UPDATE_HASH 8(HASH_PTR), C
UPDATE_HASH 12(HASH_PTR), D
UPDATE_HASH 16(HASH_PTR), E
/* Reset state for AVX2 reg permutation */
/* Move the values into the registers REGALLOC is about to re-bind */
mov A, TA
mov TB, A
mov C, TB
mov E, C
mov D, B
mov TA, D
REGALLOC
/* Swap buffers: the next pair reads what this pair precomputed */
xchg WK_BUF, PRECALC_BUF
jmp .L_loop
.align 32
.L_end:
.endm
/*
 * Read-only constants.
 *
 * K_XMM_AR: the four SHA-1 round constants, each replicated eight
 * times so that one constant fills a 32-byte vector; PRECALC selects a
 * row through the byte offset K_XMM (0, 32, 64 or 96).
 *
 * BSWAP_SHUFB_CTL: vpshufb control, identical in both 128-bit lanes,
 * that reverses the byte order within every dword.
 */
.section .rodata
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6
.align 128
K_XMM_AR:
.long K1, K1, K1, K1
.long K1, K1, K1, K1
.long K2, K2, K2, K2
.long K2, K2, K2, K2
.long K3, K3, K3, K3
.long K3, K3, K3, K3
.long K4, K4, K4, K4
.long K4, K4, K4, K4
BSWAP_SHUFB_CTL:
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
/*
 * sha1_transform_avx2
 *   CTX (%rdi): hash state; five 32-bit words are read and updated
 *   BUF (%rsi): input data
 *   CNT (%rdx): number of blocks
 *
 * Saves the callee-saved registers it uses, reserves a stack scratch
 * area for the precomputed K+W values, runs the pipelined main body
 * and restores the stack and registers before returning.
 * NOTE(review): CNT is copied as a full 64-bit register although the
 * prototype in the file header declares an int; confirm that callers
 * pass a properly extended value.
 */
.text
sha1_transform_avx2:
push %rbx
push %r12
push %r13
push %r14
push %r15
RESERVE_STACK = (W_SIZE*4 + 8+24)
/* Align stack */
/* The original %rsp is pushed right below the 32-byte boundary and
 * popped back in the epilogue. The K+W stores use vmovdqu, so the
 * scratch area itself needs no particular alignment. */
mov %rsp, %rbx
and $~(0x20-1), %rsp
push %rbx
sub $RESERVE_STACK, %rsp
avx2_zeroupper
/* Setup initial values */
/* The argument registers are reused as state registers by the rounds,
 * so the arguments are moved to dedicated registers first. */
mov CTX, HASH_PTR
mov BUF, BUFFER_PTR
mov BUF, BUFFER_PTR2
mov CNT, BLOCKS_CTR
xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
SHA1_PIPELINED_MAIN_BODY
/* Clear the upper ymm halves before returning to the caller */
avx2_zeroupper
add $RESERVE_STACK, %rsp
/* Restore the pre-alignment stack pointer saved above */
pop %rsp
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
ret
.endfn sha1_transform_avx2,globl

769
libc/nexgen32e/sha256.S Normal file
View file

@ -0,0 +1,769 @@
/////////////////////////////////////////////////////////////////////////
// Implement fast SHA-256 with AVX2 instructions. (x86_64)
//
// Copyright (C) 2013 Intel Corporation.
//
// Authors:
// James Guilford <james.guilford@intel.com>
// Kirk Yap <kirk.s.yap@intel.com>
// Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses. You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or
// without modification, are permitted provided that the following
// conditions are met:
//
// - Redistributions of source code must retain the above
// copyright notice, this list of conditions and the following
// disclaimer.
//
// - Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials
// provided with the distribution.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
/////////////////////////////////////////////////////////////////////////
//
// This code is described in an Intel White-Paper:
// "Fast SHA-256 Implementations on Intel Architecture Processors"
//
// To find it, surf to http://www.intel.com/p/en_US/embedded
// and search for that title.
//
/////////////////////////////////////////////////////////////////////////
// This code schedules 2 blocks at a time, with 4 lanes per block
/////////////////////////////////////////////////////////////////////////
#include "libc/macros.internal.h"
.ident "\n\
AVX2 SHA-256 (BSD-2 License)\n\
Copyright 2013 Intel Corporation\n"
.include "libc/disclaimer.inc"
## assume buffers not aligned
## VMOVDQ is the move used for every load from the input buffer; it
## expands to the unaligned form so that any input address is accepted.
#define VMOVDQ vmovdqu
################################ Define Macros
# addm [mem], reg
# Adds the memory operand into reg, then stores reg back to memory,
# so afterwards both hold the sum.
.macro addm p1 p2
add \p1, \p2
mov \p2, \p1
.endm
################################
# Message-schedule vectors; rotate_Xs re-binds these four symbols.
X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7
# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7
# Vector temporaries and constant registers
XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER = %ymm9
XTMP5 = %ymm11
SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13
X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
# Incoming arguments. Several of these registers are reused below, so
# the routine spills CTX and INP to the stack frame before the rounds.
NUM_BLKS = %rdx # 3rd arg
INP = %rsi # 2nd arg
CTX = %rdi # 1st arg
# Scalar working state a..h plus temporaries; ROTATE_ARGS re-binds a..h.
c = %ecx
d = %r8d
e = %edx # clobbers NUM_BLKS
y3 = %esi # clobbers INP
SRND = CTX # SRND is same register as CTX
a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d
T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d
# Stack frame: sizes of the individual areas ...
_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE = 0
_INP_END_SIZE = 8
_INP_SIZE = 8
_CTX_SIZE = 8
_RSP_SIZE = 8
# ... and their offsets from the aligned %rsp, laid out back to back.
_XFER = 0
_XMM_SAVE = _XFER + _XFER_SIZE
_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
_INP = _INP_END + _INP_END_SIZE
_CTX = _INP + _INP_SIZE
_RSP = _CTX + _CTX_SIZE
STACK_SIZE = _RSP + _RSP_SIZE
# rotate_Xs
# Rotate values of symbols X0...X3
# X0..X2 take the registers of X1..X3 and X3 takes the old register of
# X0. Only the symbol bindings change; no instruction is emitted.
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm
# ROTATE_ARGS
# Rotate values of symbols a...h
# Each of b..h takes the register of its predecessor and a takes the
# old register of h. The symbol old_h keeps naming the register that h
# had before the rotation. No instruction is emitted.
.macro ROTATE_ARGS
old_h = h
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
# FOUR_ROUNDS_AND_SCHED disp
# Runs four SHA-256 rounds on the scalar state a..h while the
# interleaved vector instructions derive the next four message-schedule
# dwords per 128-bit lane from X0..X3. The new words replace X0 and the
# X symbols are rotated at the end.
# Round N+k reads its precomputed k+w dword from disp + 4*k relative to
# (%rsp, SRND).
# In the per-line notes, >> applied to a or e stands for the rotate
# performed by rorx; the vector shifts are spelled out separately.
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################
mov a, y3 # y3 = a # MAJA
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
addl \disp(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
mov f, y2 # y2 = f # CH
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
xor g, y2 # y2 = f^g # CH
vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $22, a, y1 # y1 = a >> 22 # S0A
add h, d # d = k + w + h + d # --
and b, y3 # y3 = (a|c)&b # MAJA
vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a, T1 # T1 = (a >> 2) # S0
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
vpsrld $7, XTMP1, XTMP2
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
vpslld $(32-7), XTMP1, XTMP3
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
vpsrld $18, XTMP1, XTMP2
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
add y3, h # h = t1 + S0 + MAJ # --
ROTATE_ARGS
################################### RND N + 1 ############################
mov a, y3 # y3 = a # MAJA
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
offset = \disp + 1*4
addl offset(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
mov f, y2 # y2 = f # CH
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
xor g, y2 # y2 = f^g # CH
rorx $6, e, y1 # y1 = (e >> 6) # S1
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $22, a, y1 # y1 = a >> 22 # S0A
and e, y2 # y2 = (f^g)&e # CH
add h, d # d = k + w + h + d # --
vpslld $(32-18), XTMP1, XTMP1
and b, y3 # y3 = (a|c)&b # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
vpxor XTMP1, XTMP3, XTMP3
rorx $2, a, T1 # T1 = (a >> 2) # S0
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
add y3, h # h = t1 + S0 + MAJ # --
vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
ROTATE_ARGS
################################### RND N + 2 ############################
mov a, y3 # y3 = a # MAJA
rorx $25, e, y0 # y0 = e >> 25 # S1A
offset = \disp + 2*4
addl offset(%rsp, SRND), h # h = k + w + h # --
vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
rorx $11, e, y1 # y1 = e >> 11 # S1B
or c, y3 # y3 = a|c # MAJA
mov f, y2 # y2 = f # CH
xor g, y2 # y2 = f^g # CH
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
and e, y2 # y2 = (f^g)&e # CH
rorx $6, e, y1 # y1 = (e >> 6) # S1
vpxor XTMP3, XTMP2, XTMP2
add h, d # d = k + w + h + d # --
and b, y3 # y3 = (a|c)&b # MAJA
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $22, a, y1 # y1 = a >> 22 # S0A
vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a ,T1 # T1 = (a >> 2) # S0
vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
add y1,h # h = k + w + h + S0 # --
add y2,d # d = k + w + h + d + S1 + CH = d + t1 # --
add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
add y3,h # h = t1 + S0 + MAJ # --
ROTATE_ARGS
################################### RND N + 3 ############################
mov a, y3 # y3 = a # MAJA
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
offset = \disp + 3*4
addl offset(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
mov f, y2 # y2 = f # CH
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
xor g, y2 # y2 = f^g # CH
vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
add h, d # d = k + w + h + d # --
and b, y3 # y3 = (a|c)&b # MAJA
vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
vpxor XTMP3, XTMP2, XTMP2
rorx $22, a, y1 # y1 = a >> 22 # S0A
add y0, y2 # y2 = S1 + CH # --
vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
rorx $2, a, T1 # T1 = (a >> 2) # S0
vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
add y3, h # h = t1 + S0 + MAJ # --
ROTATE_ARGS
rotate_Xs
.endm
# DO_4ROUNDS disp
# Runs four SHA-256 rounds with no message scheduling. Round N+k reads
# its precomputed k+w dword from disp + 4*k relative to (%rsp, SRND).
# For rounds N+0..N+2 the last two additions into h are postponed until
# after ROTATE_ARGS and issued through old_h inside the following round;
# round N+3 completes its own h before the macro ends.
# In the per-line notes, >> applied to a or e stands for the rotate
# performed by rorx.
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################
mov f, y2 # y2 = f # CH
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
xor g, y2 # y2 = f^g # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
rorx $22, a, y1 # y1 = a >> 22 # S0A
mov a, y3 # y3 = a # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a, T1 # T1 = (a >> 2) # S0
addl \disp(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and b, y3 # y3 = (a|c)&b # MAJA
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
ROTATE_ARGS
################################### RND N + 1 ###########################
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
mov f, y2 # y2 = f # CH
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
xor g, y2 # y2 = f^g # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
add y3, old_h # h = t1 + S0 + MAJ # --
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
rorx $22, a, y1 # y1 = a >> 22 # S0A
mov a, y3 # y3 = a # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a, T1 # T1 = (a >> 2) # S0
offset = 4*1 + \disp
addl offset(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and b, y3 # y3 = (a|c)&b # MAJA
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
ROTATE_ARGS
################################### RND N + 2 ##############################
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
mov f, y2 # y2 = f # CH
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
xor g, y2 # y2 = f^g # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
add y3, old_h # h = t1 + S0 + MAJ # --
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
rorx $22, a, y1 # y1 = a >> 22 # S0A
mov a, y3 # y3 = a # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a, T1 # T1 = (a >> 2) # S0
offset = 4*2 + \disp
addl offset(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and b, y3 # y3 = (a|c)&b # MAJA
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
ROTATE_ARGS
################################### RND N + 3 ###########################
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
mov f, y2 # y2 = f # CH
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
xor g, y2 # y2 = f^g # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
add y3, old_h # h = t1 + S0 + MAJ # --
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
rorx $22, a, y1 # y1 = a >> 22 # S0A
mov a, y3 # y3 = a # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a, T1 # T1 = (a >> 2) # S0
offset = 4*3 + \disp
addl offset(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and b, y3 # y3 = (a|c)&b # MAJA
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
add y3, h # h = t1 + S0 + MAJ # --
ROTATE_ARGS
.endm
########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
##
## Updates the eight 32-bit digest words at the start of state with the
## given number of consecutive 64-byte blocks. Blocks are consumed two
## at a time (one per 128-bit lane); a trailing odd block is handled by
## the single-block path. A block count of zero returns immediately.
##
## NOTE(review): NUM_BLKS is shifted as a full 64-bit register although
## the prototype above declares an int; confirm that callers pass a
## properly extended value.
## NOTE(review): K256 is addressed with an absolute displacement (no
## rip base), which presumably restricts this file to images that are
## not position independent; confirm against the link model.
########################################################################
.text
## Request the alignment ahead of the label so that the entry point
## itself is aligned and no padding is executed on every call.
.align 32
sha256_transform_rorx:
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
## Build a 32-byte aligned frame; the caller stack pointer is kept in
## the _RSP slot and reloaded at .Ldone_hash.
mov %rsp, %rax
subq $STACK_SIZE, %rsp
and $-32, %rsp # align rsp to 32 byte boundary
mov %rax, _RSP(%rsp)
shl $6, NUM_BLKS # convert to bytes
jz .Ldone_hash
lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
mov NUM_BLKS, _INP_END(%rsp)
cmp NUM_BLKS, INP
je .Lonly_one_block
## load initial digest
mov (CTX), a
mov 4*1(CTX), b
mov 4*2(CTX), c
mov 4*3(CTX), d
mov 4*4(CTX), e
mov 4*5(CTX), f
mov 4*6(CTX), g
mov 4*7(CTX), h
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
vmovdqa _SHUF_00BA(%rip), SHUF_00BA
vmovdqa _SHUF_DC00(%rip), SHUF_DC00
## CTX shares its register with SRND, so it is spilled here
mov CTX, _CTX(%rsp)
.Loop0:
## Load first 16 dwords from two blocks
VMOVDQ 0*32(INP),XTMP0
VMOVDQ 1*32(INP),XTMP1
VMOVDQ 2*32(INP),XTMP2
VMOVDQ 3*32(INP),XTMP3
## byte swap data
vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3
## transpose data into high/low halves
vperm2i128 $0x20, XTMP2, XTMP0, X0
vperm2i128 $0x31, XTMP2, XTMP0, X1
vperm2i128 $0x20, XTMP3, XTMP1, X2
vperm2i128 $0x31, XTMP3, XTMP1, X3
.Llast_block_enter:
## INP shares its register with y3, so the advanced pointer is spilled
add $64, INP
mov INP, _INP(%rsp)
## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
xor SRND, SRND
.align 16
.Loop1:
vpaddd K256+0*32(SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 0*32
vpaddd K256+1*32(SRND), X0, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 1*32
vpaddd K256+2*32(SRND), X0, XFER
vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 2*32
vpaddd K256+3*32(SRND), X0, XFER
vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 3*32
add $4*32, SRND
cmp $3*4*32, SRND
jb .Loop1
.Loop2:
## Do last 16 rounds with no scheduling
vpaddd K256+0*32(SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 0*32
vpaddd K256+1*32(SRND), X1, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 1*32
add $2*32, SRND
vmovdqa X2, X0
vmovdqa X3, X1
cmp $4*4*32, SRND
jb .Loop2
## Fold the first block into the digest
mov _CTX(%rsp), CTX
mov _INP(%rsp), INP
addm (4*0)(CTX),a
addm (4*1)(CTX),b
addm (4*2)(CTX),c
addm (4*3)(CTX),d
addm (4*4)(CTX),e
addm (4*5)(CTX),f
addm (4*6)(CTX),g
addm (4*7)(CTX),h
cmp _INP_END(%rsp), INP
ja .Ldone_hash
#### Do second block using previously scheduled results
## The +16 displacement selects the high lane of every stored row
xor SRND, SRND
.align 16
.Loop3:
DO_4ROUNDS _XFER + 0*32 + 16
DO_4ROUNDS _XFER + 1*32 + 16
add $2*32, SRND
cmp $4*4*32, SRND
jb .Loop3
## Fold the second block into the digest
mov _CTX(%rsp), CTX
mov _INP(%rsp), INP
add $64, INP
addm (4*0)(CTX),a
addm (4*1)(CTX),b
addm (4*2)(CTX),c
addm (4*3)(CTX),d
addm (4*4)(CTX),e
addm (4*5)(CTX),f
addm (4*6)(CTX),g
addm (4*7)(CTX),h
## Below the last block: another pair. Past it: finished.
## Exactly on it: fall through and hash that single block.
cmp _INP_END(%rsp), INP
jb .Loop0
ja .Ldone_hash
.Ldo_last_block:
VMOVDQ 0*16(INP),XWORD0
VMOVDQ 1*16(INP),XWORD1
VMOVDQ 2*16(INP),XWORD2
VMOVDQ 3*16(INP),XWORD3
vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
jmp .Llast_block_enter
.Lonly_one_block:
## load initial digest
mov (4*0)(CTX),a
mov (4*1)(CTX),b
mov (4*2)(CTX),c
mov (4*3)(CTX),d
mov (4*4)(CTX),e
mov (4*5)(CTX),f
mov (4*6)(CTX),g
mov (4*7)(CTX),h
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
vmovdqa _SHUF_00BA(%rip), SHUF_00BA
vmovdqa _SHUF_DC00(%rip), SHUF_DC00
mov CTX, _CTX(%rsp)
jmp .Ldo_last_block
.Ldone_hash:
## Clear the upper ymm halves before returning, as the SHA-1 routine
## does, so that SSE code in the caller is not slowed down by dirty
## upper vector state.
vzeroupper
mov _RSP(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
ret
.endfn sha256_transform_rorx,globl
// K256: the SHA-256 round constants, four per row, with every row
// emitted twice so that a 32-byte load yields the same four constants
// in both 128-bit lanes (one lane per block). 32 rows of 16 bytes give
// the 512-byte entry size declared for the section.
.section .rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
// Shuffle-control constants, each 32 bytes with identical 128-bit
// halves. sha256_transform_rorx loads all three with vmovdqa, which
// faults on an address that is not 32-byte aligned, so the alignment
// is requested explicitly for every constant instead of relying on
// whatever the .rodata.cst32 macro (defined outside this file) and the
// linker happen to provide.

// Reverses the byte order within every dword.
.rodata.cst32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
# shuffle xBxA -> 00BA
.rodata.cst32
.align 32
_SHUF_00BA:
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
# shuffle xDxC -> DC00
.rodata.cst32
.align 32
_SHUF_DC00:
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF

750
libc/nexgen32e/sha512.S Normal file
View file

@ -0,0 +1,750 @@
/////////////////////////////////////////////////////////////////////////
// Implement fast SHA-512 with AVX2 instructions. (x86_64)
//
// Copyright (C) 2013 Intel Corporation.
//
// Authors:
// James Guilford <james.guilford@intel.com>
// Kirk Yap <kirk.s.yap@intel.com>
// David Cote <david.m.cote@intel.com>
// Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses. You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or
// without modification, are permitted provided that the following
// conditions are met:
//
// - Redistributions of source code must retain the above
// copyright notice, this list of conditions and the following
// disclaimer.
//
// - Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials
// provided with the distribution.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
/////////////////////////////////////////////////////////////////////////
//
// This code is described in an Intel White-Paper:
// "Fast SHA-512 Implementations on Intel Architecture Processors"
//
// To find it, surf to http://www.intel.com/p/en_US/embedded
// and search for that title.
//
/////////////////////////////////////////////////////////////////////////
// This code schedules 1 block at a time, with 4 lanes per block
/////////////////////////////////////////////////////////////////////////
#include "libc/macros.internal.h"
.ident "\n\
AVX2 SHA-512 (BSD-2 License)\n\
Copyright 2013 Intel Corporation\n"
.include "libc/disclaimer.inc"
# Virtual Registers
# Assemble-time names for the registers used by the macros and by
# sha512_transform_rorx. Several names share one physical register;
# the sharing is called out on the affected lines.
# Y_0..Y_3: message schedule window, renamed by rotate_Ys
Y_0 = %ymm4
Y_1 = %ymm5
Y_2 = %ymm6
Y_3 = %ymm7
# YTMP0..YTMP4: vector scratch
YTMP0 = %ymm0
YTMP1 = %ymm1
YTMP2 = %ymm2
YTMP3 = %ymm3
YTMP4 = %ymm8
XFER = YTMP0 # same register as YTMP0
BYTE_FLIP_MASK = %ymm9
# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
CTX1 = %rdi
CTX2 = %r12
# 2nd arg
INP = %rsi
# 3rd arg
NUM_BLKS = %rdx
# a..h: SHA-512 working variables, renamed by RotateState
c = %rcx
d = %r8
e = %rdx # same register as NUM_BLKS
y3 = %rsi # same register as INP
TBL = %rdi # clobbers CTX1
a = %rax
b = %rbx
f = %r9
g = %r10
h = %r11
old_h = %r11 # initially the same register as h
T1 = %r12 # clobbers CTX2
y0 = %r13
y1 = %r14
y2 = %r15
# Local variables (stack frame)
# Sizes are in bytes. The frame_* symbols are offsets from %rsp after
# sha512_transform_rorx has allocated the frame and aligned %rsp to 32.
XFER_SIZE = 4*8 # four k+w qwords consumed by the round macros
SRND_SIZE = 1*8 # loop counter for .Loop1 / .Loop2
INP_SIZE = 1*8 # saved input pointer
INPEND_SIZE = 1*8 # pointer to the end of the input
CTX_SIZE = 1*8 # saved state pointer (1st argument)
RSPSAVE_SIZE = 1*8 # stack pointer of the caller
GPRSAVE_SIZE = 5*8 # %rbx, %r12, %r13, %r14, %r15
# frame_XFER is at offset 0 and therefore shares the 32-byte alignment
# of %rsp; it is written with vmovdqa.
frame_XFER = 0
frame_SRND = frame_XFER + XFER_SIZE
frame_INP = frame_SRND + SRND_SIZE
frame_INPEND = frame_INP + INP_SIZE
frame_CTX = frame_INPEND + INPEND_SIZE
frame_RSPSAVE = frame_CTX + CTX_SIZE
frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
frame_size = frame_GPRSAVE + GPRSAVE_SIZE
## assume buffers not aligned
## The load used by COPY_YMM_AND_BSWAP for message data is the unaligned
## 256-bit move, so the input pointer needs no particular alignment.
#define VMOVDQ vmovdqu
# addm [mem], reg
# Add reg to mem using reg-mem add and store
# p1: memory operand, p2: register.
# The sum is first formed in the register and then stored, so on exit
# both the register and the memory location hold mem + reg.
.macro addm p1 p2
add \p1, \p2
mov \p2, \p1
.endm
# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
# Load ymm with 32 bytes from mem (unaligned load) and permute its bytes
# with vpshufb.
# p1: destination ymm, p2: memory source, p3: ymm holding the shuffle mask.
# The element size that gets byte-swapped is decided by the mask; with the
# PSHUFFLE_BYTE_FLIP_MASK defined in this file each 64-bit qword is
# reversed (not each dword).
.macro COPY_YMM_AND_BSWAP p1 p2 p3
VMOVDQ \p2, \p1
vpshufb \p3, \p1, \p1
.endm
# rotate_Ys
# Rotate values of symbols Y0...Y3
# Pure assemble-time renaming, no instruction is emitted. Afterwards
# Y_0, Y_1, Y_2 name the registers previously called Y_1, Y_2, Y_3 and
# Y_3 names the register previously called Y_0. Four invocations restore
# the original naming.
.macro rotate_Ys
Y_ = Y_0
Y_0 = Y_1
Y_1 = Y_2
Y_2 = Y_3
Y_3 = Y_
.endm
# RotateState
# Assemble-time renaming of the working variables, no instruction is
# emitted. Afterwards b..h name the registers previously called a..g and
# a names the register previously called h. old_h keeps naming that same
# register (the former h, now a), which lets the next round finish
# additions into it. Eight invocations restore the original naming.
.macro RotateState
# Rotate symbols a..h right
old_h = h
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
# YDST = {YSRC1, YSRC2} >> RVAL*8
# YDST: destination ymm; YSRC1/YSRC2: high and low parts of the 512-bit
# concatenation; RVAL: shift amount in bytes, used as an immediate.
# vpalignr only shifts inside each 128-bit lane, so the lanes are first
# rearranged with vperm2f128 to obtain a shift across the full register.
.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI}
vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8
.endm
# FOUR_ROUNDS_AND_SCHED
# Runs four SHA-512 rounds on the working variables a..h and, in the
# vector unit, computes the next four message-schedule qwords.
# Inputs: the four k+w round inputs at frame_XFER(%rsp) (8 bytes each)
# and the schedule window held in Y_0..Y_3.
# Result: Y_0 is overwritten with the four new schedule qwords, then
# rotate_Ys renames the registers so that these values become Y_3.
# Scratch: YTMP0..YTMP4, y0..y3 and T1.
# Scalar and vector instructions are interleaved (presumably to overlap
# their latencies); keep the relative order when editing.
.macro FOUR_ROUNDS_AND_SCHED
################################### RND N + 0 #########################################
# Extract w[t-7]
MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
# Calculate w[t-16] + w[t-7]
vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
# Extract w[t-15]
MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
# Calculate sigma0
# Calculate w[t-15] ror 1
vpsrlq $1, YTMP1, YTMP2
vpsllq $(64-1), YTMP1, YTMP3
vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
# Calculate w[t-15] shr 7
vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
mov a, y3 # y3 = a # MAJA
rorx $41, e, y0 # y0 = e >> 41 # S1A
rorx $18, e, y1 # y1 = e >> 18 # S1B
add frame_XFER(%rsp),h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
mov f, y2 # y2 = f # CH
rorx $34, a, T1 # T1 = a >> 34 # S0B
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
xor g, y2 # y2 = f^g # CH
rorx $14, e, y1 # y1 = (e >> 14) # S1
and e, y2 # y2 = (f^g)&e # CH
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
rorx $39, a, y1 # y1 = a >> 39 # S0A
add h, d # d = k + w + h + d # --
and b, y3 # y3 = (a|c)&b # MAJA
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
rorx $28, a, T1 # T1 = (a >> 28) # S0
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
add y3, h # h = t1 + S0 + MAJ # --
RotateState
################################### RND N + 1 #########################################
# Calculate w[t-15] ror 8
vpsrlq $8, YTMP1, YTMP2
vpsllq $(64-8), YTMP1, YTMP1
vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
# XOR the three components
vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0
# Add three components, w[t-16], w[t-7] and sigma0
vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
# Move to appropriate lanes for calculating w[16] and w[17]
vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
# Move to appropriate lanes for calculating w[18] and w[19]
vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
# Calculate w[16] and w[17] in both 128 bit lanes
# Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
mov a, y3 # y3 = a # MAJA
rorx $41, e, y0 # y0 = e >> 41 # S1A
rorx $18, e, y1 # y1 = e >> 18 # S1B
add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
mov f, y2 # y2 = f # CH
rorx $34, a, T1 # T1 = a >> 34 # S0B
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
xor g, y2 # y2 = f^g # CH
rorx $14, e, y1 # y1 = (e >> 14) # S1
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
rorx $39, a, y1 # y1 = a >> 39 # S0A
and e, y2 # y2 = (f^g)&e # CH
add h, d # d = k + w + h + d # --
and b, y3 # y3 = (a|c)&b # MAJA
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
rorx $28, a, T1 # T1 = (a >> 28) # S0
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
add y3, h # h = t1 + S0 + MAJ # --
RotateState
################################### RND N + 2 #########################################
vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA}
vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA}
vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
# (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
# Add sigma1 to the other components to get w[16] and w[17]
vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]}
# Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
mov a, y3 # y3 = a # MAJA
rorx $41, e, y0 # y0 = e >> 41 # S1A
add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
rorx $18, e, y1 # y1 = e >> 18 # S1B
or c, y3 # y3 = a|c # MAJA
mov f, y2 # y2 = f # CH
xor g, y2 # y2 = f^g # CH
rorx $34, a, T1 # T1 = a >> 34 # S0B
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
and e, y2 # y2 = (f^g)&e # CH
rorx $14, e, y1 # y1 = (e >> 14) # S1
add h, d # d = k + w + h + d # --
and b, y3 # y3 = (a|c)&b # MAJA
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
rorx $39, a, y1 # y1 = a >> 39 # S0A
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
rorx $28, a, T1 # T1 = (a >> 28) # S0
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
add y3, h # h = t1 + S0 + MAJ # --
RotateState
################################### RND N + 3 #########################################
vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
# (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
# Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
# to newly calculated sigma1 to get w[18] and w[19]
vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
# Form w[19], w[18], w[17], w[16]
vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
mov a, y3 # y3 = a # MAJA
rorx $41, e, y0 # y0 = e >> 41 # S1A
rorx $18, e, y1 # y1 = e >> 18 # S1B
add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
mov f, y2 # y2 = f # CH
rorx $34, a, T1 # T1 = a >> 34 # S0B
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
xor g, y2 # y2 = f^g # CH
rorx $14, e, y1 # y1 = (e >> 14) # S1
and e, y2 # y2 = (f^g)&e # CH
add h, d # d = k + w + h + d # --
and b, y3 # y3 = (a|c)&b # MAJA
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
rorx $39, a, y1 # y1 = a >> 39 # S0A
add y0, y2 # y2 = S1 + CH # --
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
rorx $28, a, T1 # T1 = (a >> 28) # S0
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
add y3, h # h = t1 + S0 + MAJ # --
RotateState
rotate_Ys
.endm
# DO_4ROUNDS
# Runs four SHA-512 rounds without message scheduling, using the four
# k+w round inputs stored at frame_XFER(%rsp) (8 bytes each).
# Scratch: y0..y3 and T1.
# For rounds N+0..N+2 the last two additions into h (S1 + CH and MAJ)
# are postponed: they are issued during the following round through
# old_h, the name RotateState leaves on the register that held h.
# Round N+3 completes its own h before the final RotateState.
.macro DO_4ROUNDS
################################### RND N + 0 #########################################
mov f, y2 # y2 = f # CH
rorx $41, e, y0 # y0 = e >> 41 # S1A
rorx $18, e, y1 # y1 = e >> 18 # S1B
xor g, y2 # y2 = f^g # CH
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
rorx $14, e, y1 # y1 = (e >> 14) # S1
and e, y2 # y2 = (f^g)&e # CH
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
rorx $34, a, T1 # T1 = a >> 34 # S0B
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
rorx $39, a, y1 # y1 = a >> 39 # S0A
mov a, y3 # y3 = a # MAJA
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
rorx $28, a, T1 # T1 = (a >> 28) # S0
add frame_XFER(%rsp), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
mov a, T1 # T1 = a # MAJB
and b, y3 # y3 = (a|c)&b # MAJA
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
RotateState
################################### RND N + 1 #########################################
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
mov f, y2 # y2 = f # CH
rorx $41, e, y0 # y0 = e >> 41 # S1A
rorx $18, e, y1 # y1 = e >> 18 # S1B
xor g, y2 # y2 = f^g # CH
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
rorx $14, e, y1 # y1 = (e >> 14) # S1
and e, y2 # y2 = (f^g)&e # CH
add y3, old_h # h = t1 + S0 + MAJ # --
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
rorx $34, a, T1 # T1 = a >> 34 # S0B
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
rorx $39, a, y1 # y1 = a >> 39 # S0A
mov a, y3 # y3 = a # MAJA
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
rorx $28, a, T1 # T1 = (a >> 28) # S0
add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
mov a, T1 # T1 = a # MAJB
and b, y3 # y3 = (a|c)&b # MAJA
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
RotateState
################################### RND N + 2 #########################################
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
mov f, y2 # y2 = f # CH
rorx $41, e, y0 # y0 = e >> 41 # S1A
rorx $18, e, y1 # y1 = e >> 18 # S1B
xor g, y2 # y2 = f^g # CH
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
rorx $14, e, y1 # y1 = (e >> 14) # S1
and e, y2 # y2 = (f^g)&e # CH
add y3, old_h # h = t1 + S0 + MAJ # --
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
rorx $34, a, T1 # T1 = a >> 34 # S0B
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
rorx $39, a, y1 # y1 = a >> 39 # S0A
mov a, y3 # y3 = a # MAJA
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
rorx $28, a, T1 # T1 = (a >> 28) # S0
add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
mov a, T1 # T1 = a # MAJB
and b, y3 # y3 = (a|c)&b # MAJA
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
RotateState
################################### RND N + 3 #########################################
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
mov f, y2 # y2 = f # CH
rorx $41, e, y0 # y0 = e >> 41 # S1A
rorx $18, e, y1 # y1 = e >> 18 # S1B
xor g, y2 # y2 = f^g # CH
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
rorx $14, e, y1 # y1 = (e >> 14) # S1
and e, y2 # y2 = (f^g)&e # CH
add y3, old_h # h = t1 + S0 + MAJ # --
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
rorx $34, a, T1 # T1 = a >> 34 # S0B
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
rorx $39, a, y1 # y1 = a >> 39 # S0A
mov a, y3 # y3 = a # MAJA
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
rorx $28, a, T1 # T1 = (a >> 28) # S0
add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
mov a, T1 # T1 = a # MAJB
and b, y3 # y3 = (a|c)&b # MAJA
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
add y3, h # h = t1 + S0 + MAJ # --
RotateState
.endm
########################################################################
# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks)
# Purpose: Updates the SHA512 digest stored at "state" with the message
# stored in "data".
# The size of the message pointed to by "data" must be an integer multiple
# of SHA512 message blocks.
# "blocks" is the message length in SHA512 blocks
# Arguments arrive in %rdi (state), %rsi (data) and %rdx (blocks).
# %rbx and %r12-%r15 are spilled to the stack frame and restored on
# exit; the upper ymm state is cleared with vzeroupper before returning.
# Nothing is returned; the eight state qwords at "state" are updated
# in place once per block.
# NOTE(review): the block count is shifted as a 64-bit value in %rdx
# although the prototype says int, so this appears to rely on the upper
# 32 bits of %rdx being zero - confirm against the callers.
########################################################################
sha512_transform_rorx:
# Allocate Stack Space
# The frame is aligned to 32 bytes because frame_XFER is written with
# vmovdqa; the original %rsp is kept in the frame for the epilogue.
mov %rsp, %rax
sub $frame_size, %rsp
and $~(0x20 - 1), %rsp
mov %rax, frame_RSPSAVE(%rsp)
# Save GPRs
mov %rbx, 8*0+frame_GPRSAVE(%rsp)
mov %r12, 8*1+frame_GPRSAVE(%rsp)
mov %r13, 8*2+frame_GPRSAVE(%rsp)
mov %r14, 8*3+frame_GPRSAVE(%rsp)
mov %r15, 8*4+frame_GPRSAVE(%rsp)
shl $7, NUM_BLKS # convert to bytes
jz .Ldone_hash
add INP, NUM_BLKS # pointer to end of data
mov NUM_BLKS, frame_INPEND(%rsp)
## load initial digest
mov 8*0(CTX1), a
mov 8*1(CTX1), b
mov 8*2(CTX1), c
mov 8*3(CTX1), d
mov 8*4(CTX1), e
mov 8*5(CTX1), f
mov 8*6(CTX1), g
mov 8*7(CTX1), h
# save %rdi (CTX) before it gets clobbered
mov %rdi, frame_CTX(%rsp)
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
.Loop0:
# One iteration per 128-byte message block
lea K512(%rip), TBL
## load the 16 message qwords of the block and byte swap each of them
COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK
COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK
COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK
COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK
# INP shares %rsi with y3, so it has to be parked in the frame
mov INP, frame_INP(%rsp)
## rounds 0-63 with message scheduling: 4 iterations of 16 rounds each
movq $4, frame_SRND(%rsp)
.align 16
.Loop1:
vpaddq (TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
FOUR_ROUNDS_AND_SCHED
vpaddq 1*32(TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
FOUR_ROUNDS_AND_SCHED
vpaddq 2*32(TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
FOUR_ROUNDS_AND_SCHED
vpaddq 3*32(TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
add $(4*32), TBL
FOUR_ROUNDS_AND_SCHED
subq $1, frame_SRND(%rsp)
jne .Loop1
## rounds 64-79 without scheduling: 2 iterations of 8 rounds each
movq $2, frame_SRND(%rsp)
.Loop2:
vpaddq (TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
DO_4ROUNDS
vpaddq 1*32(TBL), Y_1, XFER
vmovdqa XFER, frame_XFER(%rsp)
add $(2*32), TBL
DO_4ROUNDS
vmovdqa Y_2, Y_0
vmovdqa Y_3, Y_1
subq $1, frame_SRND(%rsp)
jne .Loop2
# Add the working variables into the saved state; addm leaves the new
# state value in the register as well, ready for the next block
mov frame_CTX(%rsp), CTX2
addm 8*0(CTX2), a
addm 8*1(CTX2), b
addm 8*2(CTX2), c
addm 8*3(CTX2), d
addm 8*4(CTX2), e
addm 8*5(CTX2), f
addm 8*6(CTX2), g
addm 8*7(CTX2), h
mov frame_INP(%rsp), INP
add $128, INP
cmp frame_INPEND(%rsp), INP
jne .Loop0
.Ldone_hash:
# Restore GPRs
mov 8*0+frame_GPRSAVE(%rsp), %rbx
mov 8*1+frame_GPRSAVE(%rsp), %r12
mov 8*2+frame_GPRSAVE(%rsp), %r13
mov 8*3+frame_GPRSAVE(%rsp), %r14
mov 8*4+frame_GPRSAVE(%rsp), %r15
# Restore Stack Pointer
mov frame_RSPSAVE(%rsp), %rsp
# Clear the upper halves of the ymm registers so that SSE code executed
# by the caller does not pay the AVX-to-SSE transition penalty
vzeroupper
ret
.endfn sha512_transform_rorx,globl
########################################################################
### Binary Data
# Mergeable 640-byte rodata section. This allows linker to merge the table
# with other, exactly the same 640-byte fragment of another rodata section
# (if such section exists).
.section .rodata.cst640.K512, "aM", @progbits, 640
.align 64
# K[t] used in SHA512 hashing
# 80 round constants of 8 bytes each (640 bytes), in round order.
# sha512_transform_rorx reads them 32 bytes (four rounds) at a time
# through TBL with vpaddq.
K512:
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.rodata.cst32
# Mask for byte-swapping the four qwords of a YMM register using vpshufb:
# the byte order inside every 64-bit element is reversed.
# sha512_transform_rorx loads it into BYTE_FLIP_MASK with vmovdqa.
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x08090a0b0c0d0e0f0001020304050607
.octa 0x18191a1b1c1d1e1f1011121314151617
.rodata.cst32
# 256-bit AND mask: the low 128 bits are zero and the high 128 bits are
# all ones. FOUR_ROUNDS_AND_SCHED applies it with vpand to keep only the
# upper two qwords of a vector (annotated {DC00} there), i.e. the low
# half is the part that gets cleared.
MASK_YMM_LO:
.octa 0x00000000000000000000000000000000
.octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF

View file

@ -24,4 +24,4 @@ if CLANG=$(command -v clang); then
o/$MODE/test/libc/release/smokeclang.com.dbg || exit o/$MODE/test/libc/release/smokeclang.com.dbg || exit
fi fi
touch o/$MODE/test/libc/release/clang.ok touch o/$MODE/test/libc/release/lld.ok

View file

@ -119,9 +119,9 @@
#define MBEDTLS_MD5_SMALLER #define MBEDTLS_MD5_SMALLER
#define MBEDTLS_SHA1_SMALLER #define MBEDTLS_SHA1_SMALLER
#ifdef TINY
#define MBEDTLS_SHA256_SMALLER #define MBEDTLS_SHA256_SMALLER
#define MBEDTLS_SHA512_SMALLER #define MBEDTLS_SHA512_SMALLER
#ifdef TINY
#define MBEDTLS_AES_ROM_TABLES #define MBEDTLS_AES_ROM_TABLES
#define MBEDTLS_AES_FEWER_TABLES #define MBEDTLS_AES_FEWER_TABLES
#else #else

View file

@ -96,8 +96,7 @@ static int gcm_gen_table( mbedtls_gcm_context *ctx )
#if defined(MBEDTLS_AESNI_C) && defined(MBEDTLS_HAVE_X86_64) #if defined(MBEDTLS_AESNI_C) && defined(MBEDTLS_HAVE_X86_64)
/* With CLMUL support, we need only h, not the rest of the table */ /* With CLMUL support, we need only h, not the rest of the table */
if( X86_HAVE( PCLMUL ) ) if (X86_HAVE(AES) && X86_HAVE(PCLMUL)) return 0;
return( 0 );
#endif #endif
/* 0 corresponds to 0 in GF(2^128) */ /* 0 corresponds to 0 in GF(2^128) */
@ -191,7 +190,7 @@ static void gcm_mult( mbedtls_gcm_context *ctx, const unsigned char x[16],
uint64_t zh, zl; uint64_t zh, zl;
#if defined(MBEDTLS_AESNI_C) && defined(MBEDTLS_HAVE_X86_64) #if defined(MBEDTLS_AESNI_C) && defined(MBEDTLS_HAVE_X86_64)
if( X86_HAVE( PCLMUL ) ) { if (X86_HAVE(AES) && X86_HAVE(PCLMUL)) {
unsigned char h[16]; unsigned char h[16];
PUT_UINT32_BE( ctx->HH[8] >> 32, h, 0 ); PUT_UINT32_BE( ctx->HH[8] >> 32, h, 0 );
@ -240,11 +239,11 @@ static void gcm_mult( mbedtls_gcm_context *ctx, const unsigned char x[16],
} }
int mbedtls_gcm_starts( mbedtls_gcm_context *ctx, int mbedtls_gcm_starts( mbedtls_gcm_context *ctx,
int mode, int mode,
const unsigned char *iv, const unsigned char *iv,
size_t iv_len, size_t iv_len,
const unsigned char *add, const unsigned char *add,
size_t add_len ) size_t add_len )
{ {
int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED; int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
unsigned char work_buf[16]; unsigned char work_buf[16];
@ -327,9 +326,9 @@ int mbedtls_gcm_starts( mbedtls_gcm_context *ctx,
} }
int mbedtls_gcm_update( mbedtls_gcm_context *ctx, int mbedtls_gcm_update( mbedtls_gcm_context *ctx,
size_t length, size_t length,
const unsigned char *input, const unsigned char *input,
unsigned char *output ) unsigned char *output )
{ {
int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED; int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
unsigned char ectr[16]; unsigned char ectr[16];
@ -390,8 +389,8 @@ int mbedtls_gcm_update( mbedtls_gcm_context *ctx,
} }
int mbedtls_gcm_finish( mbedtls_gcm_context *ctx, int mbedtls_gcm_finish( mbedtls_gcm_context *ctx,
unsigned char *tag, unsigned char *tag,
size_t tag_len ) size_t tag_len )
{ {
unsigned char work_buf[16]; unsigned char work_buf[16];
size_t i; size_t i;
@ -431,16 +430,16 @@ int mbedtls_gcm_finish( mbedtls_gcm_context *ctx,
} }
int mbedtls_gcm_crypt_and_tag( mbedtls_gcm_context *ctx, int mbedtls_gcm_crypt_and_tag( mbedtls_gcm_context *ctx,
int mode, int mode,
size_t length, size_t length,
const unsigned char *iv, const unsigned char *iv,
size_t iv_len, size_t iv_len,
const unsigned char *add, const unsigned char *add,
size_t add_len, size_t add_len,
const unsigned char *input, const unsigned char *input,
unsigned char *output, unsigned char *output,
size_t tag_len, size_t tag_len,
unsigned char *tag ) unsigned char *tag )
{ {
int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED; int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
@ -464,15 +463,15 @@ int mbedtls_gcm_crypt_and_tag( mbedtls_gcm_context *ctx,
} }
int mbedtls_gcm_auth_decrypt( mbedtls_gcm_context *ctx, int mbedtls_gcm_auth_decrypt( mbedtls_gcm_context *ctx,
size_t length, size_t length,
const unsigned char *iv, const unsigned char *iv,
size_t iv_len, size_t iv_len,
const unsigned char *add, const unsigned char *add,
size_t add_len, size_t add_len,
const unsigned char *tag, const unsigned char *tag,
size_t tag_len, size_t tag_len,
const unsigned char *input, const unsigned char *input,
unsigned char *output ) unsigned char *output )
{ {
int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED; int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
unsigned char check_tag[16]; unsigned char check_tag[16];

View file

@ -1,4 +1,6 @@
#include "libc/bits/bits.h" #include "libc/bits/bits.h"
#include "libc/macros.internal.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h" #include "libc/str/str.h"
#include "third_party/mbedtls/common.h" #include "third_party/mbedtls/common.h"
#include "third_party/mbedtls/endian.h" #include "third_party/mbedtls/endian.h"
@ -37,6 +39,8 @@ asm(".include \"libc/disclaimer.inc\"");
* http://www.itl.nist.gov/fipspubs/fip180-1.htm * http://www.itl.nist.gov/fipspubs/fip180-1.htm
*/ */
void sha1_transform_avx2(mbedtls_sha1_context *, const uint8_t *, int);
#define SHA1_VALIDATE_RET(cond) \ #define SHA1_VALIDATE_RET(cond) \
MBEDTLS_INTERNAL_VALIDATE_RET( cond, MBEDTLS_ERR_SHA1_BAD_INPUT_DATA ) MBEDTLS_INTERNAL_VALIDATE_RET( cond, MBEDTLS_ERR_SHA1_BAD_INPUT_DATA )
@ -145,6 +149,11 @@ int mbedtls_internal_sha1_process( mbedtls_sha1_context *ctx,
SHA1_VALIDATE_RET( ctx != NULL ); SHA1_VALIDATE_RET( ctx != NULL );
SHA1_VALIDATE_RET( (const unsigned char *)data != NULL ); SHA1_VALIDATE_RET( (const unsigned char *)data != NULL );
if (!IsTiny() && X86_HAVE(AVX2) && X86_HAVE(BMI) && X86_HAVE(BMI2)) {
sha1_transform_avx2(ctx, data, 1);
return 0;
}
#ifdef MBEDTLS_SHA1_SMALLER #ifdef MBEDTLS_SHA1_SMALLER
#define ROL(a, b) ((a << b) | (a >> (32 - b))) #define ROL(a, b) ((a << b) | (a >> (32 - b)))
@ -387,8 +396,8 @@ int mbedtls_sha1_update_ret( mbedtls_sha1_context *ctx,
size_t ilen ) size_t ilen )
{ {
int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED; int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
size_t fill;
uint32_t left; uint32_t left;
size_t n, fill;
SHA1_VALIDATE_RET( ctx != NULL ); SHA1_VALIDATE_RET( ctx != NULL );
SHA1_VALIDATE_RET( ilen == 0 || input != NULL ); SHA1_VALIDATE_RET( ilen == 0 || input != NULL );
@ -417,6 +426,12 @@ int mbedtls_sha1_update_ret( mbedtls_sha1_context *ctx,
left = 0; left = 0;
} }
if (!IsTiny() && ilen >= 64 && X86_HAVE(AVX2) && X86_HAVE(BMI) && X86_HAVE(BMI2)) {
sha1_transform_avx2(ctx, input, ilen / 64);
input += ROUNDDOWN(ilen, 64);
ilen -= ROUNDDOWN(ilen, 64);
}
while( ilen >= 64 ) while( ilen >= 64 )
{ {
if( ( ret = mbedtls_internal_sha1_process( ctx, input ) ) != 0 ) if( ( ret = mbedtls_internal_sha1_process( ctx, input ) ) != 0 )

View file

@ -18,8 +18,8 @@ COSMOPOLITAN_C_START_
*/ */
typedef struct mbedtls_sha1_context typedef struct mbedtls_sha1_context
{ {
uint32_t total[2]; /*!< The number of Bytes processed. */
uint32_t state[5]; /*!< The intermediate digest state. */ uint32_t state[5]; /*!< The intermediate digest state. */
uint32_t total[2]; /*!< The number of Bytes processed. */
uint8_t buffer[64]; /*!< The data block being processed. */ uint8_t buffer[64]; /*!< The data block being processed. */
} }
mbedtls_sha1_context; mbedtls_sha1_context;

View file

@ -1,3 +1,6 @@
#include "libc/dce.h"
#include "libc/macros.internal.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h" #include "libc/str/str.h"
#include "third_party/mbedtls/common.h" #include "third_party/mbedtls/common.h"
#include "third_party/mbedtls/endian.h" #include "third_party/mbedtls/endian.h"
@ -40,6 +43,8 @@ asm(".include \"libc/disclaimer.inc\"");
MBEDTLS_INTERNAL_VALIDATE_RET( cond, MBEDTLS_ERR_SHA256_BAD_INPUT_DATA ) MBEDTLS_INTERNAL_VALIDATE_RET( cond, MBEDTLS_ERR_SHA256_BAD_INPUT_DATA )
#define SHA256_VALIDATE(cond) MBEDTLS_INTERNAL_VALIDATE( cond ) #define SHA256_VALIDATE(cond) MBEDTLS_INTERNAL_VALIDATE( cond )
void sha256_transform_rorx(mbedtls_sha256_context *, const uint8_t *, int);
#if !defined(MBEDTLS_SHA256_ALT) #if !defined(MBEDTLS_SHA256_ALT)
void mbedtls_sha256_init( mbedtls_sha256_context *ctx ) void mbedtls_sha256_init( mbedtls_sha256_context *ctx )
@ -151,7 +156,7 @@ static const uint32_t K[] =
} while( 0 ) } while( 0 )
int mbedtls_internal_sha256_process( mbedtls_sha256_context *ctx, int mbedtls_internal_sha256_process( mbedtls_sha256_context *ctx,
const unsigned char data[64] ) const unsigned char data[64] )
{ {
struct struct
{ {
@ -164,20 +169,22 @@ int mbedtls_internal_sha256_process( mbedtls_sha256_context *ctx,
SHA256_VALIDATE_RET( ctx != NULL ); SHA256_VALIDATE_RET( ctx != NULL );
SHA256_VALIDATE_RET( (const unsigned char *)data != NULL ); SHA256_VALIDATE_RET( (const unsigned char *)data != NULL );
if (!IsTiny() && X86_HAVE(AVX2) && X86_HAVE(BMI2)) {
sha256_transform_rorx(ctx, data, 1);
return 0;
}
for( i = 0; i < 8; i++ ) for( i = 0; i < 8; i++ )
local.A[i] = ctx->state[i]; local.A[i] = ctx->state[i];
#if defined(MBEDTLS_SHA256_SMALLER) #if defined(MBEDTLS_SHA256_SMALLER)
for( i = 0; i < 64; i++ ) for( i = 0; i < 64; i++ ) {
{
if( i < 16 ) if( i < 16 )
GET_UINT32_BE( local.W[i], data, 4 * i ); GET_UINT32_BE( local.W[i], data, 4 * i );
else else
R( i ); R( i );
P( local.A[0], local.A[1], local.A[2], local.A[3], local.A[4], P( local.A[0], local.A[1], local.A[2], local.A[3], local.A[4],
local.A[5], local.A[6], local.A[7], local.W[i], K[i] ); local.A[5], local.A[6], local.A[7], local.W[i], K[i] );
local.temp1 = local.A[7]; local.A[7] = local.A[6]; local.temp1 = local.A[7]; local.A[7] = local.A[6];
local.A[6] = local.A[5]; local.A[5] = local.A[4]; local.A[6] = local.A[5]; local.A[5] = local.A[4];
local.A[4] = local.A[3]; local.A[3] = local.A[2]; local.A[4] = local.A[3]; local.A[3] = local.A[2];
@ -187,9 +194,7 @@ int mbedtls_internal_sha256_process( mbedtls_sha256_context *ctx,
#else /* MBEDTLS_SHA256_SMALLER */ #else /* MBEDTLS_SHA256_SMALLER */
for( i = 0; i < 16; i++ ) for( i = 0; i < 16; i++ )
GET_UINT32_BE( local.W[i], data, 4 * i ); GET_UINT32_BE( local.W[i], data, 4 * i );
for( i = 0; i < 16; i += 8 ) {
for( i = 0; i < 16; i += 8 )
{
P( local.A[0], local.A[1], local.A[2], local.A[3], local.A[4], P( local.A[0], local.A[1], local.A[2], local.A[3], local.A[4],
local.A[5], local.A[6], local.A[7], local.W[i+0], K[i+0] ); local.A[5], local.A[6], local.A[7], local.W[i+0], K[i+0] );
P( local.A[7], local.A[0], local.A[1], local.A[2], local.A[3], P( local.A[7], local.A[0], local.A[1], local.A[2], local.A[3],
@ -207,9 +212,7 @@ int mbedtls_internal_sha256_process( mbedtls_sha256_context *ctx,
P( local.A[1], local.A[2], local.A[3], local.A[4], local.A[5], P( local.A[1], local.A[2], local.A[3], local.A[4], local.A[5],
local.A[6], local.A[7], local.A[0], local.W[i+7], K[i+7] ); local.A[6], local.A[7], local.A[0], local.W[i+7], K[i+7] );
} }
for( i = 16; i < 64; i += 8 ) {
for( i = 16; i < 64; i += 8 )
{
P( local.A[0], local.A[1], local.A[2], local.A[3], local.A[4], P( local.A[0], local.A[1], local.A[2], local.A[3], local.A[4],
local.A[5], local.A[6], local.A[7], R(i+0), K[i+0] ); local.A[5], local.A[6], local.A[7], R(i+0), K[i+0] );
P( local.A[7], local.A[0], local.A[1], local.A[2], local.A[3], P( local.A[7], local.A[0], local.A[1], local.A[2], local.A[3],
@ -278,6 +281,12 @@ int mbedtls_sha256_update_ret( mbedtls_sha256_context *ctx,
left = 0; left = 0;
} }
if (!IsTiny() && ilen >= 64 && X86_HAVE(AVX2) && X86_HAVE(BMI2)) {
sha256_transform_rorx(ctx, input, ilen / 64);
input += ROUNDDOWN(ilen, 64);
ilen -= ROUNDDOWN(ilen, 64);
}
while( ilen >= 64 ) while( ilen >= 64 )
{ {
if( ( ret = mbedtls_internal_sha256_process( ctx, input ) ) != 0 ) if( ( ret = mbedtls_internal_sha256_process( ctx, input ) ) != 0 )

View file

@ -16,8 +16,8 @@ COSMOPOLITAN_C_START_
*/ */
typedef struct mbedtls_sha256_context typedef struct mbedtls_sha256_context
{ {
uint32_t total[2]; /*!< The number of Bytes processed. */
uint32_t state[8]; /*!< The intermediate digest state. */ uint32_t state[8]; /*!< The intermediate digest state. */
uint32_t total[2]; /*!< The number of Bytes processed. */
unsigned char buffer[64]; /*!< The data block being processed. */ unsigned char buffer[64]; /*!< The data block being processed. */
int is224; /*!< Determines which function to use: int is224; /*!< Determines which function to use:
0: Use SHA-256, or 1: Use SHA-224. */ 0: Use SHA-256, or 1: Use SHA-224. */

View file

@ -1,4 +1,6 @@
#include "libc/literal.h" #include "libc/literal.h"
#include "libc/macros.internal.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h" #include "libc/str/str.h"
#include "third_party/mbedtls/common.h" #include "third_party/mbedtls/common.h"
#include "third_party/mbedtls/endian.h" #include "third_party/mbedtls/endian.h"
@ -37,6 +39,8 @@ asm(".include \"libc/disclaimer.inc\"");
* http://csrc.nist.gov/publications/fips/fips180-2/fips180-2.pdf * http://csrc.nist.gov/publications/fips/fips180-2/fips180-2.pdf
*/ */
/*
 * Accelerated SHA-512 block transform; only the prototype is visible
 * here, the implementation lives elsewhere.
 *
 * As used by the callers in this file:
 *   - 1st argument: the SHA-512 context being updated.
 *   - 2nd argument: pointer to the input bytes to consume.
 *   - 3rd argument: number of 128-byte blocks to process (callers pass
 *     either 1 or ilen / 128, then advance the input by that many
 *     whole blocks themselves).
 *
 * Callers only invoke this when !IsTiny() and X86_HAVE(AVX2) hold.
 * NOTE(review): the "rorx" in the name suggests the routine presumably
 * depends on the BMI2 RORX instruction -- confirm that the feature
 * checks at the call sites are sufficient.
 * NOTE(review): the block count is an int while callers derive it from
 * a byte length; confirm very large inputs cannot overflow it.
 */
void sha512_transform_rorx(mbedtls_sha512_context *, const uint8_t *, int);
#if defined(MBEDTLS_SHA512_C) #if defined(MBEDTLS_SHA512_C)
#define SHA512_VALIDATE_RET(cond) \ #define SHA512_VALIDATE_RET(cond) \
@ -224,12 +228,16 @@ int mbedtls_internal_sha512_process( mbedtls_sha512_context *ctx,
SHA512_VALIDATE_RET( ctx != NULL ); SHA512_VALIDATE_RET( ctx != NULL );
SHA512_VALIDATE_RET( (const unsigned char *)data != NULL ); SHA512_VALIDATE_RET( (const unsigned char *)data != NULL );
if (!IsTiny() && X86_HAVE(AVX2)) {
sha512_transform_rorx(ctx, data, 1);
return 0;
}
#define SHR(x,n) ((x) >> (n)) #define SHR(x,n) ((x) >> (n))
#define ROTR(x,n) (SHR((x),(n)) | ((x) << (64 - (n)))) #define ROTR(x,n) (SHR((x),(n)) | ((x) << (64 - (n))))
#define S0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x, 7)) #define S0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x, 7))
#define S1(x) (ROTR(x,19) ^ ROTR(x,61) ^ SHR(x, 6)) #define S1(x) (ROTR(x,19) ^ ROTR(x,61) ^ SHR(x, 6))
#define S2(x) (ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39)) #define S2(x) (ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39))
#define S3(x) (ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41)) #define S3(x) (ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41))
@ -263,10 +271,14 @@ int mbedtls_internal_sha512_process( mbedtls_sha512_context *ctx,
P( local.A[0], local.A[1], local.A[2], local.A[3], local.A[4], P( local.A[0], local.A[1], local.A[2], local.A[3], local.A[4],
local.A[5], local.A[6], local.A[7], local.W[i], K[i] ); local.A[5], local.A[6], local.A[7], local.W[i], K[i] );
local.temp1 = local.A[7]; local.A[7] = local.A[6]; local.temp1 = local.A[7];
local.A[6] = local.A[5]; local.A[5] = local.A[4]; local.A[7] = local.A[6];
local.A[4] = local.A[3]; local.A[3] = local.A[2]; local.A[6] = local.A[5];
local.A[2] = local.A[1]; local.A[1] = local.A[0]; local.A[5] = local.A[4];
local.A[4] = local.A[3];
local.A[3] = local.A[2];
local.A[2] = local.A[1];
local.A[1] = local.A[0];
local.A[0] = local.temp1; local.A[0] = local.temp1;
} }
#else /* MBEDTLS_SHA512_SMALLER */ #else /* MBEDTLS_SHA512_SMALLER */
@ -362,6 +374,12 @@ int mbedtls_sha512_update_ret( mbedtls_sha512_context *ctx,
left = 0; left = 0;
} }
if (!IsTiny() && ilen >= 128 && X86_HAVE(AVX2)) {
sha512_transform_rorx(ctx, input, ilen / 128);
input += ROUNDDOWN(ilen, 128);
ilen -= ROUNDDOWN(ilen, 128);
}
while( ilen >= 128 ) while( ilen >= 128 )
{ {
if( ( ret = mbedtls_internal_sha512_process( ctx, input ) ) != 0 ) if( ( ret = mbedtls_internal_sha512_process( ctx, input ) ) != 0 )

View file

@ -16,8 +16,8 @@ COSMOPOLITAN_C_START_
*/ */
typedef struct mbedtls_sha512_context typedef struct mbedtls_sha512_context
{ {
uint64_t total[2]; /*!< The number of Bytes processed. */
uint64_t state[8]; /*!< The intermediate digest state. */ uint64_t state[8]; /*!< The intermediate digest state. */
uint64_t total[2]; /*!< The number of Bytes processed. */
unsigned char buffer[128]; /*!< The data block being processed. */ unsigned char buffer[128]; /*!< The data block being processed. */
#if !defined(MBEDTLS_SHA512_NO_SHA384) #if !defined(MBEDTLS_SHA512_NO_SHA384)
int is384; /*!< Determines which function to use: int is384; /*!< Determines which function to use:

View file

@ -1021,17 +1021,15 @@ int execute_tests(int argc, const char **argv, const char *default_filename) {
if (unmet_dep_count > 0 || ret == DISPATCH_UNSUPPORTED_SUITE) { if (unmet_dep_count > 0 || ret == DISPATCH_UNSUPPORTED_SUITE) {
total_skipped++; total_skipped++;
WRITE("----"); WRITE("----");
if (1 == option_verbose && ret == DISPATCH_UNSUPPORTED_SUITE) {
WRITE("\n Test Suite not enabled");
}
if (1 == option_verbose && unmet_dep_count > 0) { if (1 == option_verbose && unmet_dep_count > 0) {
WRITE("\n Unmet dependencies: "); WRITE(" (unmet dependencies: ");
for (i = 0; i < unmet_dep_count; i++) { for (i = 0; i < unmet_dep_count; i++) {
WRITE("%d ", unmet_dependencies[i]); if (i) WRITE(",");
WRITE("%d", unmet_dependencies[i]);
} }
if (missing_unmet_dependencies) WRITE("..."); if (missing_unmet_dependencies) WRITE("...");
} }
WRITE("\n"); WRITE(")\n");
fflush(stdout); fflush(stdout);
unmet_dep_count = 0; unmet_dep_count = 0;
missing_unmet_dependencies = 0; missing_unmet_dependencies = 0;