mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-01-31 03:27:39 +00:00
651 lines
14 KiB
ArmAsm
651 lines
14 KiB
ArmAsm
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
|
│ vi: set noet ft=asm ts=8 sw=8 fenc=utf-8 :vi │
|
|
╞══════════════════════════════════════════════════════════════════════════════╡
|
|
│ │
|
|
│ Copyright 2014 Intel Corporation │
|
|
│ │
|
|
│ Redistribution and use in source and binary forms, with or without │
|
|
│ modification, are permitted provided that the following conditions │
|
|
│ are met: │
|
|
│ │
|
|
│ * Redistributions of source code must retain the above copyright │
|
|
│ notice, this list of conditions and the following disclaimer. │
|
|
│ * Redistributions in binary form must reproduce the above copyright │
|
|
│ notice, this list of conditions and the following disclaimer in │
|
|
│ the documentation and/or other materials provided with the │
|
|
│ distribution. │
|
|
│ * Neither the name of Intel Corporation nor the names of its │
|
|
│ contributors may be used to endorse or promote products derived │
|
|
│ from this software without specific prior written permission. │
|
|
│ │
|
|
│ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS │
|
|
│ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT │
|
|
│ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR │
|
|
│ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT │
|
|
│ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, │
|
|
│ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT │
|
|
│ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, │
|
|
│ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY │
|
|
│ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT │
|
|
│ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE │
|
|
│ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. │
|
|
│ │
|
|
╚─────────────────────────────────────────────────────────────────────────────*/
|
|
#include "libc/macros.h"
|
|
|
|
// Embeds the third-party license notice for this file in the .notice
// section, then returns to the previously active section. The notice
// is two text lines, so the first one must end in an escaped newline.
	.section .notice,"aR",@progbits
	.asciz	"\n\n\
AVX2 SHA-1 (BSD-3 License)\n\
Copyright 2014 Intel Corporation"
	.previous
|
|
|
|
// Incoming arguments. sha1_transform_avx2 copies all three into
// HASH_PTR, BUFFER_PTR/BUFFER_PTR2 and BLOCKS_CTR before the round
// code runs, because %rdi, %rsi and %rdx are reused below as round
// state registers.
#define CTX	%rdi	/* arg1 */
#define BUF	%rsi	/* arg2 */
#define CNT	%rdx	/* arg3 */

// Physical registers (32-bit views) that REGALLOC binds to the
// symbolic round state names A, B, C, D, E, TB and TA.
#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%eax
#define REG_E	%edx
#define REG_TB	%ebx
#define REG_TA	%r12d

// 64-bit views of the same registers. The round macros use them as
// lea address operands and ADD_IF_GE uses RTA as its temporary.
#define REG_RA	%rcx
#define REG_RB	%rsi
#define REG_RC	%rdi
#define REG_RD	%rax
#define REG_RE	%rdx
#define REG_RTA	%r12
#define REG_RTB	%rbx

// Scratch register for the round functions. ROTATE_STATE never
// renames it.
#define REG_T1	%r11d

// Instruction used to load the byte swap shuffle mask.
#define xmm_mov	vmovups

// Round function selectors understood by RND_FUN.
#define RND_F1	1
#define RND_F2	2
#define RND_F3	3
|
|
|
// Binds the symbolic round state names to their initial physical
// registers. ROTATE_STATE permutes A, B, C, D, E, TB and their 64-bit
// twins after every round, so the main body invokes this macro again
// when it needs the initial assignment back. Emits no instructions.
.macro REGALLOC
	.set	A, REG_A
	.set	B, REG_B
	.set	C, REG_C
	.set	D, REG_D
	.set	E, REG_E
	.set	TB, REG_TB
	.set	TA, REG_TA

	.set	RA, REG_RA
	.set	RB, REG_RB
	.set	RC, REG_RC
	.set	RD, REG_RD
	.set	RE, REG_RE

	.set	RTA, REG_RTA
	.set	RTB, REG_RTB

	.set	T1, REG_T1
.endm
|
|
|
|
// Long-lived registers of the main body: the digest pointer, the
// number of blocks still to hash, and one input pointer per block of
// the pair whose schedule is being precalculated.
#define HASH_PTR	%r9
#define BLOCKS_CTR	%r8
#define BUFFER_PTR	%r10
#define BUFFER_PTR2	%r13

// Bases of the two stack areas holding W+K values. PRECALC stores
// through PRECALC_BUF, the rounds read through WK_BUF, and the main
// body exchanges the two registers after each pair of blocks.
#define PRECALC_BUF	%r14
#define WK_BUF		%r15

// Vector temporaries. W_TMP and WY_TMP name the same register at
// 128-bit and 256-bit width.
#define W_TMP	%xmm0
#define WY_TMP	%ymm0
#define WY_TMP2	%ymm9

// AVX2 variables: the eight registers that PRECALC_RESET_WY and
// PRECALC_ROTATE_WY cycle through as the message schedule window.
#define WY0	%ymm3
#define WY4	%ymm5
#define WY08	%ymm7
#define WY12	%ymm8
#define WY16	%ymm12
#define WY20	%ymm13
#define WY24	%ymm14
#define WY28	%ymm15

// Holds the vpshufb control loaded from BSWAP_SHUFB_CTL.
#define YMM_SHUFB_BSWAP	%ymm10
|
|
|
|
/*
 * Keep 2 iterations precalculated at a time:
 *    - 80 DWORDs per iteration * 2
 *
 * W_SIZE is counted in dwords; the function multiplies it by 4 when
 * it sizes the stack reservation.
 */
#define W_SIZE	(80*2*2 +16)

/*
 * WK(t) addresses the precomputed W+K dword for round t relative to
 * WK_BUF. Every group of four rounds owns one 32-byte slot; rounds
 * 0..79 use the low 16 bytes of the slot and rounds 80..159 (the
 * second block of the pair) use the high 16 bytes.
 *
 * PRECALC_WK(t) addresses dword t relative to PRECALC_BUF. The
 * PRECALC_* macros store 32 bytes at PRECALC_WK(i&~7) once per eight
 * values of i.
 */
#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)
|
|
|
|
|
|
// Folds one working register into one digest word.
//
// hash: memory operand holding the digest word
// val:  register holding the working value
//
// Afterwards both operands hold hash + val. Flags are clobbered.
.macro UPDATE_HASH hash, val
	add	\hash, \val
	mov	\val, \hash
.endm
|
|
|
|
// Puts the schedule window names WY_00..WY_28 back on their initial
// registers WY0..WY28 and makes WY_32 an alias of WY_00. Emits no
// instructions.
.macro PRECALC_RESET_WY
	.set	WY_00, WY0
	.set	WY_04, WY4
	.set	WY_08, WY08
	.set	WY_12, WY12
	.set	WY_16, WY16
	.set	WY_20, WY20
	.set	WY_24, WY24
	.set	WY_28, WY28
	.set	WY_32, WY_00
.endm
|
|
|
|
// Advances the schedule window by one vector. Each WY_nn name takes
// over the register of the next younger name, and the register that
// was WY_28 is recycled as the new WY_00. The WY and WY_minus_nn
// aliases used by the PRECALC_* macros are then refreshed. Emits no
// instructions.
.macro PRECALC_ROTATE_WY
	/* Rotate macros */
	.set	WY_32, WY_28
	.set	WY_28, WY_24
	.set	WY_24, WY_20
	.set	WY_20, WY_16
	.set	WY_16, WY_12
	.set	WY_12, WY_08
	.set	WY_08, WY_04
	.set	WY_04, WY_00
	.set	WY_00, WY_32

	/* Define register aliases */
	.set	WY, WY_00
	.set	WY_minus_04, WY_04
	.set	WY_minus_08, WY_08
	.set	WY_minus_12, WY_12
	.set	WY_minus_16, WY_16
	.set	WY_minus_20, WY_20
	.set	WY_minus_24, WY_24
	.set	WY_minus_28, WY_28
	// WY_minus_32 is the same register as WY: the slot about to be
	// overwritten still holds the value from 32 positions back.
	.set	WY_minus_32, WY
.endm
|
|
|
|
// Schedule step for the words that come straight from the input.
// PRECALC dispatches here while i < 32. The step to emit is chosen by
// the assembly-time symbol i, which PRECALC sets before the call; at
// most one instruction group is emitted per invocation so the vector
// work interleaves with the scalar rounds.
//
// Over eight consecutive values of i this loads 16 input bytes from
// BUFFER_PTR into the low lane and 16 from BUFFER_PTR2 into the high
// lane, byte-swaps them, adds the round constant and stores the 32
// bytes at PRECALC_WK(i&~7).
//
// Note: the macro declares no parameters although PRECALC passes its
// s argument; every invocation visible in this file leaves s empty.
.macro PRECALC_00_15
	// Initialize and rotate registers
	.if (i == 0)
		PRECALC_RESET_WY
		PRECALC_ROTATE_WY
	.endif

	/* message scheduling pre-compute for rounds 0-15 */
	.if ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vmovdqu	(i * 2)(BUFFER_PTR), W_TMP
	.elseif ((i & 7) == 1)
		vinsertf128	$1, ((i-1) * 2)(BUFFER_PTR2), WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpshufb	YMM_SHUFB_BSWAP, WY_TMP, WY
	.elseif ((i & 7) == 4)
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
	.elseif ((i & 7) == 7)
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
|
|
|
|
// Schedule step for rounds 16-31. PRECALC dispatches here while
// 32 <= i < 64. As in PRECALC_00_15, the assembly-time symbol i
// selects which slice of the computation is emitted.
.macro PRECALC_16_31
	/*
	 * message scheduling pre-compute for rounds 16-31
	 * calculating last 32 w[i] values in 8 YMM registers
	 * pre-calculate K+w[i] values and store to mem
	 * for later load by ALU add instruction
	 *
	 * "brute force" vectorization for rounds 16-31 only
	 * due to w[i]->w[i-3] dependency
	 */
	.if ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		/* w[i-14] */
		vpalignr	$8, WY_minus_16, WY_minus_12, WY
		vpsrldq	$4, WY_minus_04, WY_TMP		/* w[i-3] */
	.elseif ((i & 7) == 1)
		vpxor	WY_minus_08, WY, WY
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpxor	WY_TMP, WY, WY
		vpslldq	$12, WY, WY_TMP2
	.elseif ((i & 7) == 3)
		// The shift pair here and the vpor in the next step
		// rotate every dword left by one bit.
		vpslld	$1, WY, WY_TMP
		vpsrld	$31, WY, WY
	.elseif ((i & 7) == 4)
		vpor	WY, WY_TMP, WY_TMP
		vpslld	$2, WY_TMP2, WY
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY_TMP2, WY_TMP2
		vpxor	WY, WY_TMP, WY_TMP
	.elseif ((i & 7) == 7)
		vpxor	WY_TMP2, WY_TMP, WY
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
|
|
|
|
// Schedule step for rounds 32-79. PRECALC dispatches here while
// 64 <= i < 160. The assembly-time symbol i selects which slice of
// the computation is emitted.
.macro PRECALC_32_79
	/*
	 * in SHA-1 specification:
	 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
	 * instead we do equal:
	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
	 * allows more efficient vectorization
	 * since w[i]=>w[i-3] dependency is broken
	 */
	.if ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
	.elseif ((i & 7) == 1)
		/* W is W_minus_32 before xor */
		vpxor	WY_minus_28, WY, WY
	.elseif ((i & 7) == 2)
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 3)
		vpxor	WY_TMP, WY, WY
	.elseif ((i & 7) == 4)
		// This shift plus the two instructions of the next step
		// rotate every dword left by two bits.
		vpslld	$2, WY, WY_TMP
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY, WY
		vpor	WY, WY_TMP, WY
	.elseif ((i & 7) == 7)
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
|
|
|
|
// Emits schedule step r (0..159) for the pair of blocks being
// precalculated.
//
// r: step index; it is copied to the assembly-time symbol i that the
//    PRECALC_* macros test
// s: forwarded unchanged to the selected macro; left empty by every
//    invocation visible in this file
//
// K_XMM is set to the byte offset of the row of K_XMM_AR to add: the
// row changes every 40 steps. Steps of 160 and above emit nothing.
.macro PRECALC r, s
	.set	i, \r

	.if (i < 40)
		.set	K_XMM, 32*0
	.elseif (i < 80)
		.set	K_XMM, 32*1
	.elseif (i < 120)
		.set	K_XMM, 32*2
	.else
		.set	K_XMM, 32*3
	.endif

	.if (i<32)
		PRECALC_00_15	\s
	.elseif (i<64)
		PRECALC_16_31	\s
	.elseif (i < 160)
		PRECALC_32_79	\s
	.endif
.endm
|
|
|
|
// Renames the round state after one round instead of moving data:
// E<-D, D<-C, C<-B, B<-TB, TB<-A, A<-old E. The 64-bit names are
// rotated the same way so that each Rx keeps denoting the full-width
// view of x. TA, RTA and T1 are not part of the rotation. Emits no
// instructions.
.macro ROTATE_STATE
	.set	T_REG, E
	.set	E, D
	.set	D, C
	.set	C, B
	.set	B, TB
	.set	TB, A
	.set	A, T_REG

	.set	T_REG, RE
	.set	RE, RD
	.set	RD, RC
	.set	RC, RB
	.set	RB, RTB
	.set	RTB, RA
	.set	RA, T_REG
.endm
|
|
|
|
// Emits round r with the round macro selected by f.
//
// f: RND_F1, RND_F2 or RND_F3 (RR passes its saved ROUND_FUNC)
// r: round index handed to the selected ROUND_Fx macro
//
// Any other value of f emits nothing.
.macro RND_FUN f, r
	.if (\f == RND_F1)
		ROUND_F1	\r
	.elseif (\f == RND_F2)
		ROUND_F2	\r
	.elseif (\f == RND_F3)
		ROUND_F3	\r
	.endif
.endm
|
|
|
|
// Emits two consecutive rounds, r and r+1, renaming the state after
// each one.
//
// r: index of the first round of the pair; values of 80 and above
//    belong to the second block, and round_id is r modulo 80
//
// Every ROUND_Fx macro computes the F value consumed by the round
// after it, so the first round of a block needs F prepared up front;
// that is done here when round_id is 0. ROUND_FUNC persists between
// invocations and is switched between the two rounds of the pairs
// starting at round_id 18, 38 and 58.
.macro RR r
	.set	round_id, (\r % 80)

	// Precalculate F for first round
	.if (round_id == 0)
		.set	ROUND_FUNC, RND_F1
		mov	B, TB

		rorx	$(32-30), B, B		# b>>>2
		// TB = (b & c) ^ (~b & d), using b from before the rotate
		andn	D, TB, T1
		and	C, TB
		xor	T1, TB
	.endif

	RND_FUN	ROUND_FUNC, \r
	ROTATE_STATE

	.if (round_id == 18)
		.set	ROUND_FUNC, RND_F2
	.elseif (round_id == 38)
		.set	ROUND_FUNC, RND_F3
	.elseif (round_id == 58)
		.set	ROUND_FUNC, RND_F2
	.endif

	// ROUND_F2 tests round_id to recognise the last round of a block
	.set	round_id, ( (\r+1) % 80)

	RND_FUN	ROUND_FUNC, (\r+1)
	ROTATE_STATE
.endm
|
|
|
|
// One round whose successor uses F1 = (b & c) ^ (~b & d).
//
// r: round index, used to address WK(r) and to pick the PRECALC step
//
// On entry TB holds F for this round, computed by the previous round
// (or by RR for the first round of a block). E accumulates WK(r), F
// and A rotated left by 5. TB is then overwritten with A rotated
// left by 30 and A with the F value for the next round; ROTATE_STATE
// gives those two registers their new names. T1, TA and the flags
// are clobbered.
.macro ROUND_F1 r
	add	WK(\r), E

	andn	C, A, T1		# ~b&d
	lea	(RE,RTB), E		# Add F from the previous round

	rorx	$(32-5), A, TA		# T2 = A >>> 5
	rorx	$(32-30),A, TB		# b>>>2 for next round

	PRECALC	(\r)			# msg scheduling for next 2 blocks

	// Calculate F for the next round
	// (b & c) ^ andn[b, d]
	and	B, A			# b&c
	xor	T1, A			# F1 = (b&c) ^ (~b&d)

	lea	(RE,RTA), E		# E += A >>> 5
.endm
|
|
|
|
// One round whose successor uses the parity function; the next F is
// built in A by xoring B and then C into it.
//
// r: round index, used to address WK(r) and to pick the PRECALC step
//
// Register contract matches ROUND_F1, except that T1 is not used. On
// the last round of a block (round_id 79) there is no next round, so
// the rotate into TB and both xors are omitted and A and TB keep
// their values for the digest update that follows.
.macro ROUND_F2 r
	add	WK(\r), E
	lea	(RE,RTB), E		# Add F from the previous round

	/* Calculate F for the next round */
	rorx	$(32-5), A, TA		# T2 = A >>> 5
	.if ((round_id) < 79)
		rorx	$(32-30), A, TB	# b>>>2 for next round
	.endif
	PRECALC	(\r)			# msg scheduling for next 2 blocks

	.if ((round_id) < 79)
		xor	B, A
	.endif

	add	TA, E			# E += A >>> 5

	.if ((round_id) < 79)
		xor	C, A
	.endif
.endm
|
|
|
|
// One round whose successor uses the majority function
// (b and c) or (d and (b or c)).
//
// r: round index, used to address WK(r) and to pick the PRECALC step
//
// Register contract matches ROUND_F1: E accumulates WK(r), the F
// value left in TB by the previous round, and A rotated left by 5;
// TB and A are overwritten for the next round. T1, TA and the flags
// are clobbered.
.macro ROUND_F3 r
	add	WK(\r), E
	PRECALC	(\r)			# msg scheduling for next 2 blocks

	lea	(RE,RTB), E		# Add F from the previous round

	mov	B, T1
	or	A, T1

	rorx	$(32-5), A, TA		# T2 = A >>> 5
	rorx	$(32-30), A, TB		# b>>>2 for next round

	// Calculate F for the next round
	// (b and c) or (d and (b or c))
	and	C, T1
	and	B, A
	or	T1, A

	add	TA, E			# E += A >>> 5

.endm
|
|
|
|
// Conditionally advances a pointer: a = (b >= c) ? a + d : a
//
// a: 64-bit register to advance
// b: 64-bit register to compare
// c: immediate threshold (written without the dollar sign)
// d: immediate increment (written without the dollar sign)
//
// The comparison is signed (cmovge). RTA is used as a temporary and
// the flags are clobbered.
.macro ADD_IF_GE a, b, c, d
	mov	\a, RTA
	add	$\d, RTA
	cmp	$\c, \b
	cmovge	RTA, \a
.endm
|
|
|
|
// Performs 80 rounds of SHA-1 for multiple blocks with s/w pipelining
//
// Expects HASH_PTR, BUFFER_PTR, BUFFER_PTR2 and BLOCKS_CTR to be
// loaded, YMM_SHUFB_BSWAP to hold the byte swap mask, and %rsp to
// point at the scratch area reserved by the caller. Blocks are hashed
// in pairs: while the scalar rounds consume the W+K values of the
// current pair through WK_BUF, the PRECALC steps embedded in every
// round write the values of the next pair through PRECALC_BUF.
.macro SHA1_PIPELINED_MAIN_BODY

	REGALLOC

	mov	(HASH_PTR), A
	mov	4(HASH_PTR), B
	mov	8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	// Two W+K areas on the stack, 2*4*80+32 bytes apart
	mov	%rsp, PRECALC_BUF
	lea	(2*4*80+32)(%rsp), WK_BUF

	// Precalc WK for first 2 blocks
	// BUFFER_PTR2 moves on to the second block only when there are
	// at least two; otherwise both pointers read the same block.
	ADD_IF_GE	BUFFER_PTR2, BLOCKS_CTR, 2, 64
	.set	i, 0
	.rept	160
		PRECALC	i
		.set	i, i + 1
	.endr

	// Go to next block if needed
	ADD_IF_GE	BUFFER_PTR, BLOCKS_CTR, 3, 128
	ADD_IF_GE	BUFFER_PTR2, BLOCKS_CTR, 4, 128
	// The area just filled becomes the one the rounds read from
	xchg	WK_BUF, PRECALC_BUF

	.balign	32
.L_loop:

	// code loops through more than one pair of blocks;
	// BLOCKS_CTR reaching zero signals that the last block
	// has been processed
	test	BLOCKS_CTR, BLOCKS_CTR
	jnz	.L_begin
	.balign	32
	jmp	.L_end

	.balign	32
.L_begin:

	// process first block
	// rounds: 0,2,4,6,8
	.set	j, 0
	.rept	5
		RR	j
		.set	j, j+2
	.endr

	jmp	.L_loop0
.L_loop0:

	// rounds
	// 10,12,14,16,18
	// 20,22,24,26,28
	// 30,32,34,36,38
	// 40,42,44,46,48
	// 50,52,54,56,58
	.rept	25
		RR	j
		.set	j, j+2
	.endr

	// Update counter
	sub	$1, BLOCKS_CTR

	// Move to the next block only if needed
	ADD_IF_GE	BUFFER_PTR, BLOCKS_CTR, 4, 128

	// rounds
	// 60,62,64,66,68
	// 70,72,74,76,78
	.rept	10
		RR	j
		.set	j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	// That was the last block: .L_loop sees the zero counter and
	// leaves through .L_end
	test	BLOCKS_CTR, BLOCKS_CTR
	jz	.L_loop

	// RR at round_id 0 reads the b value from B
	mov	TB, B

	// process second block
	// 0+80, 2+80, 4+80, 6+80, 8+80
	// 10+80,12+80,14+80,16+80,18+80

	.set	j, 0
	.rept	10
		RR	j+80
		.set	j, j+2
	.endr

	jmp	.L_loop1
.L_loop1:

	// rounds
	// 20+80,22+80,24+80,26+80,28+80
	// 30+80,32+80,34+80,36+80,38+80
	.rept	10
		RR	j+80
		.set	j, j+2
	.endr

	jmp	.L_loop2
.L_loop2:

	// rounds
	// 40+80,42+80,44+80,46+80,48+80
	// 50+80,52+80,54+80,56+80,58+80
	.rept	10
		RR	j+80
		.set	j, j+2
	.endr

	// update counter
	sub	$1, BLOCKS_CTR

	// Move to the next block only if needed
	ADD_IF_GE	BUFFER_PTR2, BLOCKS_CTR, 4, 128

	jmp	.L_loop3
.L_loop3:

	// rounds
	// 60+80,62+80,64+80,66+80,68+80
	// 70+80,72+80,74+80,76+80,78+80
	.rept	10
		RR	j+80
		.set	j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* Reset state for AVX2 reg permutation */
	// The names below still carry the rotation left by the rounds
	// above. These moves place the values in the registers that the
	// names denote once REGALLOC restores the initial binding.
	mov	A, TA
	mov	TB, A
	mov	C, TB
	mov	E, C
	mov	D, B
	mov	TA, D

	REGALLOC

	xchg	WK_BUF, PRECALC_BUF

	jmp	.L_loop

	.balign	32
.L_end:

.endm
|
|
|
|
// Read-only tables used by the vector precalculation. The trailing
// .text directive switches back to the code section for the function
// that follows.
	.section .rodata

// The four SHA-1 round constants.
#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

	.balign	128
// Four 32-byte rows, each one constant replicated across eight
// dwords. PRECALC selects a row through K_XMM (0, 32, 64 or 96).
K_XMM_AR:
	.long	K1,K1,K1,K1
	.long	K1,K1,K1,K1
	.long	K2,K2,K2,K2
	.long	K2,K2,K2,K2
	.long	K3,K3,K3,K3
	.long	K3,K3,K3,K3
	.long	K4,K4,K4,K4
	.long	K4,K4,K4,K4

// vpshufb control loaded into YMM_SHUFB_BSWAP. The same 16-byte
// pattern is given for both 128-bit lanes; it reverses the byte order
// inside every dword.
BSWAP_SHUFB_CTL:
	.long	0x00010203
	.long	0x04050607
	.long	0x08090a0b
	.long	0x0c0d0e0f
	.long	0x00010203
	.long	0x04050607
	.long	0x08090a0b
	.long	0x0c0d0e0f
	.text
|
|
|
|
//	Performs Intel® AVX2™ optimized SHA-1 update.
//
//	This implementation is based on the previous SSSE3 release:
//	Visit http://software.intel.com/en-us/articles/ and refer
//	to improving-the-performance-of-the-secure-hash-algorithm-1/
//
//	Updates 20-byte SHA-1 record at start of 'state', from 'input',
//	for even number of 'blocks' consecutive 64-byte blocks.
//
//	    void sha1_transform_avx2(struct sha1_state *state,
//	                             const uint8_t *input,
//	                             int blocks);
//
//	The rbx and r12-r15 registers are saved and restored here, the
//	stack pointer is aligned down to 32 bytes before the scratch
//	area is carved out, and vzeroupper is issued on entry and
//	before returning.
//
//	NOTE(review): the whole of %rdx is copied into the block
//	counter and then tested and compared at 64-bit width, so the
//	upper half of the register must agree with the 'int' value;
//	confirm that callers guarantee this.
//
//	NOTE(review): the text above asks for an even block count, yet
//	the main body also has an exit after the first block of a pair
//	when the counter reaches zero; confirm the intended contract.
//
//	@param	%rdi points to output digest
//	@param	%rsi points to input data
//	@param	%rdx is number of 64-byte blocks to process
//	@see	X86_HAVE(SHA)
	.ftrace1
sha1_transform_avx2:
	.ftrace2
	push	%rbp
	mov	%rsp,%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	RESERVE_STACK = (W_SIZE*4 + 8+24)
	/* Align stack */
	// The unaligned stack pointer is parked in rbx and pushed just
	// below the aligned boundary, where the pop of rsp in the
	// epilogue finds it again.
	mov	%rsp,%rbx
	and	$~(0x20-1),%rsp
	push	%rbx
	sub	$RESERVE_STACK,%rsp
	vzeroupper
	/* Setup initial values */
	// The argument registers double as round state, so move the
	// arguments out of the way first.
	mov	CTX,HASH_PTR
	mov	BUF,BUFFER_PTR
	mov	BUF,BUFFER_PTR2
	mov	CNT,BLOCKS_CTR
	xmm_mov	BSWAP_SHUFB_CTL(%rip),YMM_SHUFB_BSWAP
	SHA1_PIPELINED_MAIN_BODY
	vzeroupper
	add	$RESERVE_STACK,%rsp
	pop	%rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
	.endfn	sha1_transform_avx2,globl
|