/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│                                                                              │
│  Copyright 2014 Intel Corporation                                            │
│                                                                              │
│  Redistribution and use in source and binary forms, with or without         │
│  modification, are permitted provided that the following conditions         │
│  are met:                                                                    │
│                                                                              │
│    * Redistributions of source code must retain the above copyright         │
│      notice, this list of conditions and the following disclaimer.          │
│    * Redistributions in binary form must reproduce the above copyright      │
│      notice, this list of conditions and the following disclaimer in        │
│      the documentation and/or other materials provided with the             │
│      distribution.                                                          │
│    * Neither the name of Intel Corporation nor the names of its             │
│      contributors may be used to endorse or promote products derived        │
│      from this software without specific prior written permission.          │
│                                                                              │
│  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS        │
│  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT          │
│  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR      │
│  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT       │
│  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,      │
│  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT           │
│  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,      │
│  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY      │
│  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT        │
│  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE      │
│  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.       │
│                                                                              │
╚──────────────────────────────────────────────────────────────────────────────*/
#include "libc/macros.internal.h"
.ident "\n\
AVX2 SHA-1 (BSD-3 License)\n\
Copyright 2014 Intel Corporation\n"
.include "libc/disclaimer.inc"

#define CTX     %rdi    /* arg1 */
#define BUF     %rsi    /* arg2 */
#define CNT     %rdx    /* arg3 */

#define REG_A   %ecx
#define REG_B   %esi
#define REG_C   %edi
#define REG_D   %eax
#define REG_E   %edx
#define REG_TB  %ebx
#define REG_TA  %r12d
#define REG_RA  %rcx
#define REG_RB  %rsi
#define REG_RC  %rdi
#define REG_RD  %rax
#define REG_RE  %rdx
#define REG_RTA %r12
#define REG_RTB %rbx
#define REG_T1  %r11d
#define xmm_mov vmovups

#define RND_F1  1
#define RND_F2  2
#define RND_F3  3

.macro REGALLOC
        .set A, REG_A
        .set B, REG_B
        .set C, REG_C
        .set D, REG_D
        .set E, REG_E
        .set TB, REG_TB
        .set TA, REG_TA
        .set RA, REG_RA
        .set RB, REG_RB
        .set RC, REG_RC
        .set RD, REG_RD
        .set RE, REG_RE
        .set RTA, REG_RTA
        .set RTB, REG_RTB
        .set T1, REG_T1
.endm

#define HASH_PTR        %r9
#define BLOCKS_CTR      %r8
#define BUFFER_PTR      %r10
#define BUFFER_PTR2     %r13

#define PRECALC_BUF     %r14
#define WK_BUF          %r15

#define W_TMP           %xmm0
#define WY_TMP          %ymm0
#define WY_TMP2         %ymm9

# AVX2 variables
#define WY0             %ymm3
#define WY4             %ymm5
#define WY08            %ymm7
#define WY12            %ymm8
#define WY16            %ymm12
#define WY20            %ymm13
#define WY24            %ymm14
#define WY28            %ymm15

#define YMM_SHUFB_BSWAP %ymm10

/*
 * Keep 2 iterations precalculated at a time:
 *   - 80 DWORDs per iteration * 2
 */
#define W_SIZE          (80*2*2 + 16)

#define WK(t)           ((((t) % 80) / 4)*32 + ((t) % 4)*4 + ((t)/80)*16)(WK_BUF)
#define PRECALC_WK(t)   ((t)*2*2)(PRECALC_BUF)

.macro UPDATE_HASH hash, val
        add     \hash, \val
        mov     \val, \hash
.endm
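/*
 * Editorial sketch (added; not part of the original Intel source): WK(t)
 * above addresses the precomputed w[t]+K values for the two software
 * pipelined blocks, which are interleaved in WK_BUF. In C-like terms the
 * macro computes a byte offset of
 *
 *     offset(t) = ((t % 80) / 4) * 32   // one 32-byte YMM row per 4 rounds
 *               + (t % 4) * 4           // dword within the 16-byte lane
 *               + (t / 80) * 16;        // second block lives in the upper lane
 *
 * UPDATE_HASH(hash, val) computes *hash += val and leaves the updated
 * digest word in both the memory slot and the register.
 */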
.macro PRECALC_RESET_WY
        .set WY_00, WY0
        .set WY_04, WY4
        .set WY_08, WY08
        .set WY_12, WY12
        .set WY_16, WY16
        .set WY_20, WY20
        .set WY_24, WY24
        .set WY_28, WY28
        .set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
        /* Rotate macros */
        .set WY_32, WY_28
        .set WY_28, WY_24
        .set WY_24, WY_20
        .set WY_20, WY_16
        .set WY_16, WY_12
        .set WY_12, WY_08
        .set WY_08, WY_04
        .set WY_04, WY_00
        .set WY_00, WY_32

        /* Define register aliases */
        .set WY, WY_00
        .set WY_minus_04, WY_04
        .set WY_minus_08, WY_08
        .set WY_minus_12, WY_12
        .set WY_minus_16, WY_16
        .set WY_minus_20, WY_20
        .set WY_minus_24, WY_24
        .set WY_minus_28, WY_28
        .set WY_minus_32, WY
.endm

.macro PRECALC_00_15
        .if (i == 0)    # Initialize and rotate registers
                PRECALC_RESET_WY
                PRECALC_ROTATE_WY
        .endif

        /* message scheduling pre-compute for rounds 0-15 */
        .if ((i & 7) == 0)
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                vmovdqu (i * 2)(BUFFER_PTR), W_TMP
        .elseif ((i & 7) == 1)
                vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
                            WY_TMP, WY_TMP
        .elseif ((i & 7) == 2)
                vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
        .elseif ((i & 7) == 4)
                vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
        .elseif ((i & 7) == 7)
                vmovdqu WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm

.macro PRECALC_16_31
        /*
         * message scheduling pre-compute for rounds 16-31
         * calculating last 32 w[i] values in 8 XMM registers
         * pre-calculate K+w[i] values and store to mem
         * for later load by ALU add instruction
         *
         * "brute force" vectorization for rounds 16-31 only
         * due to w[i]->w[i-3] dependency
         */
        .if ((i & 7) == 0)
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                /* w[i-14] */
                vpalignr $8, WY_minus_16, WY_minus_12, WY
                vpsrldq  $4, WY_minus_04, WY_TMP        /* w[i-3] */
        .elseif ((i & 7) == 1)
                vpxor   WY_minus_08, WY, WY
                vpxor   WY_minus_16, WY_TMP, WY_TMP
        .elseif ((i & 7) == 2)
                vpxor   WY_TMP, WY, WY
                vpslldq $12, WY, WY_TMP2
        .elseif ((i & 7) == 3)
                vpslld  $1, WY, WY_TMP
                vpsrld  $31, WY, WY
        .elseif ((i & 7) == 4)
                vpor    WY, WY_TMP, WY_TMP
                vpslld  $2, WY_TMP2, WY
        .elseif ((i & 7) == 5)
                vpsrld  $30, WY_TMP2, WY_TMP2
                vpxor   WY, WY_TMP, WY_TMP
        .elseif ((i & 7) == 7)
                vpxor   WY_TMP2, WY_TMP, WY
                vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm

.macro PRECALC_32_79
        /*
         * in the SHA-1 specification:
         *   w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
         * instead we use the equivalent:
         *   w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
         * which allows more efficient vectorization,
         * since the w[i]->w[i-3] dependency is broken
         */
        .if ((i & 7) == 0)
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
        .elseif ((i & 7) == 1)
                /* W is W_minus_32 before xor */
                vpxor   WY_minus_28, WY, WY
        .elseif ((i & 7) == 2)
                vpxor   WY_minus_16, WY_TMP, WY_TMP
        .elseif ((i & 7) == 3)
                vpxor   WY_TMP, WY, WY
        .elseif ((i & 7) == 4)
                vpslld  $2, WY, WY_TMP
        .elseif ((i & 7) == 5)
                vpsrld  $30, WY, WY
                vpor    WY, WY_TMP, WY
        .elseif ((i & 7) == 7)
                vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm

.macro PRECALC r, s
        .set i, \r

        .if (i < 40)
                .set K_XMM, 32*0
        .elseif (i < 80)
                .set K_XMM, 32*1
        .elseif (i < 120)
                .set K_XMM, 32*2
        .else
                .set K_XMM, 32*3
        .endif

        .if (i<32)
                PRECALC_00_15   \s
        .elseif (i<64)
                PRECALC_16_31   \s
        .elseif (i < 160)
                PRECALC_32_79   \s
        .endif
.endm

.macro ROTATE_STATE
        .set T_REG, E
        .set E, D
        .set D, C
        .set C, B
        .set B, TB
        .set TB, A
        .set A, T_REG

        .set T_REG, RE
        .set RE, RD
        .set RD, RC
        .set RC, RB
        .set RB, RTB
        .set RTB, RA
        .set RA, T_REG
.endm
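/*
 * Editorial note (added; not part of the original Intel source): the rol-2
 * form used by PRECALC_32_79 follows from expanding each term of the
 * standard recurrence w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * one more level; because rotation distributes over xor, twelve of the
 * sixteen resulting terms cancel in pairs, leaving, for i >= 32,
 *
 *     w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 *
 * which no longer depends on w[i-3] and therefore lets eight consecutive
 * w values be computed in a single YMM register. A scalar C reference for
 * that part of the schedule, assuming rol() is a 32-bit rotate-left, would
 * look like:
 *
 *     for (i = 32; i < 80; ++i)
 *         w[i] = rol(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2);
 */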
// Macro relies on saved ROUND_Fx
.macro RND_FUN f, r
        .if (\f == RND_F1)
                ROUND_F1        \r
        .elseif (\f == RND_F2)
                ROUND_F2        \r
        .elseif (\f == RND_F3)
                ROUND_F3        \r
        .endif
.endm

.macro RR r
        .set round_id, (\r % 80)

        .if (round_id == 0)     # Precalculate F for first round
                .set ROUND_FUNC, RND_F1
                mov     B, TB
                rorx    $(32-30), B, B          # b>>>2
                andn    D, TB, T1
                and     C, TB
                xor     T1, TB
        .endif

        RND_FUN ROUND_FUNC, \r
        ROTATE_STATE

        .if (round_id == 18)
                .set ROUND_FUNC, RND_F2
        .elseif (round_id == 38)
                .set ROUND_FUNC, RND_F3
        .elseif (round_id == 58)
                .set ROUND_FUNC, RND_F2
        .endif

        .set round_id, ( (\r+1) % 80)

        RND_FUN ROUND_FUNC, (\r+1)
        ROTATE_STATE
.endm

.macro ROUND_F1 r
        add     WK(\r), E
        andn    C, A, T1                # ~b&d
        lea     (RE,RTB), E             # Add F from the previous round
        rorx    $(32-5), A, TA          # T2 = A >>> 5
        rorx    $(32-30), A, TB         # b>>>2 for next round

        PRECALC (\r)                    # msg scheduling for next 2 blocks

        // Calculate F for the next round
        // (b & c) ^ andn[b, d]
        and     B, A                    # b&c
        xor     T1, A                   # F1 = (b&c) ^ (~b&d)

        lea     (RE,RTA), E             # E += A >>> 5
.endm

.macro ROUND_F2 r
        add     WK(\r), E
        lea     (RE,RTB), E             # Add F from the previous round

        /* Calculate F for the next round */
        rorx    $(32-5), A, TA          # T2 = A >>> 5
        .if ((round_id) < 79)
                rorx    $(32-30), A, TB # b>>>2 for next round
        .endif
        PRECALC (\r)                    # msg scheduling for next 2 blocks
        .if ((round_id) < 79)
                xor     B, A
        .endif

        add     TA, E                   # E += A >>> 5

        .if ((round_id) < 79)
                xor     C, A
        .endif
.endm

.macro ROUND_F3 r
        add     WK(\r), E
        PRECALC (\r)                    # msg scheduling for next 2 blocks

        lea     (RE,RTB), E             # Add F from the previous round

        mov     B, T1
        or      A, T1

        rorx    $(32-5), A, TA          # T2 = A >>> 5
        rorx    $(32-30), A, TB         # b>>>2 for next round

        // Calculate F for the next round
        // (b and c) or (d and (b or c))
        and     C, T1
        and     B, A
        or      T1, A

        add     TA, E                   # E += A >>> 5
.endm

// Add \d to \a only if \b >= \c (uses RTA as scratch):
//   \a += (\b >= \c) ? \d : 0
.macro ADD_IF_GE a, b, c, d
        mov     \a, RTA
        add     $\d, RTA
        cmp     $\c, \b
        cmovge  RTA, \a
.endm
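/*
 * Editorial sketch (added; not part of the original Intel source): up to
 * the software pipelining above (F for a round is prepared during the
 * previous round, and the a..e renaming is done symbolically by
 * ROTATE_STATE), each ROUND_Fx macro performs one SHA-1 step of
 *
 *     e += rol(a, 5) + f(b, c, d) + wk[t];   // wk[t] = w[t] + K, from WK(t)
 *     b  = rol(b, 30);
 *
 * with f being F1 = (b & c) ^ (~b & d) for rounds 0-19,
 * F2 = b ^ c ^ d for rounds 20-39 and 60-79, and
 * F3 = (b & c) | (b & d) | (c & d) for rounds 40-59.
 */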
// Performs 80 rounds of SHA-1 for multiple blocks with s/w pipelining
.macro SHA1_PIPELINED_MAIN_BODY

        REGALLOC

        mov     (HASH_PTR), A
        mov     4(HASH_PTR), B
        mov     8(HASH_PTR), C
        mov     12(HASH_PTR), D
        mov     16(HASH_PTR), E

        mov     %rsp, PRECALC_BUF
        lea     (2*4*80+32)(%rsp), WK_BUF

        // Precalc WK for first 2 blocks
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
        .set i, 0
        .rept 160
                PRECALC i
                .set i, i + 1
        .endr

        // Go to next block if needed
        ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
        xchg    WK_BUF, PRECALC_BUF

        .balign 32
.L_loop:
        // the code loops through more than one block;
        // BLOCKS_CTR reaching zero signals that the last block is done
        test    BLOCKS_CTR, BLOCKS_CTR
        jnz     .L_begin
        .balign 32
        jmp     .L_end
        .balign 32
.L_begin:

        // process first block
        // rounds: 0,2,4,6,8
        .set j, 0
        .rept 5
                RR      j
                .set j, j+2
        .endr

        jmp     .L_loop0
.L_loop0:

        // rounds
        // 10,12,14,16,18
        // 20,22,24,26,28
        // 30,32,34,36,38
        // 40,42,44,46,48
        // 50,52,54,56,58
        .rept 25
                RR      j
                .set j, j+2
        .endr

        // Update counter
        sub     $1, BLOCKS_CTR
        // Move to the next block only if needed
        ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128

        // rounds
        // 60,62,64,66,68
        // 70,72,74,76,78
        .rept 10
                RR      j
                .set j, j+2
        .endr

        UPDATE_HASH     (HASH_PTR), A
        UPDATE_HASH     4(HASH_PTR), TB
        UPDATE_HASH     8(HASH_PTR), C
        UPDATE_HASH     12(HASH_PTR), D
        UPDATE_HASH     16(HASH_PTR), E

        test    BLOCKS_CTR, BLOCKS_CTR
        jz      .L_loop

        mov     TB, B

        // process second block
        // 0+80, 2+80, 4+80, 6+80, 8+80
        // 10+80,12+80,14+80,16+80,18+80
        .set j, 0
        .rept 10
                RR      j+80
                .set j, j+2
        .endr

        jmp     .L_loop1
.L_loop1:

        // rounds
        // 20+80,22+80,24+80,26+80,28+80
        // 30+80,32+80,34+80,36+80,38+80
        .rept 10
                RR      j+80
                .set j, j+2
        .endr

        jmp     .L_loop2
.L_loop2:

        // rounds
        // 40+80,42+80,44+80,46+80,48+80
        // 50+80,52+80,54+80,56+80,58+80
        .rept 10
                RR      j+80
                .set j, j+2
        .endr

        // Update counter
        sub     $1, BLOCKS_CTR
        // Move to the next block only if needed
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

        jmp     .L_loop3
.L_loop3:

        // rounds
        // 60+80,62+80,64+80,66+80,68+80
        // 70+80,72+80,74+80,76+80,78+80
        .rept 10
                RR      j+80
                .set j, j+2
        .endr

        UPDATE_HASH     (HASH_PTR), A
        UPDATE_HASH     4(HASH_PTR), TB
        UPDATE_HASH     8(HASH_PTR), C
        UPDATE_HASH     12(HASH_PTR), D
        UPDATE_HASH     16(HASH_PTR), E

        /* Reset state for AVX2 reg permutation */
        mov     A, TA
        mov     TB, A
        mov     C, TB
        mov     E, C
        mov     D, B
        mov     TA, D

        REGALLOC

        xchg    WK_BUF, PRECALC_BUF

        jmp     .L_loop
        .balign 32
.L_end:

.endm

        .section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

        .balign 128
K_XMM_AR:
        .long   K1,K1,K1,K1
        .long   K1,K1,K1,K1
        .long   K2,K2,K2,K2
        .long   K2,K2,K2,K2
        .long   K3,K3,K3,K3
        .long   K3,K3,K3,K3
        .long   K4,K4,K4,K4
        .long   K4,K4,K4,K4

BSWAP_SHUFB_CTL:
        .long   0x00010203
        .long   0x04050607
        .long   0x08090a0b
        .long   0x0c0d0e0f
        .long   0x00010203
        .long   0x04050607
        .long   0x08090a0b
        .long   0x0c0d0e0f

        .text

// Performs Intel® AVX2 optimized SHA-1 update.
//
// This implementation is based on the previous SSSE3 release:
// visit http://software.intel.com/en-us/articles/ and refer to
// improving-the-performance-of-the-secure-hash-algorithm-1/
//
// Updates the 20-byte SHA-1 record at the start of 'state' from
// 'input', for an even number ('blocks') of consecutive 64-byte
// blocks.
//
//     void sha1_transform_avx2(struct sha1_state *state,
//                              const uint8_t *input,
//                              int blocks);
//
// @param %rdi points to output digest
// @param %rsi points to input data
// @param %rdx is number of 64-byte blocks to process
// @see X86_HAVE(SHA)
        .ftrace1
sha1_transform_avx2:
        .ftrace2
        push    %rbp
        mov     %rsp,%rbp
        push    %rbx
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        RESERVE_STACK = (W_SIZE*4 + 8+24)
        /* Align stack */
        mov     %rsp,%rbx
        and     $~(0x20-1),%rsp
        push    %rbx
        sub     $RESERVE_STACK,%rsp
        vzeroupper
        /* Setup initial values */
        mov     CTX,HASH_PTR
        mov     BUF,BUFFER_PTR
        mov     BUF,BUFFER_PTR2
        mov     CNT,BLOCKS_CTR
        xmm_mov BSWAP_SHUFB_CTL(%rip),YMM_SHUFB_BSWAP
        SHA1_PIPELINED_MAIN_BODY
        vzeroupper
        add     $RESERVE_STACK,%rsp
        pop     %rsp
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbx
        pop     %rbp
        ret
        .endfn  sha1_transform_avx2,globl
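/*
 * Editorial usage sketch (added; not part of the original Intel source;
 * the struct initializer layout and the feature test shown are
 * assumptions about the surrounding tree): this routine only compresses
 * whole 64-byte blocks, so message padding and length encoding remain
 * the caller's responsibility, e.g.
 *
 *     struct sha1_state st = {{0x67452301, 0xefcdab89, 0x98badcfe,
 *                              0x10325476, 0xc3d2e1f0}};
 *     if (X86_HAVE(AVX2))
 *         sha1_transform_avx2(&st, data, nblocks);  // nblocks * 64 bytes
 */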