mirror of https://github.com/jart/cosmopolitan.git
synced 2025-02-07 15:03:34 +00:00

Make sha1 / sha256 / sha512 go faster

parent 5144c22189
commit 2d79ab6c15

14 changed files with 2299 additions and 93 deletions
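
The new entry points are raw block transforms built on AVX2 plus BMI2 (rorx/andn), so callers are expected to select them at runtime. A minimal sketch of such a dispatcher, assuming cosmopolitan's X86_HAVE() feature-test macro and a hypothetical portable fallback (neither is shown in this diff):

    #include "libc/nexgen32e/x86feature.h"  /* assumed header for X86_HAVE() */
    #include <stdint.h>

    struct sha256_state { uint32_t h[8]; };  /* digest words; layout assumed */

    void sha256_transform_rorx(struct sha256_state *, const uint8_t *, int);
    void sha256_transform_generic(struct sha256_state *, const uint8_t *,
                                  int);  /* hypothetical scalar fallback */

    static void sha256_blocks(struct sha256_state *s, const uint8_t *p, int n) {
      if (X86_HAVE(AVX2) && X86_HAVE(BMI2)) {
        sha256_transform_rorx(s, p, n);  /* fast path added by this commit */
      } else {
        sha256_transform_generic(s, p, n);
      }
    }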
@@ -1,33 +0,0 @@
#if 0
/*─────────────────────────────────────────────────────────────────╗
│ To the extent possible under law, Justine Tunney has waived      │
│ all copyright and related or neighboring rights to this file,    │
│ as it is written in the following disclaimers:                   │
│   • http://unlicense.org/                                        │
│   • http://creativecommons.org/publicdomain/zero/1.0/            │
╚─────────────────────────────────────────────────────────────────*/
#endif
#include "libc/macros.internal.h"

        .rodata.cst16
        .align  16
kSha256Tab:
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
        .endobj kSha256Tab,globl,hidden
        .previous
        .source __FILE__
681 libc/nexgen32e/sha1.S Normal file

@@ -0,0 +1,681 @@
/*
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *
 *   - Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * Visit http://software.intel.com/en-us/articles/
 * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Updates the 20-byte SHA-1 record at the start of 'state', from 'input',
 * for an even number of 'blocks' consecutive 64-byte blocks.
 *
 * extern "C" void sha1_transform_avx2(
 *     struct sha1_state *state, const uint8_t *input, int blocks);
 */

#include "libc/macros.internal.h"

        .ident "\n\
AVX2 SHA-1 (BSD-3 License)\n\
Copyright 2014 Intel Corporation\n"
        .include "libc/disclaimer.inc"
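
/*
 * Illustrative usage (not part of the original file), assuming a
 * sha1_state whose first five 32-bit words hold the running digest;
 * per the comment above, 'blocks' should be even here:
 *
 *   #include <stdint.h>
 *   struct sha1_state { uint32_t h[5]; };   // layout assumed for sketch
 *   void sha1_transform_avx2(struct sha1_state *, const uint8_t *, int);
 *
 *   void sha1_blocks(struct sha1_state *s, const uint8_t *p, int n) {
 *     int even = n & ~1;                    // two-block software pipeline
 *     if (even) sha1_transform_avx2(s, p, even);
 *     // a trailing odd block would go to a scalar/SSSE3 routine here
 *   }
 */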

#define CTX     %rdi    /* arg1 */
#define BUF     %rsi    /* arg2 */
#define CNT     %rdx    /* arg3 */

#define REG_A   %ecx
#define REG_B   %esi
#define REG_C   %edi
#define REG_D   %eax
#define REG_E   %edx
#define REG_TB  %ebx
#define REG_TA  %r12d
#define REG_RA  %rcx
#define REG_RB  %rsi
#define REG_RC  %rdi
#define REG_RD  %rax
#define REG_RE  %rdx
#define REG_RTA %r12
#define REG_RTB %rbx
#define REG_T1  %r11d
#define xmm_mov vmovups
#define avx2_zeroupper vzeroupper
#define RND_F1  1
#define RND_F2  2
#define RND_F3  3

.macro REGALLOC
        .set A, REG_A
        .set B, REG_B
        .set C, REG_C
        .set D, REG_D
        .set E, REG_E
        .set TB, REG_TB
        .set TA, REG_TA

        .set RA, REG_RA
        .set RB, REG_RB
        .set RC, REG_RC
        .set RD, REG_RD
        .set RE, REG_RE

        .set RTA, REG_RTA
        .set RTB, REG_RTB

        .set T1, REG_T1
.endm

#define HASH_PTR        %r9
#define BLOCKS_CTR      %r8
#define BUFFER_PTR      %r10
#define BUFFER_PTR2     %r13

#define PRECALC_BUF     %r14
#define WK_BUF          %r15

#define W_TMP           %xmm0
#define WY_TMP          %ymm0
#define WY_TMP2         %ymm9

# AVX2 variables
#define WY0             %ymm3
#define WY4             %ymm5
#define WY08            %ymm7
#define WY12            %ymm8
#define WY16            %ymm12
#define WY20            %ymm13
#define WY24            %ymm14
#define WY28            %ymm15

#define YMM_SHUFB_BSWAP %ymm10

/*
 * Keep 2 iterations precalculated at a time:
 *   - 80 DWORDs per iteration * 2
 */
#define W_SIZE (80*2*2 +16)

#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)

.macro UPDATE_HASH hash, val
        add     \hash, \val
        mov     \val, \hash
.endm

.macro PRECALC_RESET_WY
        .set WY_00, WY0
        .set WY_04, WY4
        .set WY_08, WY08
        .set WY_12, WY12
        .set WY_16, WY16
        .set WY_20, WY20
        .set WY_24, WY24
        .set WY_28, WY28
        .set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
        /* Rotate macros */
        .set WY_32, WY_28
        .set WY_28, WY_24
        .set WY_24, WY_20
        .set WY_20, WY_16
        .set WY_16, WY_12
        .set WY_12, WY_08
        .set WY_08, WY_04
        .set WY_04, WY_00
        .set WY_00, WY_32

        /* Define register aliases */
        .set WY, WY_00
        .set WY_minus_04, WY_04
        .set WY_minus_08, WY_08
        .set WY_minus_12, WY_12
        .set WY_minus_16, WY_16
        .set WY_minus_20, WY_20
        .set WY_minus_24, WY_24
        .set WY_minus_28, WY_28
        .set WY_minus_32, WY
.endm

.macro PRECALC_00_15
        .if (i == 0)    # Initialize and rotate registers
                PRECALC_RESET_WY
                PRECALC_ROTATE_WY
        .endif

        /* message scheduling pre-compute for rounds 0-15 */
        .if ((i & 7) == 0)
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                vmovdqu (i * 2)(BUFFER_PTR), W_TMP
        .elseif ((i & 7) == 1)
                vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
                        WY_TMP, WY_TMP
        .elseif ((i & 7) == 2)
                vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
        .elseif ((i & 7) == 4)
                vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
        .elseif ((i & 7) == 7)
                vmovdqu WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm

.macro PRECALC_16_31
        /*
         * message scheduling pre-compute for rounds 16-31,
         * calculating the last 32 w[i] values in 8 XMM registers;
         * pre-calculate the K+w[i] values and store them to memory
         * for a later load by an ALU add instruction
         *
         * "brute force" vectorization for rounds 16-31 only,
         * due to the w[i]->w[i-3] dependency
         */
        .if ((i & 7) == 0)
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                /* w[i-14] */
                vpalignr $8, WY_minus_16, WY_minus_12, WY
                vpsrldq $4, WY_minus_04, WY_TMP         /* w[i-3] */
        .elseif ((i & 7) == 1)
                vpxor   WY_minus_08, WY, WY
                vpxor   WY_minus_16, WY_TMP, WY_TMP
        .elseif ((i & 7) == 2)
                vpxor   WY_TMP, WY, WY
                vpslldq $12, WY, WY_TMP2
        .elseif ((i & 7) == 3)
                vpslld  $1, WY, WY_TMP
                vpsrld  $31, WY, WY
        .elseif ((i & 7) == 4)
                vpor    WY, WY_TMP, WY_TMP
                vpslld  $2, WY_TMP2, WY
        .elseif ((i & 7) == 5)
                vpsrld  $30, WY_TMP2, WY_TMP2
                vpxor   WY, WY_TMP, WY_TMP
        .elseif ((i & 7) == 7)
                vpxor   WY_TMP2, WY_TMP, WY
                vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm

.macro PRECALC_32_79
        /*
         * in the SHA-1 specification:
         *   w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
         * we instead compute the equivalent:
         *   w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
         * which allows more efficient vectorization,
         * since the w[i]=>w[i-3] dependency is broken
         */
        .if ((i & 7) == 0)
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
        .elseif ((i & 7) == 1)
                /* W is W_minus_32 before xor */
                vpxor   WY_minus_28, WY, WY
        .elseif ((i & 7) == 2)
                vpxor   WY_minus_16, WY_TMP, WY_TMP
        .elseif ((i & 7) == 3)
                vpxor   WY_TMP, WY, WY
        .elseif ((i & 7) == 4)
                vpslld  $2, WY, WY_TMP
        .elseif ((i & 7) == 5)
                vpsrld  $30, WY, WY
                vpor    WY, WY_TMP, WY
        .elseif ((i & 7) == 7)
                vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm
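
/*
 * A scalar cross-check (illustrative, not part of the original file) of
 * the rearranged recurrence used above: it expands a message block both
 * ways and asserts the i >= 32 identity. rol32() and the calling
 * convention are assumptions of this sketch.
 *
 *   #include <assert.h>
 *   #include <stdint.h>
 *   static uint32_t rol32(uint32_t x, int n) {
 *     return x << n | x >> (32 - n);
 *   }
 *   void check(const uint32_t w0[16]) {
 *     uint32_t w[80];
 *     for (int i = 0; i < 16; ++i) w[i] = w0[i];
 *     for (int i = 16; i < 80; ++i)
 *       w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *     for (int i = 32; i < 80; ++i)
 *       assert(w[i] == rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2));
 *   }
 */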

.macro PRECALC r, s
        .set i, \r

        .if (i < 40)
                .set K_XMM, 32*0
        .elseif (i < 80)
                .set K_XMM, 32*1
        .elseif (i < 120)
                .set K_XMM, 32*2
        .else
                .set K_XMM, 32*3
        .endif

        .if (i<32)
                PRECALC_00_15   \s
        .elseif (i<64)
                PRECALC_16_31   \s
        .elseif (i < 160)
                PRECALC_32_79   \s
        .endif
.endm

.macro ROTATE_STATE
        .set T_REG, E
        .set E, D
        .set D, C
        .set C, B
        .set B, TB
        .set TB, A
        .set A, T_REG

        .set T_REG, RE
        .set RE, RD
        .set RD, RC
        .set RC, RB
        .set RB, RTB
        .set RTB, RA
        .set RA, T_REG
.endm

/* Macro relies on saved ROUND_Fx */
.macro RND_FUN f, r
        .if (\f == RND_F1)
                ROUND_F1        \r
        .elseif (\f == RND_F2)
                ROUND_F2        \r
        .elseif (\f == RND_F3)
                ROUND_F3        \r
        .endif
.endm

.macro RR r
        .set round_id, (\r % 80)

        .if (round_id == 0)     /* Precalculate F for first round */
                .set ROUND_FUNC, RND_F1
                mov     B, TB

                rorx    $(32-30), B, B  /* b>>>2 */
                andn    D, TB, T1
                and     C, TB
                xor     T1, TB
        .endif

        RND_FUN ROUND_FUNC, \r
        ROTATE_STATE

        .if (round_id == 18)
                .set ROUND_FUNC, RND_F2
        .elseif (round_id == 38)
                .set ROUND_FUNC, RND_F3
        .elseif (round_id == 58)
                .set ROUND_FUNC, RND_F2
        .endif

        .set round_id, ( (\r+1) % 80)

        RND_FUN ROUND_FUNC, (\r+1)
        ROTATE_STATE
.endm

.macro ROUND_F1 r
        add     WK(\r), E

        andn    C, A, T1                /* ~b&d */
        lea     (RE,RTB), E             /* Add F from the previous round */

        rorx    $(32-5), A, TA          /* T2 = A >>> 5 */
        rorx    $(32-30), A, TB         /* b>>>2 for next round */

        PRECALC (\r)                    /* msg scheduling for next 2 blocks */

        /*
         * Calculate F for the next round:
         * (b & c) ^ andn[b, d]
         */
        and     B, A                    /* b&c */
        xor     T1, A                   /* F1 = (b&c) ^ (~b&d) */

        lea     (RE,RTA), E             /* E += A >>> 5 */
.endm

.macro ROUND_F2 r
        add     WK(\r), E
        lea     (RE,RTB), E             /* Add F from the previous round */

        /* Calculate F for the next round */
        rorx    $(32-5), A, TA          /* T2 = A >>> 5 */
        .if ((round_id) < 79)
                rorx    $(32-30), A, TB /* b>>>2 for next round */
        .endif
        PRECALC (\r)                    /* msg scheduling for next 2 blocks */

        .if ((round_id) < 79)
                xor     B, A
        .endif

        add     TA, E                   /* E += A >>> 5 */

        .if ((round_id) < 79)
                xor     C, A
        .endif
.endm

.macro ROUND_F3 r
        add     WK(\r), E
        PRECALC (\r)                    /* msg scheduling for next 2 blocks */

        lea     (RE,RTB), E             /* Add F from the previous round */

        mov     B, T1
        or      A, T1

        rorx    $(32-5), A, TA          /* T2 = A >>> 5 */
        rorx    $(32-30), A, TB         /* b>>>2 for next round */

        /*
         * Calculate F for the next round:
         * (b and c) or (d and (b or c))
         */
        and     C, T1
        and     B, A
        or      T1, A

        add     TA, E                   /* E += A >>> 5 */
.endm

/*
 * Conditionally advance a buffer pointer (uses RTA as a temp):
 *   \a += (\b >= \c) ? \d : 0
 */
.macro ADD_IF_GE a, b, c, d
        mov     \a, RTA
        add     $\d, RTA
        cmp     $\c, \b
        cmovge  RTA, \a
.endm
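
/*
 * An informal C gloss of ADD_IF_GE (not from the original source):
 *
 *   if (b >= c) a += d;
 *
 * e.g. "ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64" below advances the
 * second-block pointer by 64 bytes only while at least two blocks
 * remain, keeping the two-block pipeline from reading past the input.
 */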

/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 */
.macro SHA1_PIPELINED_MAIN_BODY

        REGALLOC

        mov     (HASH_PTR), A
        mov     4(HASH_PTR), B
        mov     8(HASH_PTR), C
        mov     12(HASH_PTR), D
        mov     16(HASH_PTR), E

        mov     %rsp, PRECALC_BUF
        lea     (2*4*80+32)(%rsp), WK_BUF

        # Precalc WK for first 2 blocks
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
        .set i, 0
        .rept 160
                PRECALC i
                .set i, i + 1
        .endr

        /* Go to next block if needed */
        ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
        xchg    WK_BUF, PRECALC_BUF

        .align 32
.L_loop:
        /*
         * the code loops through more than one block;
         * BLOCKS_CTR reaching zero signals that the last block is done
         */
        test    BLOCKS_CTR, BLOCKS_CTR
        jnz     .L_begin
        .align 32
        jmp     .L_end
        .align 32
.L_begin:

        /*
         * Do first block
         * rounds: 0,2,4,6,8
         */
        .set j, 0
        .rept 5
                RR      j
                .set j, j+2
        .endr

        jmp     .L_loop0
.L_loop0:

        /*
         * rounds:
         * 10,12,14,16,18
         * 20,22,24,26,28
         * 30,32,34,36,38
         * 40,42,44,46,48
         * 50,52,54,56,58
         */
        .rept 25
                RR      j
                .set j, j+2
        .endr

        /* Update Counter */
        sub     $1, BLOCKS_CTR
        /* Move to the next block only if needed */
        ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
        /*
         * rounds:
         * 60,62,64,66,68
         * 70,72,74,76,78
         */
        .rept 10
                RR      j
                .set j, j+2
        .endr

        UPDATE_HASH (HASH_PTR), A
        UPDATE_HASH 4(HASH_PTR), TB
        UPDATE_HASH 8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        test    BLOCKS_CTR, BLOCKS_CTR
        jz      .L_loop

        mov     TB, B

        /* Process second block */
        /*
         * rounds:
         *  0+80,  2+80,  4+80,  6+80,  8+80
         * 10+80, 12+80, 14+80, 16+80, 18+80
         */
        .set j, 0
        .rept 10
                RR      j+80
                .set j, j+2
        .endr

        jmp     .L_loop1
.L_loop1:
        /*
         * rounds:
         * 20+80, 22+80, 24+80, 26+80, 28+80
         * 30+80, 32+80, 34+80, 36+80, 38+80
         */
        .rept 10
                RR      j+80
                .set j, j+2
        .endr

        jmp     .L_loop2
.L_loop2:

        /*
         * rounds:
         * 40+80, 42+80, 44+80, 46+80, 48+80
         * 50+80, 52+80, 54+80, 56+80, 58+80
         */
        .rept 10
                RR      j+80
                .set j, j+2
        .endr

        /* update counter */
        sub     $1, BLOCKS_CTR
        /* Move to the next block only if needed */
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

        jmp     .L_loop3
.L_loop3:

        /*
         * rounds:
         * 60+80, 62+80, 64+80, 66+80, 68+80
         * 70+80, 72+80, 74+80, 76+80, 78+80
         */
        .rept 10
                RR      j+80
                .set j, j+2
        .endr

        UPDATE_HASH (HASH_PTR), A
        UPDATE_HASH 4(HASH_PTR), TB
        UPDATE_HASH 8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        /* Reset state for AVX2 reg permutation */
        mov     A, TA
        mov     TB, A
        mov     C, TB
        mov     E, C
        mov     D, B
        mov     TA, D

        REGALLOC

        xchg    WK_BUF, PRECALC_BUF

        jmp     .L_loop

        .align 32
.L_end:

.endm

        .section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

        .align 128
K_XMM_AR:
        .long   K1, K1, K1, K1
        .long   K1, K1, K1, K1
        .long   K2, K2, K2, K2
        .long   K2, K2, K2, K2
        .long   K3, K3, K3, K3
        .long   K3, K3, K3, K3
        .long   K4, K4, K4, K4
        .long   K4, K4, K4, K4

BSWAP_SHUFB_CTL:
        .long   0x00010203
        .long   0x04050607
        .long   0x08090a0b
        .long   0x0c0d0e0f
        .long   0x00010203
        .long   0x04050607
        .long   0x08090a0b
        .long   0x0c0d0e0f
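
/*
 * As a vpshufb control, the mask above reverses the bytes within each
 * 32-bit lane, i.e. the big-endian loads SHA-1 requires. A scalar model
 * of the same step (illustrative only):
 *
 *   #include <stdint.h>
 *   static uint32_t load_be32(const uint8_t *p) {
 *     return (uint32_t)p[0] << 24 | (uint32_t)p[1] << 16 |
 *            (uint32_t)p[2] << 8  | (uint32_t)p[3];
 *   }
 */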

        .text
sha1_transform_avx2:
        push    %rbx
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        RESERVE_STACK = (W_SIZE*4 + 8+24)

        /* Align stack */
        mov     %rsp, %rbx
        and     $~(0x20-1), %rsp
        push    %rbx
        sub     $RESERVE_STACK, %rsp

        avx2_zeroupper

        /* Setup initial values */
        mov     CTX, HASH_PTR
        mov     BUF, BUFFER_PTR

        mov     BUF, BUFFER_PTR2
        mov     CNT, BLOCKS_CTR

        xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

        SHA1_PIPELINED_MAIN_BODY

        avx2_zeroupper

        add     $RESERVE_STACK, %rsp
        pop     %rsp

        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbx
        ret
        .endfn  sha1_transform_avx2,globl
769 libc/nexgen32e/sha256.S Normal file

@@ -0,0 +1,769 @@
/////////////////////////////////////////////////////////////////////////
// Implement fast SHA-256 with AVX2 instructions. (x86_64)
//
// Copyright (C) 2013 Intel Corporation.
//
// Authors:
//     James Guilford <james.guilford@intel.com>
//     Kirk Yap <kirk.s.yap@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
//     Redistribution and use in source and binary forms, with or
//     without modification, are permitted provided that the following
//     conditions are met:
//
//      - Redistributions of source code must retain the above
//        copyright notice, this list of conditions and the following
//        disclaimer.
//
//      - Redistributions in binary form must reproduce the above
//        copyright notice, this list of conditions and the following
//        disclaimer in the documentation and/or other materials
//        provided with the distribution.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
/////////////////////////////////////////////////////////////////////////
//
// This code is described in an Intel White-Paper:
// "Fast SHA-256 Implementations on Intel Architecture Processors"
//
// To find it, surf to http://www.intel.com/p/en_US/embedded
// and search for that title.
//
/////////////////////////////////////////////////////////////////////////
// This code schedules 2 blocks at a time, with 4 lanes per block
/////////////////////////////////////////////////////////////////////////
#include "libc/macros.internal.h"

        .ident "\n\
AVX2 SHA-256 (BSD-2 License)\n\
Copyright 2013 Intel Corporation\n"
        .include "libc/disclaimer.inc"

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
        add     \p1, \p2
        mov     \p2, \p1
.endm

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10              # shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12              # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13       # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx                 # 3rd arg
INP = %rsi                      # 2nd arg
CTX = %rdi                      # 1st arg
c = %ecx
d = %r8d
e = %edx                        # clobbers NUM_BLKS
y3 = %esi                       # clobbers INP

SRND = CTX                      # SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d

_XFER_SIZE      = 2*64*4        # 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE  = 0
_INP_END_SIZE   = 8
_INP_SIZE       = 8
_CTX_SIZE       = 8
_RSP_SIZE       = 8

_XFER           = 0
_XMM_SAVE       = _XFER     + _XFER_SIZE
_INP_END        = _XMM_SAVE + _XMM_SAVE_SIZE
_INP            = _INP_END  + _INP_END_SIZE
_CTX            = _INP      + _INP_SIZE
_RSP            = _CTX      + _CTX_SIZE
STACK_SIZE      = _RSP      + _RSP_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
        X_ = X0
        X0 = X1
        X1 = X2
        X2 = X3
        X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
        old_h = h
        TMP_ = h
        h = g
        g = f
        f = e
        e = d
        d = c
        c = b
        b = a
        a = TMP_
.endm

.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B

        addl    \disp(%rsp, SRND), h    # h = k + w + h                 # --
        or      c, y3           # y3 = a|c                              # MAJA
        vpalignr $4, X2, X3, XTMP0      # XTMP0 = W[-7]
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH
        vpaddd  X0, XTMP0, XTMP0        # XTMP0 = W[-7] + W[-16]
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1

        and     e, y2           # y2 = (f^g)&e                          # CH
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        add     h, d            # d = k + w + h + d                     # --

        and     b, y3           # y3 = (a|c)&b                          # MAJA
        vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0

        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        vpsrld  $7, XTMP1, XTMP2
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB

        add     y0, y2          # y2 = S1 + CH                          # --
        vpslld  $(32-7), XTMP1, XTMP3
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] ror 7

        vpsrld  $18, XTMP1, XTMP2
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --

        ROTATE_ARGS

################################### RND N + 1 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        offset = \disp + 1*4
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --
        or      c, y3           # y3 = a|c                              # MAJA

        vpsrld  $3, XTMP1, XTMP4        # XTMP4 = W[-15] >> 3
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH

        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     h, d            # d = k + w + h + d                     # --

        vpslld  $(32-18), XTMP1, XTMP1
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0

        vpxor   XTMP1, XTMP3, XTMP3
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --

        vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
        vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --

        vpsrld  $10, XTMP2, XTMP4       # XTMP4 = W[-2] >> 10 {BBAA}

        ROTATE_ARGS

################################### RND N + 2 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        offset = \disp + 2*4
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --

        vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] ror 19 {xBxA}
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        or      c, y3           # y3 = a|c                              # MAJA
        mov     f, y2           # y2 = f                                # CH
        xor     g, y2           # y2 = f^g                              # CH

        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] ror 17 {xBxA}
        and     e, y2           # y2 = (f^g)&e                          # CH

        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        vpxor   XTMP3, XTMP2, XTMP2
        add     h, d            # d = k + w + h + d                     # --
        and     b, y3           # y3 = (a|c)&b                          # MAJA

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --
        vpshufd $0b01010000, XTMP0, XTMP2       # XTMP2 = W[-2] {DDCC}

        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --

        add     y3, h           # h = t1 + S0 + MAJ                     # --

        ROTATE_ARGS

################################### RND N + 3 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        offset = \disp + 3*4
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --
        or      c, y3           # y3 = a|c                              # MAJA

        vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH

        vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] ror 19 {xDxC}
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     h, d            # d = k + w + h + d                     # --
        and     b, y3           # y3 = (a|c)&b                          # MAJA

        vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] ror 17 {xDxC}
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpxor   XTMP3, XTMP2, XTMP2
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        add     y0, y2          # y2 = S1 + CH                          # --

        vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}

        vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --

        ROTATE_ARGS
        rotate_Xs
.endm
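
# For reference (FIPS 180-4; not part of the original file), the scalar
# functions the S0/S1/CH/MAJ and s0/s1 comments above abbreviate:
#
#   #include <stdint.h>
#   static uint32_t ror(uint32_t x, int n) { return x >> n | x << (32 - n); }
#   static uint32_t S1(uint32_t e) { return ror(e, 6) ^ ror(e, 11) ^ ror(e, 25); }
#   static uint32_t S0(uint32_t a) { return ror(a, 2) ^ ror(a, 13) ^ ror(a, 22); }
#   static uint32_t CH(uint32_t e, uint32_t f, uint32_t g) {
#     return ((f ^ g) & e) ^ g;             /* == (e&f) ^ (~e&g) */
#   }
#   static uint32_t MAJ(uint32_t a, uint32_t b, uint32_t c) {
#     return ((a | c) & b) | (a & c);       /* form computed above */
#   }
#   static uint32_t s0(uint32_t w) { return ror(w, 7) ^ ror(w, 18) ^ (w >> 3); }
#   static uint32_t s1(uint32_t w) { return ror(w, 17) ^ ror(w, 19) ^ (w >> 10); }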

.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        addl    \disp(%rsp, SRND), h    # h = k + w + h                 # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --

        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 1 ###########################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*1 + \disp
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --

        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 2 ##############################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*2 + \disp
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --

        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 3 ###########################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*3 + \disp
        addl    offset(%rsp, SRND), h   # h = k + w + h                 # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --

        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --

        add     y3, h           # h = t1 + S0 + MAJ                     # --

        ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
        .text
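# A small usage sketch (not from the original file), assuming a state
# whose first eight 32-bit words hold the running digest:
#
#   #include <stdint.h>
#   struct sha256_state { uint32_t h[8]; };  /* layout assumed */
#   void sha256_transform_rorx(struct sha256_state *, const uint8_t *, int);
#
#   void sha256_blocks(struct sha256_state *s, const uint8_t *p, int n) {
#     if (n > 0) sha256_transform_rorx(s, p, n);  /* n 64-byte blocks */
#   }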
sha256_transform_rorx:
        .align 32
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15

        mov     %rsp, %rax
        subq    $STACK_SIZE, %rsp
        and     $-32, %rsp              # align rsp to 32 byte boundary
        mov     %rax, _RSP(%rsp)

        shl     $6, NUM_BLKS            # convert to bytes
        jz      .Ldone_hash
        lea     -64(INP, NUM_BLKS), NUM_BLKS    # pointer to last block
        mov     NUM_BLKS, _INP_END(%rsp)

        cmp     NUM_BLKS, INP
        je      .Lonly_one_block

        ## load initial digest
        mov     (CTX), a
        mov     4*1(CTX), b
        mov     4*2(CTX), c
        mov     4*3(CTX), d
        mov     4*4(CTX), e
        mov     4*5(CTX), f
        mov     4*6(CTX), g
        mov     4*7(CTX), h

        vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa _SHUF_DC00(%rip), SHUF_DC00

        mov     CTX, _CTX(%rsp)

.Loop0:
        ## Load first 16 dwords from two blocks
        VMOVDQ  0*32(INP), XTMP0
        VMOVDQ  1*32(INP), XTMP1
        VMOVDQ  2*32(INP), XTMP2
        VMOVDQ  3*32(INP), XTMP3

        ## byte swap data
        vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
        vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
        vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
        vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3

        ## transpose data into high/low halves
        vperm2i128 $0x20, XTMP2, XTMP0, X0
        vperm2i128 $0x31, XTMP2, XTMP0, X1
        vperm2i128 $0x20, XTMP3, XTMP1, X2
        vperm2i128 $0x31, XTMP3, XTMP1, X3

.Llast_block_enter:
        add     $64, INP
        mov     INP, _INP(%rsp)

        ## schedule 48 input dwords, by doing 3 rounds of 12 each
        xor     SRND, SRND

        .align 16
.Loop1:
        vpaddd  K256+0*32(SRND), X0, XFER
        vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 0*32

        vpaddd  K256+1*32(SRND), X0, XFER
        vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 1*32

        vpaddd  K256+2*32(SRND), X0, XFER
        vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 2*32

        vpaddd  K256+3*32(SRND), X0, XFER
        vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 3*32

        add     $4*32, SRND
        cmp     $3*4*32, SRND
        jb      .Loop1

.Loop2:
        ## Do last 16 rounds with no scheduling
        vpaddd  K256+0*32(SRND), X0, XFER
        vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
        DO_4ROUNDS      _XFER + 0*32

        vpaddd  K256+1*32(SRND), X1, XFER
        vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
        DO_4ROUNDS      _XFER + 1*32
        add     $2*32, SRND

        vmovdqa X2, X0
        vmovdqa X3, X1

        cmp     $4*4*32, SRND
        jb      .Loop2

        mov     _CTX(%rsp), CTX
        mov     _INP(%rsp), INP

        addm    (4*0)(CTX), a
        addm    (4*1)(CTX), b
        addm    (4*2)(CTX), c
        addm    (4*3)(CTX), d
        addm    (4*4)(CTX), e
        addm    (4*5)(CTX), f
        addm    (4*6)(CTX), g
        addm    (4*7)(CTX), h

        cmp     _INP_END(%rsp), INP
        ja      .Ldone_hash

        #### Do second block using previously scheduled results
        xor     SRND, SRND
        .align 16
.Loop3:
        DO_4ROUNDS      _XFER + 0*32 + 16
        DO_4ROUNDS      _XFER + 1*32 + 16
        add     $2*32, SRND
        cmp     $4*4*32, SRND
        jb      .Loop3

        mov     _CTX(%rsp), CTX
        mov     _INP(%rsp), INP
        add     $64, INP

        addm    (4*0)(CTX), a
        addm    (4*1)(CTX), b
        addm    (4*2)(CTX), c
        addm    (4*3)(CTX), d
        addm    (4*4)(CTX), e
        addm    (4*5)(CTX), f
        addm    (4*6)(CTX), g
        addm    (4*7)(CTX), h

        cmp     _INP_END(%rsp), INP
        jb      .Loop0
        ja      .Ldone_hash

.Ldo_last_block:
        VMOVDQ  0*16(INP), XWORD0
        VMOVDQ  1*16(INP), XWORD1
        VMOVDQ  2*16(INP), XWORD2
        VMOVDQ  3*16(INP), XWORD3

        vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
        vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
        vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
        vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3

        jmp     .Llast_block_enter

.Lonly_one_block:

        ## load initial digest
        mov     (4*0)(CTX), a
        mov     (4*1)(CTX), b
        mov     (4*2)(CTX), c
        mov     (4*3)(CTX), d
        mov     (4*4)(CTX), e
        mov     (4*5)(CTX), f
        mov     (4*6)(CTX), g
        mov     (4*7)(CTX), h

        vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa _SHUF_DC00(%rip), SHUF_DC00

        mov     CTX, _CTX(%rsp)
        jmp     .Ldo_last_block

.Ldone_hash:

        mov     _RSP(%rsp), %rsp

        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        ret
        .endfn  sha256_transform_rorx,globl

        .section .rodata.cst512.K256, "aM", @progbits, 512
        .align 64
K256:
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

        .rodata.cst32
PSHUFFLE_BYTE_FLIP_MASK:
        .octa   0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
        .rodata.cst32
_SHUF_00BA:
        .octa   0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
        .rodata.cst32
_SHUF_DC00:
        .octa   0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
750 libc/nexgen32e/sha512.S Normal file

@@ -0,0 +1,750 @@
/////////////////////////////////////////////////////////////////////////
// Implement fast SHA-512 with AVX2 instructions. (x86_64)
//
// Copyright (C) 2013 Intel Corporation.
//
// Authors:
//     James Guilford <james.guilford@intel.com>
//     Kirk Yap <kirk.s.yap@intel.com>
//     David Cote <david.m.cote@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
//     Redistribution and use in source and binary forms, with or
//     without modification, are permitted provided that the following
//     conditions are met:
//
//      - Redistributions of source code must retain the above
//        copyright notice, this list of conditions and the following
//        disclaimer.
//
//      - Redistributions in binary form must reproduce the above
//        copyright notice, this list of conditions and the following
//        disclaimer in the documentation and/or other materials
//        provided with the distribution.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
/////////////////////////////////////////////////////////////////////////
//
// This code is described in an Intel White-Paper:
// "Fast SHA-512 Implementations on Intel Architecture Processors"
//
// To find it, surf to http://www.intel.com/p/en_US/embedded
// and search for that title.
//
/////////////////////////////////////////////////////////////////////////
// This code schedules 1 block at a time, with 4 lanes per block
/////////////////////////////////////////////////////////////////////////
#include "libc/macros.internal.h"

        .ident "\n\
AVX2 SHA-512 (BSD-2 License)\n\
Copyright 2013 Intel Corporation\n"
        .include "libc/disclaimer.inc"

# Virtual Registers
Y_0 = %ymm4
Y_1 = %ymm5
Y_2 = %ymm6
Y_3 = %ymm7

YTMP0 = %ymm0
YTMP1 = %ymm1
YTMP2 = %ymm2
YTMP3 = %ymm3
YTMP4 = %ymm8
XFER = YTMP0

BYTE_FLIP_MASK = %ymm9

# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
CTX1 = %rdi
CTX2 = %r12
# 2nd arg
INP = %rsi
# 3rd arg
NUM_BLKS = %rdx

c = %rcx
d = %r8
e = %rdx
y3 = %rsi

TBL = %rdi                      # clobbers CTX1

a = %rax
b = %rbx

f = %r9
g = %r10
h = %r11
old_h = %r11

T1 = %r12                       # clobbers CTX2
y0 = %r13
y1 = %r14
y2 = %r15

# Local variables (stack frame)
XFER_SIZE    = 4*8
SRND_SIZE    = 1*8
INP_SIZE     = 1*8
INPEND_SIZE  = 1*8
CTX_SIZE     = 1*8
RSPSAVE_SIZE = 1*8
GPRSAVE_SIZE = 5*8

frame_XFER    = 0
frame_SRND    = frame_XFER + XFER_SIZE
frame_INP     = frame_SRND + SRND_SIZE
frame_INPEND  = frame_INP + INP_SIZE
frame_CTX     = frame_INPEND + INPEND_SIZE
frame_RSPSAVE = frame_CTX + CTX_SIZE
frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
frame_size    = frame_GPRSAVE + GPRSAVE_SIZE

## assume buffers not aligned
#define VMOVDQ vmovdqu

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
        add     \p1, \p2
        mov     \p2, \p1
.endm

# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
# Load ymm with mem and byte swap each quad word
.macro COPY_YMM_AND_BSWAP p1 p2 p3
        VMOVDQ  \p2, \p1
        vpshufb \p3, \p1, \p1
.endm

# rotate_Ys
# Rotate values of symbols Y0...Y3
.macro rotate_Ys
        Y_ = Y_0
        Y_0 = Y_1
        Y_1 = Y_2
        Y_2 = Y_3
        Y_3 = Y_
.endm

# RotateState
.macro RotateState
        # Rotate symbols a..h right
        old_h = h
        TMP_ = h
        h = g
        g = f
        f = e
        e = d
        d = c
        c = b
        b = a
        a = TMP_
.endm

# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
# YDST = {YSRC1, YSRC2} >> RVAL*8
.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
        vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST  # YDST = {YS1_LO, YS2_HI}
        vpalignr $\RVAL, \YSRC2, \YDST, \YDST   # YDST = {YS1, YS2} >> RVAL*8
.endm
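
# For reference (FIPS 180-4; not part of the original file), the scalar
# message schedule that FOUR_ROUNDS_AND_SCHED below vectorizes:
#
#   #include <stdint.h>
#   static uint64_t ror64(uint64_t x, int n) { return x >> n | x << (64 - n); }
#   static uint64_t s0(uint64_t w) {          /* sigma0 */
#     return ror64(w, 1) ^ ror64(w, 8) ^ (w >> 7);
#   }
#   static uint64_t s1(uint64_t w) {          /* sigma1 */
#     return ror64(w, 19) ^ ror64(w, 61) ^ (w >> 6);
#   }
#   /* w[t] = w[t-16] + s0(w[t-15]) + w[t-7] + s1(w[t-2]) */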

.macro FOUR_ROUNDS_AND_SCHED
################################### RND N + 0 #########################################

        # Extract w[t-7]
        MY_VPALIGNR YTMP0, Y_3, Y_2, 8          # YTMP0 = W[-7]
        # Calculate w[t-16] + w[t-7]
        vpaddq  Y_0, YTMP0, YTMP0               # YTMP0 = W[-7] + W[-16]
        # Extract w[t-15]
        MY_VPALIGNR YTMP1, Y_1, Y_0, 8          # YTMP1 = W[-15]

        # Calculate sigma0

        # Calculate w[t-15] ror 1
        vpsrlq  $1, YTMP1, YTMP2
        vpsllq  $(64-1), YTMP1, YTMP3
        vpor    YTMP2, YTMP3, YTMP3             # YTMP3 = W[-15] ror 1
        # Calculate w[t-15] shr 7
        vpsrlq  $7, YTMP1, YTMP4                # YTMP4 = W[-15] >> 7

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $41, e, y0      # y0 = e >> 41                          # S1A
        rorx    $18, e, y1      # y1 = e >> 18                          # S1B
        add     frame_XFER(%rsp), h     # h = k + w + h                 # --
        or      c, y3           # y3 = a|c                              # MAJA
        mov     f, y2           # y2 = f                                # CH
        rorx    $34, a, T1      # T1 = a >> 34                          # S0B

        xor     y1, y0          # y0 = (e>>41) ^ (e>>18)                # S1
        xor     g, y2           # y2 = f^g                              # CH
        rorx    $14, e, y1      # y1 = (e >> 14)                        # S1

        and     e, y2           # y2 = (f^g)&e                          # CH
        xor     y1, y0          # y0 = (e>>41) ^ (e>>18) ^ (e>>14)      # S1
        rorx    $39, a, y1      # y1 = a >> 39                          # S0A
        add     h, d            # d = k + w + h + d                     # --

        and     b, y3           # y3 = (a|c)&b                          # MAJA
        xor     T1, y1          # y1 = (a>>39) ^ (a>>34)                # S0
        rorx    $28, a, T1      # T1 = (a >> 28)                        # S0

        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        xor     T1, y1          # y1 = (a>>39) ^ (a>>34) ^ (a>>28)      # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB

        add     y0, y2          # y2 = S1 + CH                          # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --

        RotateState

################################### RND N + 1 #########################################

        # Calculate w[t-15] ror 8
        vpsrlq  $8, YTMP1, YTMP2
        vpsllq  $(64-8), YTMP1, YTMP1
        vpor    YTMP2, YTMP1, YTMP1             # YTMP1 = W[-15] ror 8
        # XOR the three components
        vpxor   YTMP4, YTMP3, YTMP3             # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
        vpxor   YTMP1, YTMP3, YTMP1             # YTMP1 = s0
|
||||||
|
|
||||||
|
|
||||||
|
# Add three components, w[t-16], w[t-7] and sigma0
|
||||||
|
vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
|
||||||
|
# Move to appropriate lanes for calculating w[16] and w[17]
|
||||||
|
vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
|
||||||
|
# Move to appropriate lanes for calculating w[18] and w[19]
|
||||||
|
vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
|
||||||
|
|
||||||
|
# Calculate w[16] and w[17] in both 128 bit lanes
|
||||||
|
|
||||||
|
# Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
|
||||||
|
vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
|
||||||
|
vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
|
||||||
|
|
||||||
|
|
||||||
|
mov a, y3 # y3 = a # MAJA
|
||||||
|
rorx $41, e, y0 # y0 = e >> 41 # S1A
|
||||||
|
rorx $18, e, y1 # y1 = e >> 18 # S1B
|
||||||
|
add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
|
||||||
|
or c, y3 # y3 = a|c # MAJA
|
||||||
|
|
||||||
|
|
||||||
|
mov f, y2 # y2 = f # CH
|
||||||
|
rorx $34, a, T1 # T1 = a >> 34 # S0B
|
||||||
|
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
|
||||||
|
xor g, y2 # y2 = f^g # CH
|
||||||
|
|
||||||
|
|
||||||
|
rorx $14, e, y1 # y1 = (e >> 14) # S1
|
||||||
|
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
|
||||||
|
rorx $39, a, y1 # y1 = a >> 39 # S0A
|
||||||
|
and e, y2 # y2 = (f^g)&e # CH
|
||||||
|
add h, d # d = k + w + h + d # --
|
||||||
|
|
||||||
|
and b, y3 # y3 = (a|c)&b # MAJA
|
||||||
|
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
|
||||||
|
|
||||||
|
rorx $28, a, T1 # T1 = (a >> 28) # S0
|
||||||
|
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||||
|
|
||||||
|
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
|
||||||
|
mov a, T1 # T1 = a # MAJB
|
||||||
|
and c, T1 # T1 = a&c # MAJB
|
||||||
|
add y0, y2 # y2 = S1 + CH # --
|
||||||
|
|
||||||
|
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||||
|
add y1, h # h = k + w + h + S0 # --
|
||||||
|
|
||||||
|
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||||
|
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||||
|
add y3, h # h = t1 + S0 + MAJ # --
|
||||||
|
|
||||||
|
RotateState
|
||||||
|
|
||||||
|
|
||||||
|
################################### RND N + 2 #########################################
|
||||||
|
|
||||||
|
vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
|
||||||
|
vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA}
|
||||||
|
vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
|
||||||
|
vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
|
||||||
|
vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
|
||||||
|
vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA}
|
||||||
|
vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
|
||||||
|
vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
|
||||||
|
# (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
|
||||||
|
|
||||||
|
# Add sigma1 to the other compunents to get w[16] and w[17]
|
||||||
|
vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]}
|
||||||
|
|
||||||
|
# Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
|
||||||
|
vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
|
||||||
|
|
||||||
|
mov a, y3 # y3 = a # MAJA
|
||||||
|
rorx $41, e, y0 # y0 = e >> 41 # S1A
|
||||||
|
add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
|
||||||
|
|
||||||
|
rorx $18, e, y1 # y1 = e >> 18 # S1B
|
||||||
|
or c, y3 # y3 = a|c # MAJA
|
||||||
|
mov f, y2 # y2 = f # CH
|
||||||
|
xor g, y2 # y2 = f^g # CH
|
||||||
|
|
||||||
|
rorx $34, a, T1 # T1 = a >> 34 # S0B
|
||||||
|
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
|
||||||
|
and e, y2 # y2 = (f^g)&e # CH
|
||||||
|
|
||||||
|
rorx $14, e, y1 # y1 = (e >> 14) # S1
|
||||||
|
add h, d # d = k + w + h + d # --
|
||||||
|
and b, y3 # y3 = (a|c)&b # MAJA
|
||||||
|
|
||||||
|
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
|
||||||
|
rorx $39, a, y1 # y1 = a >> 39 # S0A
|
||||||
|
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||||
|
|
||||||
|
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
|
||||||
|
rorx $28, a, T1 # T1 = (a >> 28) # S0
|
||||||
|
|
||||||
|
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
|
||||||
|
mov a, T1 # T1 = a # MAJB
|
||||||
|
and c, T1 # T1 = a&c # MAJB
|
||||||
|
add y0, y2 # y2 = S1 + CH # --
|
||||||
|
|
||||||
|
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||||
|
add y1, h # h = k + w + h + S0 # --
|
||||||
|
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||||
|
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||||
|
|
||||||
|
add y3, h # h = t1 + S0 + MAJ # --
|
||||||
|
|
||||||
|
RotateState
|
||||||
|
|
||||||
|
################################### RND N + 3 #########################################
|
||||||
|
|
||||||
|
vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
|
||||||
|
vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
|
||||||
|
vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
|
||||||
|
vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
|
||||||
|
vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
|
||||||
|
vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
|
||||||
|
vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
|
||||||
|
vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
|
||||||
|
# (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
|
||||||
|
|
||||||
|
# Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
|
||||||
|
# to newly calculated sigma1 to get w[18] and w[19]
|
||||||
|
vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
|
||||||
|
|
||||||
|
# Form w[19, w[18], w17], w[16]
|
||||||
|
vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
|
||||||
|
|
||||||
|
mov a, y3 # y3 = a # MAJA
|
||||||
|
rorx $41, e, y0 # y0 = e >> 41 # S1A
|
||||||
|
rorx $18, e, y1 # y1 = e >> 18 # S1B
|
||||||
|
add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
|
||||||
|
or c, y3 # y3 = a|c # MAJA
|
||||||
|
|
||||||
|
|
||||||
|
mov f, y2 # y2 = f # CH
|
||||||
|
rorx $34, a, T1 # T1 = a >> 34 # S0B
|
||||||
|
xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
|
||||||
|
xor g, y2 # y2 = f^g # CH
|
||||||
|
|
||||||
|
|
||||||
|
rorx $14, e, y1 # y1 = (e >> 14) # S1
|
||||||
|
and e, y2 # y2 = (f^g)&e # CH
|
||||||
|
add h, d # d = k + w + h + d # --
|
||||||
|
and b, y3 # y3 = (a|c)&b # MAJA
|
||||||
|
|
||||||
|
xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
|
||||||
|
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
|
||||||
|
|
||||||
|
rorx $39, a, y1 # y1 = a >> 39 # S0A
|
||||||
|
add y0, y2 # y2 = S1 + CH # --
|
||||||
|
|
||||||
|
xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
|
||||||
|
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
|
||||||
|
|
||||||
|
rorx $28, a, T1 # T1 = (a >> 28) # S0
|
||||||
|
|
||||||
|
xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
|
||||||
|
mov a, T1 # T1 = a # MAJB
|
||||||
|
and c, T1 # T1 = a&c # MAJB
|
||||||
|
or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
|
||||||
|
|
||||||
|
add y1, h # h = k + w + h + S0 # --
|
||||||
|
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
|
||||||
|
add y3, h # h = t1 + S0 + MAJ # --
|
||||||
|
|
||||||
|
RotateState
|
||||||
|
|
||||||
|
rotate_Ys
|
||||||
|
.endm
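
# Note: each FOUR_ROUNDS_AND_SCHED expansion interleaves the AVX2 message
# schedule for the next four W values with four scalar rounds, so the
# vpsrlq/vpsllq/vpxor chains can retire while the rorx/add round logic
# keeps the scalar ports busy.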

.macro DO_4ROUNDS

################################### RND N + 0 #########################################

	mov	f, y2			# y2 = f                               # CH
	rorx	$41, e, y0		# y0 = e >> 41                         # S1A
	rorx	$18, e, y1		# y1 = e >> 18                         # S1B
	xor	g, y2			# y2 = f^g                             # CH

	xor	y1, y0			# y0 = (e>>41) ^ (e>>18)               # S1
	rorx	$14, e, y1		# y1 = (e >> 14)                       # S1
	and	e, y2			# y2 = (f^g)&e                         # CH

	xor	y1, y0			# y0 = (e>>41) ^ (e>>18) ^ (e>>14)     # S1
	rorx	$34, a, T1		# T1 = a >> 34                         # S0B
	xor	g, y2			# y2 = CH = ((f^g)&e)^g                # CH
	rorx	$39, a, y1		# y1 = a >> 39                         # S0A
	mov	a, y3			# y3 = a                               # MAJA

	xor	T1, y1			# y1 = (a>>39) ^ (a>>34)               # S0
	rorx	$28, a, T1		# T1 = (a >> 28)                       # S0
	add	frame_XFER(%rsp), h	# h = k + w + h                        # --
	or	c, y3			# y3 = a|c                             # MAJA

	xor	T1, y1			# y1 = (a>>39) ^ (a>>34) ^ (a>>28)     # S0
	mov	a, T1			# T1 = a                               # MAJB
	and	b, y3			# y3 = (a|c)&b                         # MAJA
	and	c, T1			# T1 = a&c                             # MAJB
	add	y0, y2			# y2 = S1 + CH                         # --

	add	h, d			# d = k + w + h + d                    # --
	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)           # MAJ
	add	y1, h			# h = k + w + h + S0                   # --

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1 # --

	RotateState

################################### RND N + 1 #########################################

	add	y2, old_h		# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov	f, y2			# y2 = f                               # CH
	rorx	$41, e, y0		# y0 = e >> 41                         # S1A
	rorx	$18, e, y1		# y1 = e >> 18                         # S1B
	xor	g, y2			# y2 = f^g                             # CH

	xor	y1, y0			# y0 = (e>>41) ^ (e>>18)               # S1
	rorx	$14, e, y1		# y1 = (e >> 14)                       # S1
	and	e, y2			# y2 = (f^g)&e                         # CH
	add	y3, old_h		# h = t1 + S0 + MAJ                    # --

	xor	y1, y0			# y0 = (e>>41) ^ (e>>18) ^ (e>>14)     # S1
	rorx	$34, a, T1		# T1 = a >> 34                         # S0B
	xor	g, y2			# y2 = CH = ((f^g)&e)^g                # CH
	rorx	$39, a, y1		# y1 = a >> 39                         # S0A
	mov	a, y3			# y3 = a                               # MAJA

	xor	T1, y1			# y1 = (a>>39) ^ (a>>34)               # S0
	rorx	$28, a, T1		# T1 = (a >> 28)                       # S0
	add	8*1+frame_XFER(%rsp), h	# h = k + w + h                        # --
	or	c, y3			# y3 = a|c                             # MAJA

	xor	T1, y1			# y1 = (a>>39) ^ (a>>34) ^ (a>>28)     # S0
	mov	a, T1			# T1 = a                               # MAJB
	and	b, y3			# y3 = (a|c)&b                         # MAJA
	and	c, T1			# T1 = a&c                             # MAJB
	add	y0, y2			# y2 = S1 + CH                         # --

	add	h, d			# d = k + w + h + d                    # --
	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)           # MAJ
	add	y1, h			# h = k + w + h + S0                   # --

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1 # --

	RotateState

################################### RND N + 2 #########################################

	add	y2, old_h		# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov	f, y2			# y2 = f                               # CH
	rorx	$41, e, y0		# y0 = e >> 41                         # S1A
	rorx	$18, e, y1		# y1 = e >> 18                         # S1B
	xor	g, y2			# y2 = f^g                             # CH

	xor	y1, y0			# y0 = (e>>41) ^ (e>>18)               # S1
	rorx	$14, e, y1		# y1 = (e >> 14)                       # S1
	and	e, y2			# y2 = (f^g)&e                         # CH
	add	y3, old_h		# h = t1 + S0 + MAJ                    # --

	xor	y1, y0			# y0 = (e>>41) ^ (e>>18) ^ (e>>14)     # S1
	rorx	$34, a, T1		# T1 = a >> 34                         # S0B
	xor	g, y2			# y2 = CH = ((f^g)&e)^g                # CH
	rorx	$39, a, y1		# y1 = a >> 39                         # S0A
	mov	a, y3			# y3 = a                               # MAJA

	xor	T1, y1			# y1 = (a>>39) ^ (a>>34)               # S0
	rorx	$28, a, T1		# T1 = (a >> 28)                       # S0
	add	8*2+frame_XFER(%rsp), h	# h = k + w + h                        # --
	or	c, y3			# y3 = a|c                             # MAJA

	xor	T1, y1			# y1 = (a>>39) ^ (a>>34) ^ (a>>28)     # S0
	mov	a, T1			# T1 = a                               # MAJB
	and	b, y3			# y3 = (a|c)&b                         # MAJA
	and	c, T1			# T1 = a&c                             # MAJB
	add	y0, y2			# y2 = S1 + CH                         # --

	add	h, d			# d = k + w + h + d                    # --
	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)           # MAJ
	add	y1, h			# h = k + w + h + S0                   # --

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1 # --

	RotateState

################################### RND N + 3 #########################################

	add	y2, old_h		# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov	f, y2			# y2 = f                               # CH
	rorx	$41, e, y0		# y0 = e >> 41                         # S1A
	rorx	$18, e, y1		# y1 = e >> 18                         # S1B
	xor	g, y2			# y2 = f^g                             # CH

	xor	y1, y0			# y0 = (e>>41) ^ (e>>18)               # S1
	rorx	$14, e, y1		# y1 = (e >> 14)                       # S1
	and	e, y2			# y2 = (f^g)&e                         # CH
	add	y3, old_h		# h = t1 + S0 + MAJ                    # --

	xor	y1, y0			# y0 = (e>>41) ^ (e>>18) ^ (e>>14)     # S1
	rorx	$34, a, T1		# T1 = a >> 34                         # S0B
	xor	g, y2			# y2 = CH = ((f^g)&e)^g                # CH
	rorx	$39, a, y1		# y1 = a >> 39                         # S0A
	mov	a, y3			# y3 = a                               # MAJA

	xor	T1, y1			# y1 = (a>>39) ^ (a>>34)               # S0
	rorx	$28, a, T1		# T1 = (a >> 28)                       # S0
	add	8*3+frame_XFER(%rsp), h	# h = k + w + h                        # --
	or	c, y3			# y3 = a|c                             # MAJA

	xor	T1, y1			# y1 = (a>>39) ^ (a>>34) ^ (a>>28)     # S0
	mov	a, T1			# T1 = a                               # MAJB
	and	b, y3			# y3 = (a|c)&b                         # MAJA
	and	c, T1			# T1 = a&c                             # MAJB
	add	y0, y2			# y2 = S1 + CH                         # --

	add	h, d			# d = k + w + h + d                    # --
	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)           # MAJ
	add	y1, h			# h = k + w + h + S0                   # --

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1 # --

	add	y2, h			# h = k + w + h + S0 + S1 + CH = t1 + S0 # --

	add	y3, h			# h = t1 + S0 + MAJ                    # --

	RotateState

.endm
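
# DO_4ROUNDS runs the round function without any message scheduling; the
# main loop below uses it for the last 16 of the 80 rounds, once every W
# value has already been computed and parked in Y_0..Y_3.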

########################################################################
# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks)
# Purpose: Updates the SHA512 digest stored at "state" with the message
# stored in "data".
# The size of the message pointed to by "data" must be an integer multiple
# of SHA512 message blocks.
# "blocks" is the message length in SHA512 blocks
########################################################################
sha512_transform_rorx:
	# Allocate Stack Space
	mov	%rsp, %rax
	sub	$frame_size, %rsp
	and	$~(0x20 - 1), %rsp
	mov	%rax, frame_RSPSAVE(%rsp)
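	# The frame is aligned down to 32 bytes so the vmovdqa spills of
	# XFER into frame_XFER(%rsp) use aligned stores; the original %rsp
	# is kept in frame_RSPSAVE for the epilogue.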

	# Save GPRs
	mov	%rbx, 8*0+frame_GPRSAVE(%rsp)
	mov	%r12, 8*1+frame_GPRSAVE(%rsp)
	mov	%r13, 8*2+frame_GPRSAVE(%rsp)
	mov	%r14, 8*3+frame_GPRSAVE(%rsp)
	mov	%r15, 8*4+frame_GPRSAVE(%rsp)

	shl	$7, NUM_BLKS		# convert to bytes
	jz	.Ldone_hash
	add	INP, NUM_BLKS		# pointer to end of data
	mov	NUM_BLKS, frame_INPEND(%rsp)

	## load initial digest
	mov	8*0(CTX1), a
	mov	8*1(CTX1), b
	mov	8*2(CTX1), c
	mov	8*3(CTX1), d
	mov	8*4(CTX1), e
	mov	8*5(CTX1), f
	mov	8*6(CTX1), g
	mov	8*7(CTX1), h

	# save %rdi (CTX) before it gets clobbered
	mov	%rdi, frame_CTX(%rsp)

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK

.Loop0:
	lea	K512(%rip), TBL

	## byte swap first 16 qwords
	COPY_YMM_AND_BSWAP	Y_0, (INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_1, 1*32(INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_2, 2*32(INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_3, 3*32(INP), BYTE_FLIP_MASK

	mov	INP, frame_INP(%rsp)

	## do the first 64 rounds with on-the-fly scheduling: 4 iterations
	## of 16 scheduled rounds each
	movq	$4, frame_SRND(%rsp)

	.align 16
.Loop1:
	vpaddq	(TBL), Y_0, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	1*32(TBL), Y_0, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	2*32(TBL), Y_0, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	3*32(TBL), Y_0, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
	add	$(4*32), TBL
	FOUR_ROUNDS_AND_SCHED

	subq	$1, frame_SRND(%rsp)
	jne	.Loop1

	movq	$2, frame_SRND(%rsp)
.Loop2:
	vpaddq	(TBL), Y_0, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
	DO_4ROUNDS
	vpaddq	1*32(TBL), Y_1, XFER
	vmovdqa	XFER, frame_XFER(%rsp)
	add	$(2*32), TBL
	DO_4ROUNDS

	vmovdqa	Y_2, Y_0
	vmovdqa	Y_3, Y_1

	subq	$1, frame_SRND(%rsp)
	jne	.Loop2

	mov	frame_CTX(%rsp), CTX2
	addm	8*0(CTX2), a
	addm	8*1(CTX2), b
	addm	8*2(CTX2), c
	addm	8*3(CTX2), d
	addm	8*4(CTX2), e
	addm	8*5(CTX2), f
	addm	8*6(CTX2), g
	addm	8*7(CTX2), h

	mov	frame_INP(%rsp), INP
	add	$128, INP
	cmp	frame_INPEND(%rsp), INP
	jne	.Loop0

.Ldone_hash:

	# Restore GPRs
	mov	8*0+frame_GPRSAVE(%rsp), %rbx
	mov	8*1+frame_GPRSAVE(%rsp), %r12
	mov	8*2+frame_GPRSAVE(%rsp), %r13
	mov	8*3+frame_GPRSAVE(%rsp), %r14
	mov	8*4+frame_GPRSAVE(%rsp), %r15

	# Restore Stack Pointer
	mov	frame_RSPSAVE(%rsp), %rsp
	ret
	.endfn	sha512_transform_rorx,globl

########################################################################
### Binary Data

# Mergeable 640-byte rodata section. This allows linker to merge the table
# with other, exactly the same 640-byte fragment of another rodata section
# (if such section exists).
	.section .rodata.cst640.K512, "aM", @progbits, 640
	.align	64
# K[t] used in SHA512 hashing
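# (These are the standard SHA-512 round constants: the first 64 bits of
# the fractional parts of the cube roots of the first eighty primes.)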
K512:
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817

	.rodata.cst32
# Mask for byte-swapping the qwords in a YMM register using vpshufb.
PSHUFFLE_BYTE_FLIP_MASK:
	.octa	0x08090a0b0c0d0e0f0001020304050607
	.octa	0x18191a1b1c1d1e1f1011121314151617

	.rodata.cst32
MASK_YMM_LO:
	.octa	0x00000000000000000000000000000000
	.octa	0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
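
For orientation, here is a minimal sketch of how a C caller could drive this
routine under the contract stated in the comment above (the digest must sit
at the start of the context, and the data must be a whole number of 128-byte
blocks). The wrapper name Sha512Blocks and the kIv table are illustrative
assumptions, not part of this commit; the initial hash values are the FIPS
180-4 constants, and real callers still need the usual padding/finalization:

#include <stdint.h>
#include <string.h>

void sha512_transform_rorx(void *state, const uint8_t *data, int blocks);

/* FIPS 180-4 initial hash values for SHA-512. */
static const uint64_t kIv[8] = {
    0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b,
    0xa54ff53a5f1d36f1, 0x510e527fade682d1, 0x9b05688c2b3e6c1f,
    0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
};

/* Runs the compression function over n bytes that happen to be a whole
   number of 128-byte blocks, leaving the chaining values in digest. */
static void Sha512Blocks(uint64_t digest[8], const uint8_t *p, size_t n) {
  memcpy(digest, kIv, sizeof(kIv));
  sha512_transform_rorx(digest, p, (int)(n / 128));
}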

@@ -24,4 +24,4 @@ if CLANG=$(command -v clang); then
   o/$MODE/test/libc/release/smokeclang.com.dbg || exit
 fi
 
-touch o/$MODE/test/libc/release/clang.ok
+touch o/$MODE/test/libc/release/lld.ok
2 third_party/mbedtls/config.h vendored

@@ -119,9 +119,9 @@
 #define MBEDTLS_MD5_SMALLER
 #define MBEDTLS_SHA1_SMALLER
-#ifdef TINY
 #define MBEDTLS_SHA256_SMALLER
 #define MBEDTLS_SHA512_SMALLER
+#ifdef TINY
 #define MBEDTLS_AES_ROM_TABLES
 #define MBEDTLS_AES_FEWER_TABLES
 #else

63 third_party/mbedtls/gcm.c vendored

@@ -96,8 +96,7 @@ static int gcm_gen_table( mbedtls_gcm_context *ctx )
 
 #if defined(MBEDTLS_AESNI_C) && defined(MBEDTLS_HAVE_X86_64)
     /* With CLMUL support, we need only h, not the rest of the table */
-    if( X86_HAVE( PCLMUL ) )
-        return( 0 );
+    if (X86_HAVE(AES) && X86_HAVE(PCLMUL)) return 0;
 #endif
 
     /* 0 corresponds to 0 in GF(2^128) */
@@ -191,7 +190,7 @@ static void gcm_mult( mbedtls_gcm_context *ctx, const unsigned char x[16],
     uint64_t zh, zl;
 
 #if defined(MBEDTLS_AESNI_C) && defined(MBEDTLS_HAVE_X86_64)
-    if( X86_HAVE( PCLMUL ) ) {
+    if (X86_HAVE(AES) && X86_HAVE(PCLMUL)) {
         unsigned char h[16];
 
         PUT_UINT32_BE( ctx->HH[8] >> 32, h, 0 );
@@ -240,11 +239,11 @@ static void gcm_mult( mbedtls_gcm_context *ctx, const unsigned char x[16],
 }
 
 int mbedtls_gcm_starts( mbedtls_gcm_context *ctx,
                         int mode,
                         const unsigned char *iv,
                         size_t iv_len,
                         const unsigned char *add,
                         size_t add_len )
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     unsigned char work_buf[16];
@@ -327,9 +326,9 @@ int mbedtls_gcm_starts( mbedtls_gcm_context *ctx,
 }
 
 int mbedtls_gcm_update( mbedtls_gcm_context *ctx,
                         size_t length,
                         const unsigned char *input,
                         unsigned char *output )
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     unsigned char ectr[16];
@@ -390,8 +389,8 @@ int mbedtls_gcm_update( mbedtls_gcm_context *ctx,
 }
 
 int mbedtls_gcm_finish( mbedtls_gcm_context *ctx,
                         unsigned char *tag,
                         size_t tag_len )
 {
     unsigned char work_buf[16];
     size_t i;
@@ -431,16 +430,16 @@ int mbedtls_gcm_finish( mbedtls_gcm_context *ctx,
 }
 
 int mbedtls_gcm_crypt_and_tag( mbedtls_gcm_context *ctx,
                                int mode,
                                size_t length,
                                const unsigned char *iv,
                                size_t iv_len,
                                const unsigned char *add,
                                size_t add_len,
                                const unsigned char *input,
                                unsigned char *output,
                                size_t tag_len,
                                unsigned char *tag )
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
 
@@ -464,15 +463,15 @@ int mbedtls_gcm_crypt_and_tag( mbedtls_gcm_context *ctx,
 }
 
 int mbedtls_gcm_auth_decrypt( mbedtls_gcm_context *ctx,
                               size_t length,
                               const unsigned char *iv,
                               size_t iv_len,
                               const unsigned char *add,
                               size_t add_len,
                               const unsigned char *tag,
                               size_t tag_len,
                               const unsigned char *input,
                               unsigned char *output )
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     unsigned char check_tag[16];
17 third_party/mbedtls/sha1.c vendored

@@ -1,4 +1,6 @@
 #include "libc/bits/bits.h"
+#include "libc/macros.internal.h"
+#include "libc/nexgen32e/x86feature.h"
 #include "libc/str/str.h"
 #include "third_party/mbedtls/common.h"
 #include "third_party/mbedtls/endian.h"
@@ -37,6 +39,8 @@ asm(".include \"libc/disclaimer.inc\"");
  * http://www.itl.nist.gov/fipspubs/fip180-1.htm
  */
 
+void sha1_transform_avx2(mbedtls_sha1_context *, const uint8_t *, int);
+
 #define SHA1_VALIDATE_RET(cond) \
     MBEDTLS_INTERNAL_VALIDATE_RET( cond, MBEDTLS_ERR_SHA1_BAD_INPUT_DATA )
@@ -145,6 +149,11 @@ int mbedtls_internal_sha1_process( mbedtls_sha1_context *ctx,
     SHA1_VALIDATE_RET( ctx != NULL );
     SHA1_VALIDATE_RET( (const unsigned char *)data != NULL );
 
+    if (!IsTiny() && X86_HAVE(AVX2) && X86_HAVE(BMI) && X86_HAVE(BMI2)) {
+        sha1_transform_avx2(ctx, data, 1);
+        return 0;
+    }
+
 #ifdef MBEDTLS_SHA1_SMALLER
 #define ROL(a, b) ((a << b) | (a >> (32 - b)))
@@ -387,8 +396,8 @@ int mbedtls_sha1_update_ret( mbedtls_sha1_context *ctx,
                              size_t ilen )
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    size_t fill;
     uint32_t left;
+    size_t n, fill;
 
     SHA1_VALIDATE_RET( ctx != NULL );
     SHA1_VALIDATE_RET( ilen == 0 || input != NULL );
@@ -417,6 +426,12 @@ int mbedtls_sha1_update_ret( mbedtls_sha1_context *ctx,
         left = 0;
     }
 
+    if (!IsTiny() && ilen >= 64 && X86_HAVE(AVX2) && X86_HAVE(BMI) && X86_HAVE(BMI2)) {
+        sha1_transform_avx2(ctx, input, ilen / 64);
+        input += ROUNDDOWN(ilen, 64);
+        ilen -= ROUNDDOWN(ilen, 64);
+    }
+
     while( ilen >= 64 )
     {
         if( ( ret = mbedtls_internal_sha1_process( ctx, input ) ) != 0 )
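
The update-path change above hands every complete 64-byte block to the AVX2
kernel in a single call and lets the existing scalar loop mop up the
remainder. A sketch of the arithmetic, assuming ROUNDDOWN in
libc/macros.internal.h is the usual power-of-two floor (an assumption about
that header, not something shown in this diff):

/* Assumed shape of the macro; only valid for power-of-two K. */
#define ROUNDDOWN(X, K) ((X) & -(K))

/* e.g. ilen = 200: the AVX2 kernel eats 3 blocks (192 bytes), input
   advances by 192, and ilen drops to 8 for the scalar loop. */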
2 third_party/mbedtls/sha1.h vendored

@@ -18,8 +18,8 @@ COSMOPOLITAN_C_START_
  */
 typedef struct mbedtls_sha1_context
 {
-    uint32_t total[2];          /*!< The number of Bytes processed.  */
     uint32_t state[5];          /*!< The intermediate digest state.  */
+    uint32_t total[2];          /*!< The number of Bytes processed.  */
     uint8_t buffer[64];         /*!< The data block being processed. */
 }
 mbedtls_sha1_context;
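
This field swap (and the matching ones in sha256.h and sha512.h below) lines
the digest up for the assembly: the transform routines index the chaining
values at byte offsets 8*0 through 8*7 from their first argument, so state
has to be the first member of the context. A sketch of the invariant being
relied on (the struct name here is illustrative):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Mirrors the reordered mbedtls_sha1_context layout. */
typedef struct {
  uint32_t state[5];  /* intermediate digest, now at offset 0 */
  uint32_t total[2];
  uint8_t buffer[64];
} Sha1Ctx;

static_assert(offsetof(Sha1Ctx, state) == 0,
              "asm kernels index the digest from offset 0");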
31 third_party/mbedtls/sha256.c vendored

@@ -1,3 +1,6 @@
+#include "libc/dce.h"
+#include "libc/macros.internal.h"
+#include "libc/nexgen32e/x86feature.h"
 #include "libc/str/str.h"
 #include "third_party/mbedtls/common.h"
 #include "third_party/mbedtls/endian.h"
@@ -40,6 +43,8 @@ asm(".include \"libc/disclaimer.inc\"");
     MBEDTLS_INTERNAL_VALIDATE_RET( cond, MBEDTLS_ERR_SHA256_BAD_INPUT_DATA )
 #define SHA256_VALIDATE(cond) MBEDTLS_INTERNAL_VALIDATE( cond )
 
+void sha256_transform_rorx(mbedtls_sha256_context *, const uint8_t *, int);
+
 #if !defined(MBEDTLS_SHA256_ALT)
 
 void mbedtls_sha256_init( mbedtls_sha256_context *ctx )
@@ -151,7 +156,7 @@
 } while( 0 )
 
 int mbedtls_internal_sha256_process( mbedtls_sha256_context *ctx,
                                      const unsigned char data[64] )
 {
     struct
     {
@@ -164,20 +169,22 @@ int mbedtls_internal_sha256_process( mbedtls_sha256_context *ctx,
     SHA256_VALIDATE_RET( ctx != NULL );
     SHA256_VALIDATE_RET( (const unsigned char *)data != NULL );
 
+    if (!IsTiny() && X86_HAVE(AVX2) && X86_HAVE(BMI2)) {
+        sha256_transform_rorx(ctx, data, 1);
+        return 0;
+    }
+
     for( i = 0; i < 8; i++ )
         local.A[i] = ctx->state[i];
 
 #if defined(MBEDTLS_SHA256_SMALLER)
-    for( i = 0; i < 64; i++ )
-    {
+    for( i = 0; i < 64; i++ ) {
         if( i < 16 )
             GET_UINT32_BE( local.W[i], data, 4 * i );
         else
             R( i );
 
         P( local.A[0], local.A[1], local.A[2], local.A[3], local.A[4],
            local.A[5], local.A[6], local.A[7], local.W[i], K[i] );
 
         local.temp1 = local.A[7]; local.A[7] = local.A[6];
         local.A[6] = local.A[5]; local.A[5] = local.A[4];
         local.A[4] = local.A[3]; local.A[3] = local.A[2];
@@ -187,9 +194,7 @@ int mbedtls_internal_sha256_process( mbedtls_sha256_context *ctx,
 #else /* MBEDTLS_SHA256_SMALLER */
     for( i = 0; i < 16; i++ )
         GET_UINT32_BE( local.W[i], data, 4 * i );
-    for( i = 0; i < 16; i += 8 )
-    {
+    for( i = 0; i < 16; i += 8 ) {
         P( local.A[0], local.A[1], local.A[2], local.A[3], local.A[4],
            local.A[5], local.A[6], local.A[7], local.W[i+0], K[i+0] );
         P( local.A[7], local.A[0], local.A[1], local.A[2], local.A[3],
@@ -207,9 +212,7 @@ int mbedtls_internal_sha256_process( mbedtls_sha256_context *ctx,
         P( local.A[1], local.A[2], local.A[3], local.A[4], local.A[5],
            local.A[6], local.A[7], local.A[0], local.W[i+7], K[i+7] );
     }
-    for( i = 16; i < 64; i += 8 )
-    {
+    for( i = 16; i < 64; i += 8 ) {
         P( local.A[0], local.A[1], local.A[2], local.A[3], local.A[4],
            local.A[5], local.A[6], local.A[7], R(i+0), K[i+0] );
         P( local.A[7], local.A[0], local.A[1], local.A[2], local.A[3],
@@ -278,6 +281,12 @@ int mbedtls_sha256_update_ret( mbedtls_sha256_context *ctx,
         left = 0;
     }
 
+    if (!IsTiny() && ilen >= 64 && X86_HAVE(AVX2) && X86_HAVE(BMI2)) {
+        sha256_transform_rorx(ctx, input, ilen / 64);
+        input += ROUNDDOWN(ilen, 64);
+        ilen -= ROUNDDOWN(ilen, 64);
+    }
+
     while( ilen >= 64 )
     {
         if( ( ret = mbedtls_internal_sha256_process( ctx, input ) ) != 0 )
2 third_party/mbedtls/sha256.h vendored

@@ -16,8 +16,8 @@ COSMOPOLITAN_C_START_
  */
 typedef struct mbedtls_sha256_context
 {
-    uint32_t total[2];          /*!< The number of Bytes processed.  */
     uint32_t state[8];          /*!< The intermediate digest state.  */
+    uint32_t total[2];          /*!< The number of Bytes processed.  */
     unsigned char buffer[64];   /*!< The data block being processed. */
     int is224;                  /*!< Determines which function to use:
                                      0: Use SHA-256, or 1: Use SHA-224. */
28 third_party/mbedtls/sha512.c vendored

@@ -1,4 +1,6 @@
 #include "libc/literal.h"
+#include "libc/macros.internal.h"
+#include "libc/nexgen32e/x86feature.h"
 #include "libc/str/str.h"
 #include "third_party/mbedtls/common.h"
 #include "third_party/mbedtls/endian.h"
@@ -37,6 +39,8 @@ asm(".include \"libc/disclaimer.inc\"");
 * http://csrc.nist.gov/publications/fips/fips180-2/fips180-2.pdf
 */
 
+void sha512_transform_rorx(mbedtls_sha512_context *, const uint8_t *, int);
+
 #if defined(MBEDTLS_SHA512_C)
 
 #define SHA512_VALIDATE_RET(cond) \
@@ -224,12 +228,16 @@ int mbedtls_internal_sha512_process( mbedtls_sha512_context *ctx,
     SHA512_VALIDATE_RET( ctx != NULL );
     SHA512_VALIDATE_RET( (const unsigned char *)data != NULL );
 
+    if (!IsTiny() && X86_HAVE(AVX2)) {
+        sha512_transform_rorx(ctx, data, 1);
+        return 0;
+    }
+
 #define SHR(x,n) ((x) >> (n))
 #define ROTR(x,n) (SHR((x),(n)) | ((x) << (64 - (n))))
 
 #define S0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x, 7))
 #define S1(x) (ROTR(x,19) ^ ROTR(x,61) ^ SHR(x, 6))
 
 #define S2(x) (ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39))
 #define S3(x) (ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41))
@@ -263,10 +271,14 @@ int mbedtls_internal_sha512_process( mbedtls_sha512_context *ctx,
         P( local.A[0], local.A[1], local.A[2], local.A[3], local.A[4],
            local.A[5], local.A[6], local.A[7], local.W[i], K[i] );
 
-        local.temp1 = local.A[7]; local.A[7] = local.A[6];
-        local.A[6] = local.A[5]; local.A[5] = local.A[4];
-        local.A[4] = local.A[3]; local.A[3] = local.A[2];
-        local.A[2] = local.A[1]; local.A[1] = local.A[0];
+        local.temp1 = local.A[7];
+        local.A[7] = local.A[6];
+        local.A[6] = local.A[5];
+        local.A[5] = local.A[4];
+        local.A[4] = local.A[3];
+        local.A[3] = local.A[2];
+        local.A[2] = local.A[1];
+        local.A[1] = local.A[0];
         local.A[0] = local.temp1;
     }
 #else /* MBEDTLS_SHA512_SMALLER */
@@ -362,6 +374,12 @@ int mbedtls_sha512_update_ret( mbedtls_sha512_context *ctx,
         left = 0;
     }
 
+    if (!IsTiny() && ilen >= 128 && X86_HAVE(AVX2)) {
+        sha512_transform_rorx(ctx, input, ilen / 128);
+        input += ROUNDDOWN(ilen, 128);
+        ilen -= ROUNDDOWN(ilen, 128);
+    }
+
     while( ilen >= 128 )
     {
         if( ( ret = mbedtls_internal_sha512_process( ctx, input ) ) != 0 )
2 third_party/mbedtls/sha512.h vendored

@@ -16,8 +16,8 @@ COSMOPOLITAN_C_START_
  */
 typedef struct mbedtls_sha512_context
 {
-    uint64_t total[2];          /*!< The number of Bytes processed.  */
     uint64_t state[8];          /*!< The intermediate digest state.  */
+    uint64_t total[2];          /*!< The number of Bytes processed.  */
     unsigned char buffer[128];  /*!< The data block being processed. */
 #if !defined(MBEDTLS_SHA512_NO_SHA384)
     int is384;                  /*!< Determines which function to use:
10 third_party/mbedtls/test/lib.c vendored

@@ -1021,17 +1021,15 @@ int execute_tests(int argc, const char **argv, const char *default_filename) {
     if (unmet_dep_count > 0 || ret == DISPATCH_UNSUPPORTED_SUITE) {
       total_skipped++;
       WRITE("----");
-      if (1 == option_verbose && ret == DISPATCH_UNSUPPORTED_SUITE) {
-        WRITE("\n   Test Suite not enabled");
-      }
       if (1 == option_verbose && unmet_dep_count > 0) {
-        WRITE("\n   Unmet dependencies: ");
+        WRITE(" (unmet dependencies: ");
         for (i = 0; i < unmet_dep_count; i++) {
-          WRITE("%d ", unmet_dependencies[i]);
+          if (i) WRITE(",");
+          WRITE("%d", unmet_dependencies[i]);
         }
         if (missing_unmet_dependencies) WRITE("...");
       }
-      WRITE("\n");
+      WRITE(")\n");
       fflush(stdout);
       unmet_dep_count = 0;
       missing_unmet_dependencies = 0;