/////////////////////////////////////////////////////////////////////////
// Implement fast SHA-256 with AVX2 instructions. (x86_64)
//
// Copyright (C) 2013 Intel Corporation.
//
// Authors:
//     James Guilford
//     Kirk Yap
//     Tim Chen
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
//     Redistribution and use in source and binary forms, with or
//     without modification, are permitted provided that the following
//     conditions are met:
//
//      - Redistributions of source code must retain the above
//        copyright notice, this list of conditions and the following
//        disclaimer.
//
//      - Redistributions in binary form must reproduce the above
//        copyright notice, this list of conditions and the following
//        disclaimer in the documentation and/or other materials
//        provided with the distribution.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
/////////////////////////////////////////////////////////////////////////
//
// This code is described in an Intel White-Paper:
// "Fast SHA-256 Implementations on Intel Architecture Processors"
//
// To find it, surf to http://www.intel.com/p/en_US/embedded
// and search for that title.
//
/////////////////////////////////////////////////////////////////////////
// This code schedules 2 blocks at a time, with 4 lanes per block
/////////////////////////////////////////////////////////////////////////
#include "libc/macros.internal.h"
.ident	"\n\
AVX2 SHA2 (BSD-2 License)\n\
Copyright 2013 Intel Corporation\n"
.include "libc/disclaimer.inc"

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA =	%ymm10		# shuffle xBxA -> 00BA
SHUF_DC00 =	%ymm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx			# 3rd arg
INP	= %rsi			# 2nd arg
CTX	= %rdi			# 1st arg
c	= %ecx
d	= %r8d
e	= %edx			# clobbers NUM_BLKS
y3	= %esi			# clobbers INP

SRND	= CTX			# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d

_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8
_RSP_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
_RSP		= _CTX      + _CTX_SIZE
STACK_SIZE	= _RSP      + _RSP_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm

.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e				# CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d			# --

	and	b, y3		# y3 = (a|c)&b				# MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB

	add	y0, y2		# y2 = S1 + CH				# --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
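	# Note: AVX2 has no packed dword rotate, so the message schedule's
	# s0(w) = (w ror 7) ^ (w ror 18) ^ (w >> 3) is pieced together from
	# vpsrld/vpslld/vpor/vpxor; at this point XTMP3 = W[-15] ror 7 and
	# XTMP2 = W[-15] >> 18 were produced in RND N + 0 above, and the
	# vector work stays interleaved with the scalar rorx round logic.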
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b				# MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}

	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c				# MAJA
	mov	f, y2		# y2 = f				# CH
	xor	g, y2		# y2 = f^g				# CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e				# CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
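	# Note: s1(w) = (w ror 17) ^ (w ror 19) ^ (w >> 10) is evaluated in
	# two halves: the {BBAA} lanes above produced W[0],W[1], and these
	# {DDCC} lanes produce W[2],W[3], whose W[-2] inputs are exactly the
	# W[0],W[1] just computed.  SHUF_DC00 below moves the result into
	# the high dwords so XTMP5 + XTMP0 yields the four new schedule words.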
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)		# MAJ

	add	y1, h		# h = k + w + h + S0			# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS
	rotate_Xs
.endm

.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
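	# Note: per FIPS 180-4, each scalar round computes
	#   t1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
	#   t2 = Sigma0(a) + Maj(a,b,c)
	#   d += t1, h = t1 + t2      (names then rotate via ROTATE_ARGS)
	# Here y0 = Sigma1(e), y2 = Ch(e,f,g), y1 = Sigma0(a), and y3/T1
	# hold the two halves of Maj(a,b,c); the adds below fold them in,
	# K[t] + W[t] having already been added to h from the XFER slot.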
	add	y0, y2		# y2 = S1 + CH				# --

	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
	.text
	.align	32
sha256_transform_rorx:
	push	%rbp
	mov	%rsp,%rbp
	.profilable

	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	mov	%rsp, %rax
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
	mov	%rax, _RSP(%rsp)

	shl	$6, NUM_BLKS	# convert to bytes
	jz	.Ldone_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS	# pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	.Lonly_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

.Loop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

.Llast_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 12 each
	xor	SRND, SRND

.align 16
.Loop1:
	vpaddd	kSha256x2+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	kSha256x2+1*32(SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	kSha256x2+2*32(SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	kSha256x2+3*32(SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	.Loop1
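	## .Loop1 ran rounds 0..47 with on-the-fly scheduling.  Each 32-byte
	## XFER slot stores K[t..t+3]+W[t..t+3] for block 1 in bytes 0..15
	## and for block 2 in bytes 16..31, because the transposed X0..X3
	## registers keep block 1 in the low 128-bit lane and block 2 in
	## the high lane.  The last 16 rounds below need no scheduling, and
	## block 2 is finished later in .Loop3 by replaying the saved
	## XFER slots at offset +16.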

.Loop2:
	## Do last 16 rounds with no scheduling
	vpaddd	kSha256x2+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	kSha256x2+1*32(SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	.Loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	.Ldone_hash

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
.Loop3:
	DO_4ROUNDS	_XFER + 0*32 + 16
	DO_4ROUNDS	_XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	.Loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	.Loop0
	ja	.Ldone_hash

.Ldo_last_block:
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	.Llast_block_enter

.Lonly_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	.Ldo_last_block

.Ldone_hash:

	mov	_RSP(%rsp), %rsp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	pop	%rbp
	ret
	.endfn	sha256_transform_rorx,globl

	.rodata.cst32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203
	.octa	0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
	.rodata.cst32
_SHUF_00BA:
	.octa	0xFFFFFFFFFFFFFFFF0b0a090803020100
	.octa	0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
	.rodata.cst32
_SHUF_DC00:
	.octa	0x0b0a090803020100FFFFFFFFFFFFFFFF
	.octa	0x0b0a090803020100FFFFFFFFFFFFFFFF

	.bss
	.align	64
kSha256x2:
	.zero	512
	.endobj	kSha256x2,globl
	.previous

	.init.start 201,_init_kSha256x2
	push	$64
	pop	%rcx
	ezlea	kSha256,dx
	ezlea	kSha256x2,ax
0:	movaps	-16(%rdx,%rcx,4),%xmm0
	movaps	%xmm0,-16(%rax,%rcx,8)
	movaps	%xmm0,-32(%rax,%rcx,8)
	sub	$4,%ecx
	jnz	0b
	.init.end 201,_init_kSha256x2
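	## Note: _init_kSha256x2 expands the 256-byte kSha256 table (the 64
	## dword round constants K[0..63]) into the 512-byte kSha256x2 above:
	## each 16-byte group of four constants is written to two adjacent
	## 16-byte slots, so every 32-byte row holds the same constants in
	## both 128-bit lanes and a single vpaddd serves both interleaved
	## blocks in .Loop1/.Loop2.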