/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│ │ vi: set noet ft=asm ts=8 sw=8 fenc=utf-8 :vi │ ╞══════════════════════════════════════════════════════════════════════════════╡ │ │ │ Copyright 2015 Intel Corporation │ │ │ │ Redistribution and use in source and binary forms, with or without │ │ modification, are permitted provided that the following conditions │ │ are met: │ │ │ │ * Redistributions of source code must retain the above copyright │ │ notice, this list of conditions and the following disclaimer. │ │ * Redistributions in binary form must reproduce the above copyright │ │ notice, this list of conditions and the following disclaimer in │ │ the documentation and/or other materials provided with the │ │ distribution. │ │ * Neither the name of Intel Corporation nor the names of its │ │ contributors may be used to endorse or promote products derived │ │ from this software without specific prior written permission. │ │ │ │ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS │ │ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT │ │ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR │ │ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT │ │ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, │ │ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT │ │ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, │ │ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY │ │ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT │ │ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE │ │ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
│ │ │ ╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/macros.internal.h"

// License notice that is emitted into the .notice section of the binary.
	.section .notice,"aR",@progbits
	.asciz "\n\n\
Intel SHA-NI (BSD-3 License)\n\
Copyright 2015 Intel Corporation\n\
Sean Gulley \n\
Tim Chen "

	.text
	.balign 32			/* code that follows starts on a 32-byte boundary */

/* Bytes of stack reserved by sha1_transform_ni: two 16-byte spill slots. */
#define FRAME_SIZE 32

/* Argument registers, in the order of the C prototype. */
#define DIGEST_PTR %rdi
#define DATA_PTR %rsi
#define NUM_BLKS %rdx

/* Vector register roles inside the block loop. */
#define ABCD %xmm0
#define E0 %xmm1 /* Need two E's b/c they ping pong */
#define E1 %xmm2
#define MSG0 %xmm3
#define MSG1 %xmm4
#define MSG2 %xmm5
#define MSG3 %xmm6
#define SHUF_MASK %xmm7

// Performs Intel® SHA-NI™ optimized SHA-1 update.
//
// The function takes a pointer to the current hash values, a
// pointer to the input data, and a number of 64 byte blocks to
// process. Once all blocks have been processed, the memory behind
// the digest pointer is updated with the resulting hash value. The
// function only processes complete blocks; there is no functionality
// to store partial blocks. All message padding and hash value
// initialization must be done outside the update function.
//
// The indented lines in the loop are instructions related to
// rounds processing. The non-indented lines are instructions
// related to the message schedule.
//
//	void sha1_transform_ni(uint32_t digest[static 5],
//	                       const void *data,
//	                       uint32_t numBlocks);
//
// Arguments arrive in the System V AMD64 integer argument registers.
// Only the low 32 bits of %rdx are honoured, matching the uint32_t
// prototype above, so a caller that leaves junk in bits 32-63 can
// neither defeat the zero-block early exit nor make the loop run
// past the end of the input.
//
// The routine overwrites %rsi, %rdx, %xmm0-%xmm7 and the flags. It
// keeps two 16-byte spill slots (FRAME_SIZE bytes) in its own stack
// frame; the frame is realigned to 16 bytes because the spills use
// movdqa, which faults on a misaligned address.
//
// @param	%rdi points to the digest, which is read on entry and
//		rewritten on exit; left untouched if numBlocks is zero
// @param	%rsi points to input data
// @param	%edx is number of 64-byte blocks to process
// @see	X86_HAVE(SHA)
	.ftrace1
sha1_transform_ni:
	.ftrace2
	push		%rbp
	mov		%rsp,%rbp
	sub		$FRAME_SIZE,%rsp
	and		$-16,%rsp		# spill slots are accessed with movdqa

	mov		%edx,%edx		# zero-extend the 32-bit block count
	shl		$6,NUM_BLKS		# convert to bytes
	jz		.Ldone_hash		# zero blocks: digest stays as is
	add		DATA_PTR,NUM_BLKS	# pointer to end of data

	// load initial hash values
	// E0 gets digest[4] in its top dword, the mask clears the rest;
	// ABCD gets digest[0..3] with the dword order reversed.
	movdqa		UPPER_WORD_MASK(%rip),E1
	pinsrd		$3,1*16(DIGEST_PTR),E0
	movdqu		0*16(DIGEST_PTR),ABCD
	pand		E1,E0
	pshufd		$0x1B,ABCD,ABCD
	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip),SHUF_MASK

.Lloop0:
	// Save hash values for addition after rounds
	movdqa		E0,(0*16)(%rsp)
	movdqa		ABCD,(1*16)(%rsp)

	// Rounds 0-3
	movdqu		0*16(DATA_PTR),MSG0
	pshufb		SHUF_MASK,MSG0
		paddd		MSG0,E0
		movdqa		ABCD,E1
		sha1rnds4	$0,E0,ABCD

	// Rounds 4-7
	movdqu		1*16(DATA_PTR),MSG1
	pshufb		SHUF_MASK,MSG1
		sha1nexte	MSG1,E1
		movdqa		ABCD,E0
		sha1rnds4	$0,E1,ABCD
	sha1msg1	MSG1,MSG0

	// Rounds 8-11
	movdqu		2*16(DATA_PTR),MSG2
	pshufb		SHUF_MASK,MSG2
		sha1nexte	MSG2,E0
		movdqa		ABCD,E1
		sha1rnds4	$0,E0,ABCD
	sha1msg1	MSG2,MSG1
	pxor		MSG2,MSG0

	// Rounds 12-15
	movdqu		3*16(DATA_PTR),MSG3
	pshufb		SHUF_MASK,MSG3
		sha1nexte	MSG3,E1
		movdqa		ABCD,E0
	sha1msg2	MSG3,MSG0
		sha1rnds4	$0,E1,ABCD
	sha1msg1	MSG3,MSG2
	pxor		MSG3,MSG1

	// Rounds 16-19
		sha1nexte	MSG0,E0
		movdqa		ABCD,E1
	sha1msg2	MSG0,MSG1
		sha1rnds4	$0,E0,ABCD
	sha1msg1	MSG0,MSG3
	pxor		MSG0,MSG2

	// Rounds 20-23 (sha1rnds4 immediate moves on to group 1)
		sha1nexte	MSG1,E1
		movdqa		ABCD,E0
	sha1msg2	MSG1,MSG2
		sha1rnds4	$1,E1,ABCD
	sha1msg1	MSG1,MSG0
	pxor		MSG1,MSG3

	// Rounds 24-27
		sha1nexte	MSG2,E0
		movdqa		ABCD,E1
	sha1msg2	MSG2,MSG3
		sha1rnds4	$1,E0,ABCD
	sha1msg1	MSG2,MSG1
	pxor		MSG2,MSG0

	// Rounds 28-31
		sha1nexte	MSG3,E1
		movdqa		ABCD,E0
	sha1msg2	MSG3,MSG0
		sha1rnds4	$1,E1,ABCD
	sha1msg1	MSG3,MSG2
	pxor		MSG3,MSG1

	// Rounds 32-35
		sha1nexte	MSG0,E0
		movdqa		ABCD,E1
	sha1msg2	MSG0,MSG1
		sha1rnds4	$1,E0,ABCD
	sha1msg1	MSG0,MSG3
	pxor		MSG0,MSG2

	// Rounds 36-39
		sha1nexte	MSG1,E1
		movdqa		ABCD,E0
	sha1msg2	MSG1,MSG2
		sha1rnds4	$1,E1,ABCD
	sha1msg1	MSG1,MSG0
	pxor		MSG1,MSG3

	// Rounds 40-43 (sha1rnds4 immediate moves on to group 2)
		sha1nexte	MSG2,E0
		movdqa		ABCD,E1
	sha1msg2	MSG2,MSG3
		sha1rnds4	$2,E0,ABCD
	sha1msg1	MSG2,MSG1
	pxor		MSG2,MSG0

	// Rounds 44-47
		sha1nexte	MSG3,E1
		movdqa		ABCD,E0
	sha1msg2	MSG3,MSG0
		sha1rnds4	$2,E1,ABCD
	sha1msg1	MSG3,MSG2
	pxor		MSG3,MSG1

	// Rounds 48-51
		sha1nexte	MSG0,E0
		movdqa		ABCD,E1
	sha1msg2	MSG0,MSG1
		sha1rnds4	$2,E0,ABCD
	sha1msg1	MSG0,MSG3
	pxor		MSG0,MSG2

	// Rounds 52-55
		sha1nexte	MSG1,E1
		movdqa		ABCD,E0
	sha1msg2	MSG1,MSG2
		sha1rnds4	$2,E1,ABCD
	sha1msg1	MSG1,MSG0
	pxor		MSG1,MSG3

	// Rounds 56-59
		sha1nexte	MSG2,E0
		movdqa		ABCD,E1
	sha1msg2	MSG2,MSG3
		sha1rnds4	$2,E0,ABCD
	sha1msg1	MSG2,MSG1
	pxor		MSG2,MSG0

	// Rounds 60-63 (sha1rnds4 immediate moves on to group 3)
		sha1nexte	MSG3,E1
		movdqa		ABCD,E0
	sha1msg2	MSG3,MSG0
		sha1rnds4	$3,E1,ABCD
	sha1msg1	MSG3,MSG2
	pxor		MSG3,MSG1

	// Rounds 64-67
		sha1nexte	MSG0,E0
		movdqa		ABCD,E1
	sha1msg2	MSG0,MSG1
		sha1rnds4	$3,E0,ABCD
	sha1msg1	MSG0,MSG3
	pxor		MSG0,MSG2

	// Rounds 68-71 (message schedule starts winding down)
		sha1nexte	MSG1,E1
		movdqa		ABCD,E0
	sha1msg2	MSG1,MSG2
		sha1rnds4	$3,E1,ABCD
	pxor		MSG1,MSG3

	// Rounds 72-75
		sha1nexte	MSG2,E0
		movdqa		ABCD,E1
	sha1msg2	MSG2,MSG3
		sha1rnds4	$3,E0,ABCD

	// Rounds 76-79
		sha1nexte	MSG3,E1
		movdqa		ABCD,E0
		sha1rnds4	$3,E1,ABCD

	// Add current hash values with previously saved
	sha1nexte	(0*16)(%rsp),E0
	paddd		(1*16)(%rsp),ABCD

	// Increment data pointer and loop if more to process
	add		$64,DATA_PTR
	cmp		NUM_BLKS,DATA_PTR
	jne		.Lloop0

	// Write hash values back in the correct order
	pshufd		$0x1B,ABCD,ABCD
	movdqu		ABCD,0*16(DIGEST_PTR)
	pextrd		$3,E0,1*16(DIGEST_PTR)

.Ldone_hash:
	leave					# restores %rsp from %rbp, undoing the realignment
	ret
	.endfn	sha1_transform_ni,globl

// pshufb control that reverses the order of all 16 bytes in a register.
	.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
	.balign	16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa	0x000102030405060708090a0b0c0d0e0f

// Keeps only the most significant dword (bits 96-127) of a register.
	.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16
	.balign	16
UPPER_WORD_MASK:
	.octa	0xFFFFFFFF000000000000000000000000