mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-01-31 19:43:32 +00:00
39bf41f4eb
- Python static hello world now 1.8mb - Python static fully loaded now 10mb - Python HTTPS client now uses MbedTLS - Python REPL now completes import stmts - Increase stack size for Python for now - Begin synthesizing posixpath and ntpath - Restore Python \N{UNICODE NAME} support - Restore Python NFKD symbol normalization - Add optimized code path for Intel SHA-NI - Get more Python unit tests passing faster - Get Python help() pagination working on NT - Python hashlib now supports MbedTLS PBKDF2 - Make memcpy/memmove/memcmp/bcmp/etc. faster - Add Mersenne Twister and Vigna to LIBC_RAND - Provide privileged __printf() for error code - Fix zipos opendir() so that it reports ENOTDIR - Add basic chmod() implementation for Windows NT - Add Cosmo's best functions to Python cosmo module - Pin function trace indent depth to that of caller - Show memory diagram on invalid access in MODE=dbg - Differentiate stack overflow on crash in MODE=dbg - Add stb_truetype and tools for analyzing font files - Upgrade to UNICODE 13 and reduce its binary footprint - COMPILE.COM now logs resource usage of build commands - Start implementing basic poll() support on bare metal - Set getauxval(AT_EXECFN) to GetModuleFileName() on NT - Add descriptions to strerror() in non-TINY build modes - Add COUNTBRANCH() macro to help with micro-optimizations - Make error / backtrace / asan / memory code more unbreakable - Add fast perfect C implementation of μ-Law and a-Law audio codecs - Make strtol() functions consistent with other libc implementations - Improve Linenoise implementation (see also github.com/jart/bestline) - COMPILE.COM now suppresses stdout/stderr of successful build commands
318 lines
10 KiB
ArmAsm
318 lines
10 KiB
ArmAsm
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│                                                                              │
│ Copyright 2015 Intel Corporation                                             │
│                                                                              │
│ Redistribution and use in source and binary forms, with or without           │
│ modification, are permitted provided that the following conditions           │
│ are met:                                                                     │
│                                                                              │
│   * Redistributions of source code must retain the above copyright           │
│     notice, this list of conditions and the following disclaimer.            │
│   * Redistributions in binary form must reproduce the above copyright        │
│     notice, this list of conditions and the following disclaimer in          │
│     the documentation and/or other materials provided with the               │
│     distribution.                                                            │
│   * Neither the name of Intel Corporation nor the names of its               │
│     contributors may be used to endorse or promote products derived          │
│     from this software without specific prior written permission.            │
│                                                                              │
│ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS          │
│ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT            │
│ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR        │
│ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT         │
│ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,        │
│ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT             │
│ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,        │
│ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY        │
│ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT          │
│ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE        │
│ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.         │
│                                                                              │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/macros.internal.h"

//	Code is emitted into .text, with the location counter aligned
//	to 32 bytes ahead of the routine defined in this file.
	.text
	.align	32

//	Attribution notice embedded in the object via .ident. The
//	continuation lines must start in column zero and nothing may
//	sit between them, otherwise the extra characters become part of
//	the string (or break the backslash-newline continuation).
	.ident	"\n\
Intel SHA-NI (BSD-3 License)\n\
Copyright 2015 Intel Corporation\n\
Sean Gulley <sean.m.gulley@intel.com>\n\
Tim Chen <tim.c.chen@linux.intel.com>\n"

//	NOTE(review): contents of disclaimer.inc are not visible from
//	here; presumably the shared license disclaimer text.
	.include "libc/disclaimer.inc"

/* Register roles used by sha256_transform_ni (x86-64 System V). */

/* Incoming arguments. */
#define DIGEST_PTR	%rdi	/* 1st arg */
#define DATA_PTR	%rsi	/* 2nd arg */
#define NUM_BLKS	%rdx	/* 3rd arg */

/* Base address of the kSha256 table, indexed in 16-byte steps. */
#define SHA256CONSTANTS	%rax

/*
 * MSG has to be %xmm0: sha256rnds2 reads its two message+constant
 * dwords from %xmm0 implicitly.
 */
#define MSG		%xmm0

/* Working state, kept in the ABEF / CDGH lane order. */
#define STATE0		%xmm1
#define STATE1		%xmm2

/* Rolling window of four message schedule vectors (four dwords each). */
#define MSGTMP0		%xmm3
#define MSGTMP1		%xmm4
#define MSGTMP2		%xmm5
#define MSGTMP3		%xmm6

/* Scratch for palignr results and for the state reordering. */
#define MSGTMP4		%xmm7

/* pshufb control loaded from PSHUFFLE_BYTE_FLIP_MASK. */
#define SHUF_MASK	%xmm8

/* Copy of the state at the start of a block, added back after round 63. */
#define ABEF_SAVE	%xmm9
#define CDGH_SAVE	%xmm10

//	Performs Intel® SHA-NI™ optimized SHA-256 update.
//
//	The function takes a pointer to the current hash values, a
//	pointer to the input data, and a number of 64 byte blocks to
//	process. Once all blocks have been processed, the buffer that
//	the digest pointer refers to is overwritten with the resulting
//	hash value. When the block count is zero the digest is left
//	untouched. The function only processes complete blocks, there
//	is no functionality to store partial blocks. All message
//	padding and hash value initialization must be done outside the
//	update function.
//
//	The indented lines in the loop are instructions related to
//	rounds processing. The non-indented lines are instructions
//	related to the message schedule.
//
//	The block count is a 32-bit argument, so the x86-64 System V
//	ABI leaves bits 32-63 of %rdx unspecified on entry. The count
//	is therefore zero-extended from %edx before being scaled to a
//	byte length; negative counts are not meaningful.
//
//	The instructions below modify %rax, %rdx, %rsi, %xmm0 through
//	%xmm10 and the flags. The input data may be unaligned (movdqu);
//	the constant table is read through paddd memory operands and
//	the flip mask through movdqa, both of which need 16-byte
//	alignment.
//
//	    void sha256_transform_ni(uint32_t digest[static 8],
//	                             const void *data,
//	                             int32_t numBlocks);
//
//	@param	%rdi points to output digest
//	@param	%rsi points to input data
//	@param	%edx is number of blocks to process
//	@see	X86_HAVE(SHA)
sha256_transform_ni:
//	NOTE(review): .leafprologue, .profilable, .leafepilogue and
//	.endfn come from libc/macros.internal.h, which is not visible
//	here; presumably frame setup, profiling hook, return, and
//	symbol size/visibility respectively.
	.leafprologue
	.profilable
	mov	%edx,%edx		# zero-extend 32-bit block count into NUM_BLKS
	shl	$6,NUM_BLKS		# convert to bytes
	jz	.Ldone_hash
	add	DATA_PTR,NUM_BLKS	# pointer to end of data

//	Load initial hash values
//	Need to reorder these appropriately
//	DCBA, HGFE -> ABEF, CDGH
	movdqu	0*16(DIGEST_PTR),STATE0
	movdqu	1*16(DIGEST_PTR),STATE1

	pshufd	$0xB1,STATE0,STATE0	# CDAB
	pshufd	$0x1B,STATE1,STATE1	# EFGH
	movdqa	STATE0,MSGTMP4
	palignr	$8,STATE1,STATE0	# ABEF
	pblendw	$0xF0,MSGTMP4,STATE1	# CDGH

	movdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip),SHUF_MASK
	lea	kSha256(%rip),SHA256CONSTANTS

//	One iteration per 64-byte block. DATA_PTR advances by 64 each
//	pass and NUM_BLKS holds the end-of-data address.
.Lloop0:

//	Save hash values for addition after rounds
	movdqa	STATE0,ABEF_SAVE
	movdqa	STATE1,CDGH_SAVE

//	Each group below runs four rounds: MSG receives four schedule
//	dwords plus the matching table entry, sha256rnds2 consumes the
//	low two dwords of MSG, pshufd $0x0E moves the high two dwords
//	down, and the second sha256rnds2 consumes those.

//	Rounds 0-3
	movdqu	0*16(DATA_PTR),MSG
	pshufb	SHUF_MASK,MSG
	movdqa	MSG,MSGTMP0
		paddd		0*16(SHA256CONSTANTS),MSG
		sha256rnds2	STATE0,STATE1
		pshufd		$0x0E,MSG,MSG
		sha256rnds2	STATE1,STATE0

//	Rounds 4-7
	movdqu	1*16(DATA_PTR),MSG
	pshufb	SHUF_MASK,MSG
	movdqa	MSG,MSGTMP1
		paddd		1*16(SHA256CONSTANTS),MSG
		sha256rnds2	STATE0,STATE1
		pshufd		$0x0E,MSG,MSG
		sha256rnds2	STATE1,STATE0
	sha256msg1	MSGTMP1,MSGTMP0

//	Rounds 8-11
	movdqu	2*16(DATA_PTR),MSG
	pshufb	SHUF_MASK,MSG
	movdqa	MSG,MSGTMP2
		paddd		2*16(SHA256CONSTANTS),MSG
		sha256rnds2	STATE0,STATE1
		pshufd		$0x0E,MSG,MSG
		sha256rnds2	STATE1,STATE0
	sha256msg1	MSGTMP2,MSGTMP1

//	Rounds 12-15
//	From here on the next four schedule dwords are produced while
//	the current four are being consumed: sha256msg1 (issued in an
//	earlier group), then palignr+paddd, then sha256msg2.
	movdqu	3*16(DATA_PTR),MSG
	pshufb	SHUF_MASK,MSG
	movdqa	MSG,MSGTMP3
		paddd		3*16(SHA256CONSTANTS),MSG
		sha256rnds2	STATE0,STATE1
	movdqa	MSGTMP3,MSGTMP4
	palignr	$4,MSGTMP2,MSGTMP4
	paddd	MSGTMP4,MSGTMP0
	sha256msg2	MSGTMP3,MSGTMP0
		pshufd		$0x0E,MSG,MSG
		sha256rnds2	STATE1,STATE0
	sha256msg1	MSGTMP3,MSGTMP2

//	Rounds 16-19
	movdqa	MSGTMP0,MSG
		paddd		4*16(SHA256CONSTANTS),MSG
		sha256rnds2	STATE0,STATE1
	movdqa	MSGTMP0,MSGTMP4
	palignr	$4,MSGTMP3,MSGTMP4
	paddd	MSGTMP4,MSGTMP1
	sha256msg2	MSGTMP0,MSGTMP1
		pshufd		$0x0E,MSG,MSG
		sha256rnds2	STATE1,STATE0
	sha256msg1	MSGTMP0,MSGTMP3

//	Rounds 20-23
	movdqa	MSGTMP1,MSG
		paddd		5*16(SHA256CONSTANTS),MSG
		sha256rnds2	STATE0,STATE1
	movdqa	MSGTMP1,MSGTMP4
	palignr	$4,MSGTMP0,MSGTMP4
	paddd	MSGTMP4,MSGTMP2
	sha256msg2	MSGTMP1,MSGTMP2
		pshufd		$0x0E,MSG,MSG
		sha256rnds2	STATE1,STATE0
	sha256msg1	MSGTMP1,MSGTMP0

//	Rounds 24-27
	movdqa	MSGTMP2,MSG
		paddd		6*16(SHA256CONSTANTS),MSG
		sha256rnds2	STATE0,STATE1
	movdqa	MSGTMP2,MSGTMP4
	palignr	$4,MSGTMP1,MSGTMP4
	paddd	MSGTMP4,MSGTMP3
	sha256msg2	MSGTMP2,MSGTMP3
		pshufd		$0x0E,MSG,MSG
		sha256rnds2	STATE1,STATE0
	sha256msg1	MSGTMP2,MSGTMP1

//	Rounds 28-31
	movdqa	MSGTMP3,MSG
		paddd		7*16(SHA256CONSTANTS),MSG
		sha256rnds2	STATE0,STATE1
	movdqa	MSGTMP3,MSGTMP4
	palignr	$4,MSGTMP2,MSGTMP4
	paddd	MSGTMP4,MSGTMP0
	sha256msg2	MSGTMP3,MSGTMP0
		pshufd		$0x0E,MSG,MSG
		sha256rnds2	STATE1,STATE0
	sha256msg1	MSGTMP3,MSGTMP2

//	Rounds 32-35
	movdqa	MSGTMP0,MSG
		paddd		8*16(SHA256CONSTANTS),MSG
		sha256rnds2	STATE0,STATE1
	movdqa	MSGTMP0,MSGTMP4
	palignr	$4,MSGTMP3,MSGTMP4
	paddd	MSGTMP4,MSGTMP1
	sha256msg2	MSGTMP0,MSGTMP1
		pshufd		$0x0E,MSG,MSG
		sha256rnds2	STATE1,STATE0
	sha256msg1	MSGTMP0,MSGTMP3

//	Rounds 36-39
	movdqa	MSGTMP1,MSG
		paddd		9*16(SHA256CONSTANTS),MSG
		sha256rnds2	STATE0,STATE1
	movdqa	MSGTMP1,MSGTMP4
	palignr	$4,MSGTMP0,MSGTMP4
	paddd	MSGTMP4,MSGTMP2
	sha256msg2	MSGTMP1,MSGTMP2
		pshufd		$0x0E,MSG,MSG
		sha256rnds2	STATE1,STATE0
	sha256msg1	MSGTMP1,MSGTMP0

//	Rounds 40-43
	movdqa	MSGTMP2,MSG
		paddd		10*16(SHA256CONSTANTS),MSG
		sha256rnds2	STATE0,STATE1
	movdqa	MSGTMP2,MSGTMP4
	palignr	$4,MSGTMP1,MSGTMP4
	paddd	MSGTMP4,MSGTMP3
	sha256msg2	MSGTMP2,MSGTMP3
		pshufd		$0x0E,MSG,MSG
		sha256rnds2	STATE1,STATE0
	sha256msg1	MSGTMP2,MSGTMP1

//	Rounds 44-47
	movdqa	MSGTMP3,MSG
		paddd		11*16(SHA256CONSTANTS),MSG
		sha256rnds2	STATE0,STATE1
	movdqa	MSGTMP3,MSGTMP4
	palignr	$4,MSGTMP2,MSGTMP4
	paddd	MSGTMP4,MSGTMP0
	sha256msg2	MSGTMP3,MSGTMP0
		pshufd		$0x0E,MSG,MSG
		sha256rnds2	STATE1,STATE0
	sha256msg1	MSGTMP3,MSGTMP2

//	Rounds 48-51
	movdqa	MSGTMP0,MSG
		paddd		12*16(SHA256CONSTANTS),MSG
		sha256rnds2	STATE0,STATE1
	movdqa	MSGTMP0,MSGTMP4
	palignr	$4,MSGTMP3,MSGTMP4
	paddd	MSGTMP4,MSGTMP1
	sha256msg2	MSGTMP0,MSGTMP1
		pshufd		$0x0E,MSG,MSG
		sha256rnds2	STATE1,STATE0
	sha256msg1	MSGTMP0,MSGTMP3

//	Rounds 52-55
//	No sha256msg1 from here on: the schedule vector it would start
//	lies beyond round 63 and is never consumed.
	movdqa	MSGTMP1,MSG
		paddd		13*16(SHA256CONSTANTS),MSG
		sha256rnds2	STATE0,STATE1
	movdqa	MSGTMP1,MSGTMP4
	palignr	$4,MSGTMP0,MSGTMP4
	paddd	MSGTMP4,MSGTMP2
	sha256msg2	MSGTMP1,MSGTMP2
		pshufd		$0x0E,MSG,MSG
		sha256rnds2	STATE1,STATE0

//	Rounds 56-59
	movdqa	MSGTMP2,MSG
		paddd		14*16(SHA256CONSTANTS),MSG
		sha256rnds2	STATE0,STATE1
	movdqa	MSGTMP2,MSGTMP4
	palignr	$4,MSGTMP1,MSGTMP4
	paddd	MSGTMP4,MSGTMP3
	sha256msg2	MSGTMP2,MSGTMP3
		pshufd		$0x0E,MSG,MSG
		sha256rnds2	STATE1,STATE0

//	Rounds 60-63
	movdqa	MSGTMP3,MSG
		paddd		15*16(SHA256CONSTANTS),MSG
		sha256rnds2	STATE0,STATE1
		pshufd		$0x0E,MSG,MSG
		sha256rnds2	STATE1,STATE0

//	Add current hash values with previously saved
	paddd	ABEF_SAVE,STATE0
	paddd	CDGH_SAVE,STATE1

//	Increment data pointer and loop if more to process
	add	$64,DATA_PTR
	cmp	NUM_BLKS,DATA_PTR
	jne	.Lloop0

//	Write hash values back in the correct order
	pshufd	$0x1B,STATE0,STATE0	# FEBA
	pshufd	$0xB1,STATE1,STATE1	# DCHG
	movdqa	STATE0,MSGTMP4
	pblendw	$0xF0,STATE1,STATE0	# DCBA
	palignr	$8,MSGTMP4,STATE1	# HGFE

	movdqu	STATE0,0*16(DIGEST_PTR)
	movdqu	STATE1,1*16(DIGEST_PTR)

.Ldone_hash:
	.leafepilogue
	.endfn	sha256_transform_ni,globl

//	pshufb control used on every 16 bytes of input. Stored little
//	endian, the selector bytes are 3,2,1,0, 7,6,5,4, 11,10,9,8,
//	15,14,13,12, i.e. the byte order inside each 32-bit word is
//	reversed while the word order is kept. The object lives in a
//	mergeable 16-byte-entry rodata section and is 16-byte aligned
//	because it is loaded with movdqa.
	.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK,"aM",@progbits,16
	.align	16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203
//	NOTE(review): .endobj comes from libc/macros.internal.h, not
//	visible here; presumably records the object symbol size/type.
	.endobj	PSHUFFLE_BYTE_FLIP_MASK