mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-07-27 13:00:28 +00:00
Make numerous improvements
- Python static hello world now 1.8mb - Python static fully loaded now 10mb - Python HTTPS client now uses MbedTLS - Python REPL now completes import stmts - Increase stack size for Python for now - Begin synthesizing posixpath and ntpath - Restore Python \N{UNICODE NAME} support - Restore Python NFKD symbol normalization - Add optimized code path for Intel SHA-NI - Get more Python unit tests passing faster - Get Python help() pagination working on NT - Python hashlib now supports MbedTLS PBKDF2 - Make memcpy/memmove/memcmp/bcmp/etc. faster - Add Mersenne Twister and Vigna to LIBC_RAND - Provide privileged __printf() for error code - Fix zipos opendir() so that it reports ENOTDIR - Add basic chmod() implementation for Windows NT - Add Cosmo's best functions to Python cosmo module - Pin function trace indent depth to that of caller - Show memory diagram on invalid access in MODE=dbg - Differentiate stack overflow on crash in MODE=dbg - Add stb_truetype and tools for analyzing font files - Upgrade to UNICODE 13 and reduce its binary footprint - COMPILE.COM now logs resource usage of build commands - Start implementing basic poll() support on bare metal - Set getauxval(AT_EXECFN) to GetModuleFileName() on NT - Add descriptions to strerror() in non-TINY build modes - Add COUNTBRANCH() macro to help with micro-optimizations - Make error / backtrace / asan / memory code more unbreakable - Add fast perfect C implementation of μ-Law and a-Law audio codecs - Make strtol() functions consistent with other libc implementations - Improve Linenoise implementation (see also github.com/jart/bestline) - COMPILE.COM now suppresses stdout/stderr of successful build commands
This commit is contained in:
parent
fa7b4f5bd1
commit
39bf41f4eb
806 changed files with 77494 additions and 63859 deletions
|
@ -14,6 +14,7 @@ struct RlDecode {
|
|||
};
|
||||
|
||||
void rldecode(void *dest, const struct RlDecode *) hidden;
|
||||
void rldecode2(void *dest, const struct RlDecode *) hidden;
|
||||
const uint8_t *lz4check(const void *data) hidden;
|
||||
void *lz4cpy(void *dest, const void *blockdata, size_t blocksize) hidden;
|
||||
void *lz4decode(void *dest, const void *src) hidden;
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ Copyright 2021 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
|
@ -18,36 +18,23 @@
|
|||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
|
||||
// Copies memory, tolerating overlap between the two regions.
|
||||
//
|
||||
// DEST and SRC may overlap: bytes are copied in ascending order when
// DEST is below SRC (unsigned compare) and in descending order otherwise.
|
||||
//
|
||||
// @param rdi is dest
|
||||
// @param rsi is src
|
||||
// @param rdx is number of bytes
|
||||
// @return original rdi copied to rax
|
||||
// @clob flags,rcx
|
||||
// @asyncsignalsafe
|
||||
memmove:
|
||||
mov %rdi,%rax // rax = dest; only this entry point sets the return value
|
||||
// 𝑠𝑙𝑖𝑑𝑒
// (there is no ret above; presumably .endfn emits only symbol metadata,
// so execution continues into MemMove below)
|
||||
// NOTE(review): this .endfn names MemMove although it sits before the
// MemMove label, while the .endfn at the bottom names memmove. Confirm
// against the .endfn macro whether the swap is intentional.
.endfn MemMove,globl,hidden
|
||||
|
||||
// Same copy as memmove but without writing rax. rdi and rsi are saved
// and restored around the copy; rcx is used as the rep counter.
MemMove:
|
||||
.leafprologue
|
||||
.profilable
|
||||
push %rdi // saved because rep movsb advances rdi
|
||||
push %rsi // saved because rep movsb advances rsi
|
||||
mov %rdx,%rcx // rcx = byte count consumed by rep movsb
|
||||
cmp %rsi,%rdi
|
||||
jb 1f // dest below src: ascending copy is used
|
||||
lea -1(%rdi,%rcx),%rdi // otherwise point rdi at the last dest byte
|
||||
lea -1(%rsi,%rcx),%rsi // and rsi at the last src byte
|
||||
std // DF=1 makes movsb walk downward
|
||||
1: rep movsb
|
||||
cld // leave DF clear on both paths
|
||||
pop %rsi
|
||||
pop %rdi
|
||||
.leafepilogue
|
||||
.endfn memmove,globl
|
||||
.source __FILE__
|
||||
.rodata // table is placed in the read-only data section
|
||||
.align 64 // table start is aligned to 64 bytes
|
||||
// Table of 64 32-bit words (16 rows of 4). The values are the SHA-256
// round constants K[0..63] as listed in FIPS 180-4, in order.
kSha256:
|
||||
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
|
||||
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
|
||||
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
|
||||
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
|
||||
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
|
||||
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
|
||||
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
|
||||
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
|
||||
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
|
||||
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
|
||||
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
|
||||
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
|
||||
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
.endobj kSha256,globl
|
|
@ -1,556 +0,0 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
@fileoverview Cosmopolitan Memory Copying
|
||||
|
||||
Of all the functions in the technology industry, none are more
|
||||
critical than the Kernighan & Ritchie Memory Copy API for the C
|
||||
Language, 1972 model: more commonly known as memcpy(). It's the
|
||||
world's most popular function──one all programmers love.
|
||||
|
||||
This implementation is the fastest and nearly the tiniest too.
|
||||
It doesn't break when copying backwards or on misaligned data.
|
||||
It's so easy that even a child could use it, and they do.
|
||||
*/
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/macros.internal.h"
|
||||
|
||||
// Copies memory.
|
||||
//
|
||||
// DEST and SRC must not overlap, unless DEST≤SRC.
|
||||
//
|
||||
// @param rdi is dest
|
||||
// @param rsi is src
|
||||
// @param rdx is number of bytes
|
||||
// @return original rdi copied to rax
|
||||
// @mode long
|
||||
// @asyncsignalsafe
|
||||
memcpy: mov %rdi,%rax // rax = dest; MemCpy below never writes rax
|
||||
// 𝑠𝑙𝑖𝑑𝑒
|
||||
.align 16
|
||||
.endfn memcpy,globl
|
||||
|
||||
// Copies memory w/ minimal impact ABI.
|
||||
//
// Dispatches on the byte count through memcpytab: counts below the
// table size index the table directly, larger counts all use the entry
// at index .Lmemcpytab.ro.size.
|
||||
// @param rdi is dest
|
||||
// @param rsi is src
|
||||
// @param rdx is number of bytes
|
||||
// @clob flags,rcx,xmm3,xmm4
// NOTE(review): the .L32 paths use the full ymm3/ymm4 registers, and the
// .Lnts path advances rdi/rsi and shrinks rdx without restoring them;
// neither is reflected in the clobber list above. Confirm with callers.
|
||||
// @mode long
|
||||
MemCpy: .leafprologue
|
||||
.profilable
|
||||
mov $.Lmemcpytab.ro.size,%ecx
|
||||
cmp %rcx,%rdx
|
||||
cmovb %rdx,%rcx // rcx = min(rdx, table size)
|
||||
jmp *memcpytab(,%rcx,8) // 8-byte entries, one per size class
|
||||
.Lanchorpoint: // base address for the byte offsets in memcpytab.ro
|
||||
.L32r: cmp $1024,%rdx
|
||||
jae .Lerms // 1024 bytes or more: use the rep movsb path
|
||||
.L32: vmovdqu -32(%rsi,%rdx),%ymm4 // load the last 32 source bytes up front
|
||||
mov $32,%rcx
|
||||
0: add $32,%rcx // rcx = end offset of this chunk plus 32
|
||||
vmovdqu -64(%rsi,%rcx),%ymm3
|
||||
vmovdqu %ymm3,-64(%rdi,%rcx) // copy 32 bytes at offset rcx-64
|
||||
cmp %rcx,%rdx
|
||||
ja 0b // loop while rdx > rcx
|
||||
vmovdqu %ymm4,-32(%rdi,%rdx) // store the tail; may overlap the last chunk
|
||||
vxorps %ymm4,%ymm4,%ymm4 // zero the vector scratch registers
|
||||
vxorps %ymm3,%ymm3,%ymm3
|
||||
jmp .L0
|
||||
.L16r: cmp $1024,%rdx
|
||||
jae .Lerms // 1024 bytes or more: use the rep movsb path
|
||||
.L16: movdqu -16(%rsi,%rdx),%xmm4 // load the last 16 source bytes up front
|
||||
mov $16,%rcx
|
||||
0: add $16,%rcx // rcx = end offset of this chunk plus 16
|
||||
movdqu -32(%rsi,%rcx),%xmm3
|
||||
movdqu %xmm3,-32(%rdi,%rcx) // copy 16 bytes at offset rcx-32
|
||||
cmp %rcx,%rdx
|
||||
ja 0b // loop while rdx > rcx
|
||||
movdqu %xmm4,-16(%rdi,%rdx) // store the tail; may overlap the last chunk
|
||||
pxor %xmm4,%xmm4 // zero the vector scratch registers
|
||||
pxor %xmm3,%xmm3
|
||||
jmp .L0
|
||||
.L8: push %rbx // table sizes 8..15; rbx is a second scratch register
|
||||
mov (%rsi),%rcx // first 8 bytes
|
||||
mov -8(%rsi,%rdx),%rbx // last 8 bytes (the two may overlap)
|
||||
mov %rcx,(%rdi)
|
||||
mov %rbx,-8(%rdi,%rdx)
|
||||
1: pop %rbx // shared exit for the paths that pushed rbx
|
||||
.L0: .leafepilogue // table size 0 and common return point
|
||||
.L4: push %rbx // table sizes 4..7
|
||||
mov (%rsi),%ecx // first 4 bytes
|
||||
mov -4(%rsi,%rdx),%ebx // last 4 bytes (the two may overlap)
|
||||
mov %ecx,(%rdi)
|
||||
mov %ebx,-4(%rdi,%rdx)
|
||||
jmp 1b
|
||||
.L3: push %rbx // table size 3
|
||||
mov (%rsi),%cx // bytes 0..1
|
||||
mov -2(%rsi,%rdx),%bx // bytes 1..2 when rdx is 3
|
||||
mov %cx,(%rdi)
|
||||
mov %bx,-2(%rdi,%rdx)
|
||||
jmp 1b
|
||||
.L2: mov (%rsi),%cx // table size 2: one 16-bit move
|
||||
mov %cx,(%rdi)
|
||||
jmp .L0
|
||||
.L1: mov (%rsi),%cl // table size 1: one byte move
|
||||
mov %cl,(%rdi)
|
||||
jmp .L0
|
||||
.Lerms: // large copies reached from .L16r/.L32r
|
||||
// Counts above the threshold go to the non-temporal path: 1 MiB when
// TINY is defined, otherwise the value stored at kHalfCache3
// (presumably half the L3 cache size; defined outside this view).
#ifdef TINY
|
||||
cmp $1024*1024,%rdx
|
||||
#else
|
||||
cmp kHalfCache3(%rip),%rdx
|
||||
#endif
|
||||
ja .Lnts
|
||||
push %rdi // rep movsb advances rdi/rsi, so save them
|
||||
push %rsi
|
||||
mov %rdx,%rcx // rcx = byte count for rep movsb
|
||||
rep movsb
|
||||
pop %rsi
|
||||
pop %rdi
|
||||
jmp .L0
|
||||
.Lnts: movdqu (%rsi),%xmm3 // copy the first 16 bytes unaligned
|
||||
movdqu %xmm3,(%rdi)
|
||||
lea 16(%rdi),%rcx
|
||||
and $-16,%rcx // rcx = next 16-byte boundary above rdi
|
||||
sub %rdi,%rcx // rcx = 1..16 bytes needed to align dest
|
||||
add %rcx,%rdi // rdi is now 16-byte aligned (not restored later)
|
||||
add %rcx,%rsi // rsi advanced by the same amount
|
||||
sub %rcx,%rdx // rdx = bytes remaining from the new rdi
|
||||
mov $16,%rcx
|
||||
0: add $16,%rcx // rcx = end offset of this chunk plus 16
|
||||
movdqu -32(%rsi,%rcx),%xmm3 // unaligned load from src
|
||||
movntdq %xmm3,-32(%rdi,%rcx) // non-temporal store to the aligned dest
|
||||
cmp %rcx,%rdx
|
||||
ja 0b // loop while rdx > rcx
|
||||
sfence // fence after the non-temporal stores
|
||||
movdqu -16(%rsi,%rdx),%xmm3 // last 16 bytes, copied unaligned
|
||||
movdqu %xmm3,-16(%rdi,%rdx)
|
||||
pxor %xmm3,%xmm3 // zero the vector scratch register
|
||||
jmp .L0
|
||||
.endfn MemCpy,globl,hidden
|
||||
.source __FILE__
|
||||
|
||||
.initro 300,_init_memcpy
|
||||
// Read-only template for the MemCpy jump table: one byte per copy size,
// each holding the distance from .Lanchorpoint to the handler label.
memcpytab.ro:
|
||||
.byte .L0-.Lanchorpoint // size 0
|
||||
.byte .L1-.Lanchorpoint // size 1
|
||||
.byte .L2-.Lanchorpoint // size 2
|
||||
.byte .L3-.Lanchorpoint // size 3
|
||||
.rept 4 // sizes 4..7
|
||||
.byte .L4-.Lanchorpoint
|
||||
.endr
|
||||
.rept 8 // sizes 8..15
|
||||
.byte .L8-.Lanchorpoint
|
||||
.endr
|
||||
.rept 16 // sizes 16..31
|
||||
.byte .L16-.Lanchorpoint
|
||||
.endr
|
||||
.equ .Lmemcpytab.ro.size,.-memcpytab.ro // number of per-size entries above
|
||||
.endobj memcpytab.ro
|
||||
// Build fails unless the entry count is a multiple of 8.
// NOTE(review): presumably a requirement of memjmpinit - confirm there.
.if .Lmemcpytab.ro.size % 8
|
||||
.error "moar jmptab"
|
||||
.endif
|
||||
// Four candidate handlers placed after the sized entries, followed by
// four zero bytes. NOTE(review): presumably memjmpinit picks one of these
// by CPU feature for the final memcpytab slot - confirm in memjmpinit.
.byte .L16-.Lanchorpoint # SSE2
|
||||
.byte .L16r-.Lanchorpoint # SSE2 + ERMS
|
||||
.byte .L32-.Lanchorpoint # AVX
|
||||
.byte .L32r-.Lanchorpoint # AVX + ERMS
|
||||
.byte 0,0,0,0
|
||||
.previous
|
||||
|
||||
.initbss 300,_init_memcpy
|
||||
// Runtime jump table indexed by MemCpy: one quad per sized entry plus one
// extra quad, which is the slot MemCpy uses for every larger count.
memcpytab:
|
||||
.rept .Lmemcpytab.ro.size
|
||||
.quad 0
|
||||
.endr
|
||||
.quad 0
|
||||
.endobj memcpytab
|
||||
.previous
|
||||
|
||||
// Startup fragment: hands the entry count (rcx), the anchor address (via
// ezlea into dx) and the flags of an AVX feature test to memjmpinit.
// memjmpinit is defined elsewhere; how it fills memcpytab is not visible.
.init.start 300,_init_memcpy
|
||||
pushpop .Lmemcpytab.ro.size,%rcx
|
||||
ezlea .Lanchorpoint,dx
|
||||
testb X86_HAVE(AVX)+kCpuids(%rip)
|
||||
call memjmpinit
|
||||
.init.end 300,_init_memcpy
|
||||
|
||||
/* your memcpy() 375 bytes
|
||||
bionic memcpy() 1,429 bytes
|
||||
glibc memcpy() 27,216 bytes
|
||||
musl memcpy() 49 bytes
|
||||
newlib memcpy() 300 bytes
|
||||
|
||||
benchmarks on intel core i7-6700 @ 3.40GHz (skylake)
|
||||
includes function call overhead (unless marked otherwise)
|
||||
|
||||
your memcpy(𝑛) for #c per n where c ≈ 0.293ns
|
||||
N x1 x8 x64 mBps
|
||||
------------------------------------------------------------
|
||||
1 297.000 35.125 35.203 92
|
||||
1 35.000 35.625 35.016 93
|
||||
2 27.500 17.438 17.555 185
|
||||
3 21.000 11.875 12.057 270
|
||||
4 16.250 8.719 8.809 369
|
||||
7 5.000 4.946 5.069 641
|
||||
8 7.375 4.422 4.365 745
|
||||
15 4.067 2.342 2.336 1391
|
||||
16 4.188 2.242 2.257 1440 «
|
||||
31 8.032 1.157 1.147 2835
|
||||
32 2.031 1.723 1.325 2454
|
||||
63 1.000 0.589 0.589 5523
|
||||
64 0.578 0.580 0.577 5630 «
|
||||
127 0.638 0.377 0.320 10151
|
||||
128 0.289 0.296 0.307 10605
|
||||
255 0.404 0.202 0.194 16741
|
||||
256 0.160 0.165 0.166 19574 «
|
||||
511 0.159 0.123 0.110 29458
|
||||
512 0.139 0.098 0.097 33571 «
|
||||
1023 0.107 0.086 0.074 44111
|
||||
1024 0.103 0.084 0.082 39489
|
||||
2047 0.057 0.056 0.057 57450
|
||||
2048 0.055 0.055 0.055 59269
|
||||
4095 0.044 0.044 0.044 74051
|
||||
4096 0.043 0.043 0.043 75300 «
|
||||
8191 0.036 0.036 0.036 91301
|
||||
8192 0.036 0.035 0.035 92411
|
||||
16383 0.033 0.032 0.032 102163
|
||||
16384 0.034 0.032 0.032 102145 « (L1)/2
|
||||
32767 0.098 0.081 0.077 42271
|
||||
32768 0.077 0.077 0.076 42781
|
||||
65535 0.088 0.075 0.072 44973
|
||||
65536 0.074 0.072 0.071 45520
|
||||
131071 0.086 0.075 0.072 44869
|
||||
131072 0.077 0.073 0.072 45076 « (L2)/2
|
||||
262143 0.095 0.096 0.095 34116
|
||||
262144 0.096 0.096 0.095 34160
|
||||
524287 0.102 0.109 0.111 29359
|
||||
524288 0.107 0.109 0.108 30033
|
||||
1048575 0.102 0.103 0.104 31112
|
||||
1048576 0.101 0.103 0.103 31605
|
||||
2097151 0.104 0.103 0.109 29929
|
||||
2097152 0.108 0.110 0.103 31652
|
||||
4194303 0.192 0.172 0.172 18950
|
||||
4194304 0.168 0.161 0.160 20311 « (L3)/2
|
||||
8388607 0.339 0.329 0.344 9461 « RAM
|
||||
8388608 0.384 0.369 0.341 9545
|
||||
|
||||
Bionic memcpy() for #c per n where c ≈ 0.293ns
|
||||
N x1 x8 x64 mBps
|
||||
------------------------------------------------------------
|
||||
1 347.000 40.625 35.984 90
|
||||
1 37.000 35.625 36.734 89
|
||||
2 28.500 18.688 18.383 177
|
||||
3 11.667 12.375 12.359 263
|
||||
4 12.250 9.406 9.020 361
|
||||
7 5.000 5.018 5.118 636
|
||||
8 11.625 5.828 4.779 681
|
||||
15 3.533 3.158 2.620 1243
|
||||
16 4.688 2.742 2.884 1129 «
|
||||
31 1.903 1.262 1.172 2778
|
||||
32 1.344 1.113 1.125 2895
|
||||
63 1.444 0.633 0.591 5513
|
||||
64 0.766 0.580 0.581 5605 «
|
||||
127 0.512 0.383 0.318 10229
|
||||
128 0.461 0.315 0.311 10463
|
||||
255 0.475 0.216 0.193 16840
|
||||
256 0.371 0.236 0.199 16397 «
|
||||
511 0.295 0.144 0.120 27223
|
||||
512 0.240 0.151 0.126 25937 «
|
||||
1023 0.142 0.101 0.088 36947
|
||||
1024 0.126 0.108 0.091 35889
|
||||
2047 0.088 0.074 0.072 45475
|
||||
2048 0.089 0.077 0.073 44380
|
||||
4095 0.081 0.065 0.064 50766
|
||||
4096 0.068 0.066 0.065 50246 «
|
||||
8191 0.063 0.061 0.060 54075
|
||||
8192 0.065 0.061 0.061 53731
|
||||
16383 0.082 0.066 0.061 53765
|
||||
16384 0.067 0.063 0.062 52765 « (L1)/2
|
||||
32767 0.102 0.085 0.085 38406
|
||||
32768 0.086 0.085 0.085 38473
|
||||
65535 0.098 0.085 0.085 38292
|
||||
65536 0.086 0.085 0.085 38369
|
||||
131071 0.438 0.177 0.089 36716
|
||||
131072 0.092 0.090 0.093 34880 « (L2)/2
|
||||
262143 0.306 0.146 0.127 25601
|
||||
262144 0.126 0.168 0.127 25704
|
||||
524287 0.213 0.152 0.136 23993
|
||||
524288 0.132 0.159 0.133 24570
|
||||
1048575 0.127 0.129 0.130 25117
|
||||
1048576 0.128 0.129 0.130 25107
|
||||
2097151 0.127 0.127 0.129 25199
|
||||
2097152 0.127 0.136 0.134 24274
|
||||
4194303 0.216 0.192 0.228 14237
|
||||
4194304 0.351 0.351 0.356 9139 « (L3)/2
|
||||
8388607 0.323 0.293 0.298 10903 « RAM
|
||||
8388608 0.365 0.296 0.300 10844
|
||||
|
||||
GCC builtin (Inline REP MOVSB) for #c per n where c ≈ 0.293ns
|
||||
N x1 x8 x64 mBps
|
||||
------------------------------------------------------------
|
||||
1 53.000 50.625 50.453 64
|
||||
1 47.000 49.375 49.141 66
|
||||
2 23.500 25.062 24.898 131
|
||||
3 15.667 16.792 16.880 193
|
||||
4 11.750 12.531 12.957 251
|
||||
7 7.000 7.125 7.190 452
|
||||
8 6.125 7.578 6.322 514
|
||||
15 3.133 3.325 3.372 964
|
||||
16 3.062 3.117 3.132 1038 «
|
||||
31 1.645 1.601 1.620 2007
|
||||
32 1.531 1.559 1.585 2051
|
||||
63 0.778 0.796 0.802 4056
|
||||
64 0.766 0.768 0.767 4238 «
|
||||
127 0.480 0.446 0.448 7259
|
||||
128 0.445 0.419 0.423 7693
|
||||
255 0.239 0.239 0.236 13781
|
||||
256 0.238 0.225 0.225 14466 «
|
||||
511 0.127 0.133 0.132 24555
|
||||
512 0.123 0.127 0.128 25377 «
|
||||
1023 0.079 0.081 0.081 40346
|
||||
1024 0.075 0.077 0.078 41714
|
||||
2047 0.053 0.055 0.055 59575
|
||||
2048 0.053 0.053 0.053 60795
|
||||
4095 0.042 0.043 0.043 75843
|
||||
4096 0.042 0.042 0.042 77153
|
||||
8191 0.035 0.036 0.036 91518
|
||||
8192 0.035 0.035 0.035 92603
|
||||
16383 0.032 0.032 0.032 102407
|
||||
16384 0.033 0.032 0.032 102864 « (L1)/2
|
||||
32767 0.106 0.082 0.078 41486
|
||||
32768 0.079 0.078 0.079 41290
|
||||
65535 0.090 0.077 0.075 43565
|
||||
65536 0.074 0.074 0.073 44299
|
||||
131071 0.091 0.078 0.075 43196
|
||||
131072 0.078 0.076 0.074 43673 « (L2)/2
|
||||
262143 0.097 0.099 0.098 33192
|
||||
262144 0.098 0.098 0.098 33193
|
||||
524287 0.105 0.111 0.111 29212
|
||||
524288 0.109 0.111 0.111 29211
|
||||
1048575 0.107 0.108 0.108 30069
|
||||
1048576 0.106 0.112 0.105 30886
|
||||
2097151 0.105 0.103 0.103 31621
|
||||
2097152 0.102 0.103 0.104 31280
|
||||
4194303 0.180 0.158 0.176 18456
|
||||
4194304 0.167 0.155 0.154 21098 « (L3)/2
|
||||
8388607 0.538 0.576 0.557 5834 « RAM
|
||||
8388608 0.750 0.579 0.552 5893
|
||||
|
||||
glibc memcpy() for #c per n where c ≈ 0.293ns
|
||||
N x1 x8 x64 mBps
|
||||
------------------------------------------------------------
|
||||
1 139.000 90.125 84.891 38
|
||||
1 83.000 82.125 84.359 39
|
||||
2 61.500 46.438 45.164 72
|
||||
3 41.667 32.458 31.245 104
|
||||
4 32.750 26.156 24.410 133
|
||||
7 20.143 16.732 16.033 203
|
||||
8 13.375 8.328 6.908 471
|
||||
15 8.200 6.408 5.753 565
|
||||
16 4.438 3.570 3.466 938 «
|
||||
31 3.258 2.891 2.786 1167
|
||||
32 2.281 1.801 1.732 1878
|
||||
63 1.635 1.431 1.374 2367
|
||||
64 1.109 0.896 0.868 3747 «
|
||||
127 0.921 0.792 0.779 4176
|
||||
128 0.508 0.511 0.494 6589
|
||||
255 0.451 0.407 0.402 8081
|
||||
256 0.324 0.269 0.260 12498 «
|
||||
511 0.249 0.218 0.212 15335
|
||||
512 0.178 0.149 0.146 22297 «
|
||||
1023 0.138 0.124 0.121 26947
|
||||
1024 0.087 0.089 0.087 37238
|
||||
2047 0.084 0.077 0.076 43046
|
||||
2048 0.066 0.059 0.058 56120
|
||||
4095 0.058 0.054 0.054 60706
|
||||
4096 0.050 0.046 0.046 71092 «
|
||||
8191 0.043 0.042 0.042 78259
|
||||
8192 0.037 0.037 0.037 87409
|
||||
16383 0.037 0.036 0.035 92065
|
||||
16384 0.034 0.034 0.033 97942 « (L1)/2
|
||||
32767 0.104 0.084 0.080 40572
|
||||
32768 0.079 0.079 0.079 41055
|
||||
65535 0.094 0.080 0.076 42885
|
||||
65536 0.077 0.075 0.075 43423
|
||||
131071 0.092 0.080 0.078 41498
|
||||
131072 0.082 0.078 0.077 42350 « (L2)/2
|
||||
262143 0.100 0.101 0.287 11342
|
||||
262144 0.099 0.099 0.098 33177
|
||||
524287 0.106 0.111 0.110 29609
|
||||
524288 0.107 0.119 0.110 29608
|
||||
1048575 0.104 0.105 0.106 30626
|
||||
1048576 0.104 0.111 0.105 30878
|
||||
2097151 0.103 0.103 0.103 31606
|
||||
2097152 0.102 0.103 0.103 31644
|
||||
4194303 0.174 0.160 0.165 19714
|
||||
4194304 0.166 0.157 0.154 21110 « (L3)/2
|
||||
8388607 0.537 0.554 0.565 5750 « RAM
|
||||
8388608 0.701 0.537 0.552 5884
|
||||
|
||||
musl memcpy() for #c per n where c ≈ 0.293ns
|
||||
N x1 x8 x64 mBps
|
||||
------------------------------------------------------------
|
||||
1 97.000 80.625 79.891 41
|
||||
1 77.000 78.875 78.266 42
|
||||
2 49.500 44.062 42.102 77
|
||||
3 33.667 32.792 30.651 106
|
||||
4 29.750 24.281 24.137 135
|
||||
7 19.000 16.161 15.734 207
|
||||
8 12.125 7.766 6.721 484
|
||||
15 8.867 5.892 5.714 569
|
||||
16 5.062 3.742 3.458 940
|
||||
31 3.645 2.915 2.715 1198
|
||||
32 2.156 1.723 1.663 1956
|
||||
63 1.540 1.367 1.333 2440
|
||||
64 1.078 0.873 0.833 3905
|
||||
127 0.874 0.771 0.737 4415
|
||||
128 0.617 0.487 0.481 6766
|
||||
255 0.443 0.390 0.382 8504
|
||||
256 0.316 0.259 0.259 12545
|
||||
511 0.245 0.232 0.237 13742
|
||||
512 0.174 0.159 0.208 15668
|
||||
1023 0.181 0.193 0.182 17821
|
||||
1024 0.155 0.123 0.114 28579
|
||||
2047 0.102 0.092 0.085 38219
|
||||
2048 0.064 0.073 0.070 46577
|
||||
4095 0.058 0.067 0.065 50272
|
||||
4096 0.049 0.055 0.055 59467
|
||||
8191 0.057 0.052 0.049 66468
|
||||
8192 0.053 0.050 0.051 63557
|
||||
16383 0.082 0.065 0.064 50897
|
||||
16384 0.066 0.065 0.061 53697 « (L1)/2
|
||||
32767 0.121 0.100 0.114 28555
|
||||
32768 0.093 0.091 0.114 28615
|
||||
65535 0.118 0.102 0.142 22858
|
||||
65536 0.108 0.274 0.097 33432
|
||||
131071 0.117 0.109 0.109 29905
|
||||
131072 0.110 0.195 0.113 28692 « (L2)/2
|
||||
262143 0.283 0.166 0.122 26638
|
||||
262144 0.130 0.144 0.123 26544
|
||||
524287 0.210 0.153 0.130 25079
|
||||
524288 0.126 0.128 0.123 26422
|
||||
1048575 0.139 0.107 0.106 30803
|
||||
1048576 0.104 0.105 0.106 30683
|
||||
2097151 0.103 0.103 0.103 31564
|
||||
2097152 0.102 0.103 0.103 31531
|
||||
4194303 0.242 0.158 0.169 19238
|
||||
4194304 0.166 0.161 0.154 21072 « (L3)/2
|
||||
8388607 0.533 0.549 0.599 5422 « RAM
|
||||
8388608 0.768 0.630 0.560 5801
|
||||
|
||||
newlib (aka. cygwin) memcpy() for #c per n where c ≈ 0.293ns
|
||||
N x1 x8 x64 mBps
|
||||
------------------------------------------------------------
|
||||
1 61.000 52.875 53.141 61
|
||||
1 49.000 49.875 50.328 65
|
||||
2 24.500 24.812 26.727 122
|
||||
3 15.667 20.125 16.943 192
|
||||
4 12.750 15.281 13.090 248
|
||||
7 7.000 7.375 7.431 438
|
||||
8 5.875 6.422 6.377 510
|
||||
15 3.267 3.375 3.447 943
|
||||
16 10.062 6.945 6.386 509
|
||||
31 2.548 2.488 2.545 1278
|
||||
32 3.156 3.207 3.201 1016
|
||||
63 1.190 1.220 1.229 2646
|
||||
64 1.578 1.588 1.599 2033
|
||||
127 0.717 0.690 0.685 4744
|
||||
128 0.820 0.856 0.857 3795
|
||||
255 0.357 0.359 0.358 9077
|
||||
256 0.629 0.461 0.426 7630
|
||||
511 0.260 0.219 0.204 15947
|
||||
512 0.330 0.299 0.268 12113
|
||||
1023 0.269 0.175 0.162 20042
|
||||
1024 0.315 0.201 0.196 16633
|
||||
2047 0.349 0.241 0.236 13790
|
||||
2048 0.332 0.269 0.264 12295
|
||||
4095 0.349 0.295 0.287 11348
|
||||
4096 0.361 0.313 0.303 10748
|
||||
8191 0.361 0.317 0.322 10110
|
||||
8192 0.369 0.326 0.319 10201
|
||||
16383 0.321 0.322 0.327 9940
|
||||
16384 0.309 0.330 0.329 9878 « (L1)/2
|
||||
32767 0.291 0.303 0.307 10599
|
||||
32768 0.314 0.304 0.305 10667
|
||||
65535 0.373 0.311 0.313 10396
|
||||
65536 0.305 0.750 0.421 7729
|
||||
131071 0.329 0.427 0.384 8470
|
||||
131072 0.329 0.388 0.361 9020 « (L2)/2
|
||||
262143 0.520 0.389 0.425 7646
|
||||
262144 0.364 0.400 0.368 8843
|
||||
524287 0.449 0.389 0.389 8353
|
||||
524288 0.384 0.379 0.384 8466
|
||||
1048575 0.436 0.397 0.401 8107
|
||||
1048576 0.431 0.397 0.401 8112
|
||||
2097151 0.417 0.567 0.434 7498
|
||||
2097152 0.457 0.503 0.427 7621
|
||||
4194303 0.328 0.348 0.368 8822
|
||||
4194304 0.343 0.352 0.352 9221 « (L3)/2
|
||||
8388607 0.313 0.319 0.326 9957 « RAM
|
||||
8388608 0.366 0.320 0.328 9910
|
||||
|
||||
openbsd memcpy() for #c per n where c ≈ 0.293ns
|
||||
N x1 x8 x64 mBps
|
||||
------------------------------------------------------------
|
||||
1 73.000 41.375 41.484 78
|
||||
1 39.000 39.875 41.641 78
|
||||
2 28.500 20.688 21.227 153
|
||||
3 27.000 15.875 15.557 209
|
||||
4 16.750 12.656 12.520 260
|
||||
7 20.429 10.982 10.292 316
|
||||
8 8.625 5.234 5.576 583
|
||||
15 7.267 4.758 4.920 661
|
||||
16 4.312 2.742 2.747 1183
|
||||
31 4.613 2.891 2.555 1272
|
||||
32 2.844 1.520 1.441 2256
|
||||
63 2.397 1.268 1.328 2449
|
||||
64 1.547 0.822 0.769 4226
|
||||
127 1.189 0.782 0.671 4842
|
||||
128 0.727 0.532 0.460 7066
|
||||
255 0.631 0.463 0.414 7856
|
||||
256 0.543 0.374 0.302 10775
|
||||
511 0.542 0.316 0.276 11785
|
||||
512 0.354 0.260 0.224 14494
|
||||
1023 0.267 0.245 0.229 14201
|
||||
1024 0.251 0.200 0.197 16496
|
||||
2047 0.214 0.226 0.181 17941
|
||||
2048 0.189 0.167 0.166 19575
|
||||
4095 0.200 0.168 0.163 19957
|
||||
4096 0.165 0.155 0.153 21219
|
||||
8191 0.158 0.153 0.151 21578
|
||||
8192 0.153 0.148 0.147 22138
|
||||
16383 0.173 0.148 0.146 22319
|
||||
16384 0.153 0.487 0.188 17298 « (L1)/2
|
||||
32767 0.161 0.151 0.192 16893
|
||||
32768 0.151 0.314 0.213 15275
|
||||
65535 0.157 0.154 0.148 21969
|
||||
65536 0.147 0.145 0.145 22493
|
||||
131071 0.152 0.151 0.154 21145
|
||||
131072 0.148 0.229 0.158 20564 « (L2)/2
|
||||
262143 0.320 0.183 0.162 20031
|
||||
262144 0.330 0.205 0.167 19503
|
||||
524287 0.159 0.171 0.163 19913
|
||||
524288 0.250 0.189 0.162 20120
|
||||
1048575 0.157 0.164 0.161 20182
|
||||
1048576 0.155 0.156 0.157 20672
|
||||
2097151 0.161 0.158 0.157 20644
|
||||
2097152 0.158 0.157 0.165 19727
|
||||
4194303 0.327 0.256 0.238 13684
|
||||
4194304 0.232 0.220 0.236 13749 « (L3)/2
|
||||
8388607 0.721 0.689 0.586 5549 « RAM
|
||||
8388608 0.943 0.569 0.593 5481 */
|
|
@ -1,33 +0,0 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
.source __FILE__
|
||||
|
||||
// Copies memory and returns a pointer just past the last byte written.
|
||||
//
|
||||
// DEST and SRC must not overlap unless DEST ≤ SRC.
|
||||
//
|
||||
// @param rdi is dest
|
||||
// @param rsi is src
|
||||
// @param rdx is number of bytes
|
||||
// @return original rdi + rdx copied to rax
|
||||
mempcpy:
|
||||
lea (%rdi,%rdx),%rax // rax = dest + n, computed before the copy
|
||||
jmp MemCpy // tail jump; relies on MemCpy leaving rax untouched
|
||||
.endfn mempcpy,globl
|
|
@ -1,406 +0,0 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 sw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
@fileoverview Cosmopolitan Memory Setter
|
||||
|
||||
This sets one bit per picosecond on a $900 Skylake workstation,
|
||||
which is about 110 GBps. */
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/nexgen32e/macros.h"
|
||||
#include "libc/macros.internal.h"
|
||||
|
||||
// Sets memory.
|
||||
//
|
||||
// @param rdi is dest
|
||||
// @param esi is the byte to set
|
||||
// @param edx is the number of bytes to set
|
||||
// @return original rdi copied to rax
|
||||
// @mode long
|
||||
// @asyncsignalsafe
|
||||
memset: mov %rdi,%rax
|
||||
// 𝑠𝑙𝑖𝑑𝑒
|
||||
.align 16
|
||||
.endfn memset,globl
|
||||
|
||||
// Sets memory w/ minimal-impact ABI.
|
||||
//
|
||||
// @param rdi is dest
|
||||
// @param esi is the byte to set
|
||||
// @param edx is the number of bytes to set
|
||||
// @clob flags,rcx,xmm3
|
||||
// @mode long
|
||||
// Dispatch overview: the byte count in rdx is clamped to the number of
// size-indexed table slots and used as an index into memsettab, so each
// small size jumps straight to a handler specialised for it. Every
// count at or above the clamp shares the single extra slot at the end
// of the table. All handlers leave through .L0.
MemSet: .leafprologue
|
||||
.profilable
|
||||
mov $.Lmemsettab.ro.size,%ecx // rcx = count of size-indexed slots
|
||||
cmp %rcx,%rdx
|
||||
cmovb %rdx,%rcx // rcx = min(n, slot count), unsigned
|
||||
jmp *memsettab(,%rcx,8) // indirect jump via 8-byte table entry
|
||||
.Lanchorpoint: // base address for the byte offsets stored in memsettab.ro
|
||||
.L32r: cmp $1024,%rdx // 32-byte vector path with rep stosb cutoff
|
||||
jae .Lerms // n >= 1024 goes to rep stosb
|
||||
.L32: vmovd %esi,%xmm3 // 32-byte vector path
|
||||
vpbroadcastb %xmm3,%ymm3 // ymm3 = low byte of esi in all 32 lanes
|
||||
mov $32,%ecx
|
||||
1: lea 32(%rcx),%rcx // rcx = end offset of chunk + 32
|
||||
vmovdqu %ymm3,-64(%rdi,%rcx) // store 32 bytes at offset rcx-64
|
||||
cmp %rcx,%rdx
|
||||
ja 1b // keep going while n > rcx (unsigned)
|
||||
vmovdqu %ymm3,-32(%rdi,%rdx) // last 32 bytes; may overlap prior store
|
||||
vpxor %ymm3,%ymm3,%ymm3 // zero ymm3 before leaving
|
||||
jmp .L0
|
||||
.L16r: cmp $1024,%rdx // 16-byte vector path with rep stosb cutoff
|
||||
jae .Lerms // n >= 1024 goes to rep stosb
|
||||
.L16: movd %esi,%xmm3 // 16-byte vector path (table slots 16..31)
|
||||
pbroadcastb %xmm3 // macro, not defined in this file; presumably fills xmm3 with its low byte
|
||||
mov $16,%ecx
|
||||
1: lea 16(%rcx),%rcx // rcx = end offset of chunk + 16
|
||||
movdqu %xmm3,-32(%rdi,%rcx) // store 16 bytes at offset rcx-32
|
||||
cmp %rcx,%rdx
|
||||
ja 1b // keep going while n > rcx (unsigned)
|
||||
movdqu %xmm3,-16(%rdi,%rdx) // last 16 bytes; may overlap prior store
|
||||
pxor %xmm3,%xmm3 // zero xmm3 before leaving
|
||||
.L0: .leafepilogue // common exit; also the handler for n == 0
|
||||
.L8: movzbl %sil,%ecx // table slots 8..15
|
||||
imul .Lb8(%rip),%rcx // rcx = fill byte repeated in all 8 bytes
|
||||
mov %rcx,(%rdi) // first 8 bytes
|
||||
mov %rcx,-8(%rdi,%rdx) // last 8 bytes; overlaps the first store when n < 16
|
||||
jmp .L0
|
||||
.L4: movzbl %sil,%ecx // table slots 4..7
|
||||
imul $0x01010101,%ecx,%ecx // ecx = fill byte repeated in all 4 bytes
|
||||
mov %ecx,(%rdi) // first 4 bytes
|
||||
mov %ecx,-4(%rdi,%rdx) // last 4 bytes; overlaps the first store when n < 8
|
||||
jmp .L0
|
||||
.L3: mov %sil,2(%rdi) // n == 3 enters here and falls through
|
||||
.L2: mov %sil,1(%rdi) // n == 2 enters here and falls through
|
||||
.L1: mov %sil,(%rdi) // n == 1 enters here
|
||||
jmp .L0
|
||||
.Lerms: push %rax // rep stosb path: rax and rdi are overwritten below, so save them
|
||||
push %rdi
|
||||
mov %esi,%eax // al = fill byte
|
||||
mov %rdx,%rcx // rcx = byte count
|
||||
rep stosb // store al to (rdi), rcx times
|
||||
pop %rdi
|
||||
pop %rax
|
||||
jmp .L0
|
||||
.endfn MemSet,globl,hidden
|
||||
.source __FILE__
|
||||
|
||||
// Constant used by the .L8 path of MemSet: multiplying a zero-extended
// byte by this value repeats that byte in all eight bytes of the product.
.rodata.cst8
|
||||
.Lb8: .quad 0x0101010101010101
|
||||
.previous
|
||||
|
||||
// Compact description of the jump table: one byte per slot, each byte
// being the distance of a handler label from .Lanchorpoint. The slot
// index is the byte count n, so slots 0..3 get their own handlers,
// 4..7 share .L4, 8..15 share .L8 and 16..31 share .L16.
.initro 300,_init_memset
|
||||
memsettab.ro:
|
||||
.byte .L0 - .Lanchorpoint
|
||||
.byte .L1 - .Lanchorpoint
|
||||
.byte .L2 - .Lanchorpoint
|
||||
.byte .L3 - .Lanchorpoint
|
||||
.rept 4
|
||||
.byte .L4 - .Lanchorpoint
|
||||
.endr
|
||||
.rept 8
|
||||
.byte .L8 - .Lanchorpoint
|
||||
.endr
|
||||
.rept 16
|
||||
.byte .L16 - .Lanchorpoint
|
||||
.endr
|
||||
.equ .Lmemsettab.ro.size,.-memsettab.ro
|
||||
.endobj memsettab.ro
|
||||
// Build-time guard: fail assembly unless the slot count is a multiple of 8.
.if .Lmemsettab.ro.size % 8
|
||||
.error "moar jmptab"
|
||||
.endif
|
||||
// Candidate handlers for the final slot (every n >= slot count), labelled
// by the CPU features each needs. NOTE(review): presumably memjmpinit
// picks one of these at startup; its selection logic is not in this file.
.byte .L16 - .Lanchorpoint # SSE2
|
||||
.byte .L16r - .Lanchorpoint # SSE2 + ERMS
|
||||
.byte .L32 - .Lanchorpoint # AVX2
|
||||
.byte .L32r - .Lanchorpoint # AVX2 + ERMS
|
||||
.byte 0,0,0,0
|
||||
.previous
|
||||
|
||||
// The jump table MemSet indexes with `jmp *memsettab(,%rcx,8)`: one
// 8-byte entry per size-indexed slot plus one extra entry for all larger
// sizes. It is all zeroes here; presumably memjmpinit fills it in from
// memsettab.ro during initialization (TODO confirm against memjmpinit).
.initbss 300,_init_memset
|
||||
memsettab:
|
||||
.rept .Lmemsettab.ro.size
|
||||
.quad 0
|
||||
.endr
|
||||
.quad 0
|
||||
.endobj memsettab
|
||||
.previous
|
||||
|
||||
// Startup fragment that hands the table description to memjmpinit.
// NOTE(review): memjmpinit's register contract is defined elsewhere; the
// per-line notes below only state what each line loads.
.init.start 300,_init_memset
|
||||
pushpop .Lmemsettab.ro.size,%rcx // rcx = number of size-indexed slots
|
||||
ezlea .Lanchorpoint,dx // rdx = address the byte offsets are relative to
|
||||
testb X86_HAVE(AVX2)+kCpuids(%rip) // set flags from the AVX2 feature bit
|
||||
call memjmpinit
|
||||
.init.end 300,_init_memset
|
||||
|
||||
/* benchmarks on intel core i7-6700 @ 3.40GHz (skylake)
|
||||
includes function call overhead (unless marked otherwise)
|
||||
|
||||
Your memset() for #c per n where c ≈ 0.273ns
|
||||
N x1 x8 x64 mBps
|
||||
------------------------------------------------------------
|
||||
1 73.000 35.125 36.141 97
|
||||
1 35.000 36.375 35.984 97
|
||||
2 28.500 19.938 18.820 185
|
||||
3 19.000 12.458 12.651 276
|
||||
4 15.750 10.719 9.566 365
|
||||
7 5.000 5.411 5.730 609
|
||||
8 8.375 4.953 4.697 743
|
||||
15 4.200 2.408 2.407 1450
|
||||
16 7.188 2.539 2.382 1465 «
|
||||
31 1.129 1.206 1.183 2950
|
||||
32 15.156 2.012 1.292 2702
|
||||
63 4.016 0.986 0.663 5264
|
||||
64 3.547 0.967 0.684 5104
|
||||
127 2.087 0.562 0.338 10311
|
||||
128 1.805 0.499 0.336 10393
|
||||
255 0.412 0.180 0.183 19119
|
||||
256 0.160 0.170 0.169 20650
|
||||
511 0.162 0.134 0.108 32214
|
||||
512 0.100 0.106 0.104 33507
|
||||
1023 0.110 0.095 0.082 42574
|
||||
1024 0.099 0.080 0.078 44944
|
||||
2047 0.155 0.154 0.154 22624
|
||||
2048 0.052 0.052 0.053 66266
|
||||
4095 0.098 0.099 0.099 35142
|
||||
4096 0.042 0.042 0.041 84250
|
||||
8191 0.072 0.073 0.072 48157
|
||||
8192 0.034 0.034 0.034 101332
|
||||
16383 0.059 0.059 0.059 58997
|
||||
16384 0.031 0.031 0.031 112972
|
||||
32767 0.054 0.054 0.054 65053
|
||||
32768 0.029 0.029 0.029 119433
|
||||
65535 0.069 0.069 0.068 51690
|
||||
65536 0.057 0.057 0.057 61434
|
||||
131071 0.066 0.066 0.066 53001
|
||||
131072 0.057 0.058 0.057 60716
|
||||
262143 0.066 0.065 0.065 53462
|
||||
262144 0.060 0.058 0.058 60104
|
||||
524287 0.067 0.068 0.072 48784
|
||||
524288 0.063 0.062 0.061 56957
|
||||
1048575 0.068 0.068 0.069 50353
|
||||
1048576 0.062 0.060 0.062 56661
|
||||
2097151 0.066 0.066 0.067 52421
|
||||
2097152 0.060 0.060 0.061 57672
|
||||
4194303 0.072 0.067 0.067 51910
|
||||
4194304 0.062 0.061 0.062 56327
|
||||
8388607 0.129 0.111 0.111 31368
|
||||
8388608 0.136 0.119 0.111 31519
|
||||
|
||||
glibc memset() for #c per n where c ≈ 0.273ns
|
||||
N x1 x8 x64 mBps
|
||||
------------------------------------------------------------
|
||||
1 121.000 39.125 35.547 98
|
||||
1 33.000 35.875 35.172 99
|
||||
2 17.500 18.312 18.070 193
|
||||
3 16.333 14.542 12.411 281
|
||||
4 12.250 9.344 9.215 379
|
||||
7 7.571 5.732 5.453 640
|
||||
8 4.625 4.641 4.623 755
|
||||
15 4.467 3.158 2.478 1408
|
||||
16 2.312 2.289 2.468 1414
|
||||
31 2.290 1.367 1.278 2731
|
||||
32 1.219 1.176 1.182 2952
|
||||
63 0.905 0.696 0.656 5320
|
||||
64 0.672 0.658 0.660 5285
|
||||
127 1.299 0.723 0.673 5183
|
||||
128 0.508 0.423 0.424 8227
|
||||
255 0.490 0.428 0.417 8367
|
||||
256 0.293 0.233 0.243 14349
|
||||
511 0.284 0.232 0.234 14902
|
||||
512 0.154 0.131 0.131 26626
|
||||
1023 0.155 0.137 0.135 25839
|
||||
1024 0.089 0.078 0.080 43875
|
||||
2047 0.103 0.092 0.090 38672
|
||||
2048 0.060 0.054 0.054 65116
|
||||
4095 0.073 0.068 0.068 51405
|
||||
4096 0.046 0.042 0.042 82162
|
||||
8191 0.060 0.058 0.057 60739
|
||||
8192 0.036 0.034 0.034 101467
|
||||
16383 0.052 0.052 0.051 68594
|
||||
16384 0.031 0.031 0.031 112603
|
||||
32767 0.053 0.050 0.049 70850
|
||||
32768 0.032 0.029 0.029 119617
|
||||
65535 0.067 0.067 0.067 52015
|
||||
65536 0.058 0.058 0.058 60440
|
||||
131071 0.067 0.066 0.065 53518
|
||||
131072 0.059 0.058 0.058 60281
|
||||
262143 0.066 0.065 0.065 54005
|
||||
262144 0.058 0.058 0.058 60121
|
||||
524287 0.067 0.067 0.067 52349
|
||||
524288 0.061 0.061 0.064 54699
|
||||
1048575 0.068 0.067 0.067 51876
|
||||
1048576 0.061 0.061 0.061 56775
|
||||
2097151 0.068 0.068 0.068 51379
|
||||
2097152 0.062 0.062 0.062 56513
|
||||
4194303 0.069 0.068 0.069 50580
|
||||
4194304 0.063 0.064 0.063 55751
|
||||
8388607 0.120 0.118 0.120 28998
|
||||
8388608 0.137 0.123 0.117 29936
|
||||
|
||||
GCC (Inline REP STOSB) for #c per n where c ≈ 0.273ns
|
||||
N x1 x8 x64 mBps
|
||||
------------------------------------------------------------
|
||||
1 413.000 434.125 441.453 8
|
||||
1 431.000 436.125 438.953 8
|
||||
2 223.500 224.438 224.836 16
|
||||
3 149.000 150.042 623.786 6
|
||||
4 108.750 109.531 110.559 32
|
||||
7 62.714 63.196 63.266 55
|
||||
8 56.375 56.641 56.838 61
|
||||
15 30.467 30.708 30.761 113
|
||||
16 24.062 24.023 24.038 145
|
||||
31 14.548 14.859 14.876 235
|
||||
32 9.719 9.691 9.730 359
|
||||
63 7.286 7.312 7.339 476
|
||||
64 3.609 3.705 3.721 938
|
||||
127 1.976 2.058 2.067 1689
|
||||
128 0.414 0.405 0.409 8532
|
||||
255 0.890 0.907 0.911 3832
|
||||
256 0.215 0.217 0.218 16039
|
||||
511 0.476 0.481 0.480 7273
|
||||
512 0.119 0.119 0.119 29270
|
||||
1023 0.257 0.260 0.260 13409
|
||||
1024 0.073 0.073 0.074 47442
|
||||
2047 0.150 0.150 0.151 23189
|
||||
2048 0.049 0.050 0.050 69424
|
||||
4095 0.096 0.097 0.097 36142
|
||||
4096 0.040 0.040 0.040 87842
|
||||
8191 0.071 0.071 0.071 49061
|
||||
8192 0.034 0.033 0.034 104099
|
||||
16383 0.058 0.059 0.058 59697
|
||||
16384 0.030 0.031 0.030 114585
|
||||
32767 0.053 0.053 0.053 66161
|
||||
32768 0.029 0.029 0.029 120750
|
||||
65535 0.069 0.069 0.069 50520
|
||||
65536 0.058 0.058 0.058 60100
|
||||
131071 0.068 0.067 0.085 40964
|
||||
131072 0.076 0.072 0.063 55514
|
||||
262143 0.067 0.093 0.090 38681
|
||||
262144 0.073 0.062 0.077 45384
|
||||
524287 0.107 0.093 0.066 52689
|
||||
524288 0.061 0.060 0.062 56294
|
||||
1048575 0.066 0.066 0.066 52990
|
||||
1048576 0.061 0.061 0.061 57248
|
||||
2097151 0.067 0.075 0.067 51887
|
||||
2097152 0.061 0.061 0.061 56878
|
||||
4194303 0.068 0.100 0.069 50623
|
||||
4194304 0.061 0.061 0.061 57195
|
||||
8388607 0.117 0.121 0.119 29441
|
||||
8388608 0.118 0.119 0.162 21587
|
||||
|
||||
Musl memset() for #c per n where c ≈ 0.273ns
|
||||
N x1 x8 x64 mBps
|
||||
------------------------------------------------------------
|
||||
1 49.000 35.625 35.172 99
|
||||
1 33.000 34.625 35.109 99
|
||||
2 17.500 17.562 18.023 194
|
||||
3 20.333 14.042 12.411 281
|
||||
4 11.250 9.219 9.301 375
|
||||
7 11.857 6.018 5.417 644
|
||||
8 4.125 4.516 4.592 760
|
||||
15 4.200 2.692 2.480 1407
|
||||
16 2.312 2.273 2.310 1511
|
||||
31 2.097 1.786 1.342 2600
|
||||
32 1.219 1.238 1.242 2811
|
||||
63 0.841 0.815 0.686 5085
|
||||
64 0.641 0.666 0.665 5246
|
||||
127 1.000 0.718 0.690 5061
|
||||
128 0.477 0.435 0.413 8451
|
||||
255 0.459 0.418 0.403 8670
|
||||
256 0.285 0.233 0.232 15051
|
||||
511 0.256 0.230 0.228 15285
|
||||
512 0.158 0.129 0.128 27170
|
||||
1023 0.134 0.140 0.138 25296
|
||||
1024 0.089 0.077 0.078 44891
|
||||
2047 0.094 0.088 0.088 39837
|
||||
2048 0.060 0.052 0.053 66075
|
||||
4095 0.071 0.068 0.068 51359
|
||||
4096 0.045 0.043 0.042 83178
|
||||
8191 0.059 0.058 0.057 60868
|
||||
8192 0.037 0.035 0.034 102662
|
||||
16383 0.052 0.051 0.051 68658
|
||||
16384 0.032 0.031 0.031 113568
|
||||
32767 0.050 0.049 0.049 71296
|
||||
32768 0.030 0.029 0.029 120029
|
||||
65535 0.067 0.067 0.068 50983
|
||||
65536 0.059 0.059 0.058 59665
|
||||
131071 0.067 0.067 0.067 52014
|
||||
131072 0.059 0.060 0.059 59211
|
||||
262143 0.067 0.066 0.066 52877
|
||||
262144 0.059 0.060 0.085 40900
|
||||
524287 0.067 0.066 0.065 53688
|
||||
524288 0.059 0.059 0.059 59112
|
||||
1048575 0.066 0.066 0.066 53181
|
||||
1048576 0.060 0.060 0.060 58300
|
||||
2097151 0.066 0.066 0.067 52439
|
||||
2097152 0.060 0.068 0.060 57924
|
||||
4194303 0.069 0.067 0.080 43425
|
||||
4194304 0.062 0.080 0.062 56085
|
||||
8388607 0.126 0.118 0.133 26207
|
||||
8388608 0.127 0.119 0.118 29643
|
||||
|
||||
Newlib memset() for #c per n where c ≈ 0.273ns
|
||||
N x1 x8 x64 mBps
|
||||
------------------------------------------------------------
|
||||
1 443.000 440.875 440.078 8
|
||||
1 437.000 437.375 440.453 8
|
||||
2 226.500 226.438 227.461 15
|
||||
3 150.333 150.625 151.151 23
|
||||
4 113.250 113.281 113.770 31
|
||||
7 66.714 67.232 66.998 52
|
||||
8 58.375 58.828 58.811 59
|
||||
15 31.000 30.858 31.264 112
|
||||
16 31.438 28.523 28.317 123
|
||||
31 27.839 29.536 50.533 69
|
||||
32 11.281 10.918 11.068 315
|
||||
63 12.302 11.907 11.863 294
|
||||
64 4.703 4.396 4.404 793
|
||||
127 2.732 2.719 2.712 1287
|
||||
128 0.852 0.729 0.736 4742
|
||||
255 1.188 1.178 1.171 2981
|
||||
256 0.652 0.416 0.381 9171
|
||||
511 1.474 1.629 1.662 2099
|
||||
512 0.287 0.264 0.246 14204
|
||||
1023 0.873 0.934 0.947 3684
|
||||
1024 0.196 0.179 0.178 19604
|
||||
2047 0.544 0.545 0.626 5572
|
||||
2048 0.257 0.257 0.253 13779
|
||||
4095 0.426 0.427 0.430 8110
|
||||
4096 0.282 0.296 0.293 11917
|
||||
8191 0.374 0.370 0.371 9402
|
||||
8192 0.297 0.310 0.400 8717
|
||||
16383 0.346 0.345 0.433 8062
|
||||
16384 0.313 0.312 0.311 11223
|
||||
32767 0.334 0.332 0.332 10505
|
||||
32768 0.313 0.313 0.358 9759
|
||||
65535 0.335 0.327 0.330 10589
|
||||
65536 0.330 0.312 0.337 10347
|
||||
131071 0.350 0.339 0.355 9825
|
||||
131072 0.334 0.329 0.359 9728
|
||||
262143 0.346 0.352 0.357 9785
|
||||
262144 0.350 0.375 0.482 7243
|
||||
524287 0.348 0.346 0.360 9691
|
||||
524288 0.347 0.346 0.385 9063
|
||||
1048575 0.358 0.375 0.383 9114
|
||||
1048576 0.355 0.382 0.388 8987
|
||||
2097151 0.362 0.368 0.390 8956
|
||||
2097152 0.363 0.375 0.387 9016
|
||||
4194303 0.361 0.379 0.385 9073
|
||||
4194304 0.366 0.376 0.385 9074
|
||||
8388607 0.363 0.366 0.372 9391
|
||||
8388608 0.419 0.374 0.370 9428 */
|
|
@ -49,7 +49,8 @@ o/$(MODE)/libc/nexgen32e/tinystrncmp.ncabi.o: \
|
|||
|
||||
o/$(MODE)/libc/nexgen32e/errno.o: \
|
||||
OVERRIDE_CFLAGS += \
|
||||
$(NO_MAGIC)
|
||||
$(NO_MAGIC) \
|
||||
-fno-sanitize=all
|
||||
|
||||
LIBC_NEXGEN32E_LIBS = $(foreach x,$(LIBC_NEXGEN32E_ARTIFACTS),$($(x)))
|
||||
LIBC_NEXGEN32E_SRCS = $(foreach x,$(LIBC_NEXGEN32E_ARTIFACTS),$($(x)_SRCS))
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#ifndef COSMOPOLITAN_LIBC_NEXGEN32E_RDTSCP_H_
|
||||
#define COSMOPOLITAN_LIBC_NEXGEN32E_RDTSCP_H_
|
||||
#include "libc/bits/bits.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
COSMOPOLITAN_C_START_
|
||||
|
||||
|
|
13
libc/nexgen32e/sha.h
Normal file
13
libc/nexgen32e/sha.h
Normal file
|
@ -0,0 +1,13 @@
|
|||
/* Prototypes for the hand-written SHA block-transform routines. */
#ifndef COSMOPOLITAN_LIBC_NEXGEN32E_SHA_H_
|
||||
#define COSMOPOLITAN_LIBC_NEXGEN32E_SHA_H_
|
||||
/* Declarations are skipped when this header is preprocessed with
   __ASSEMBLER__ or __LINKER__ defined to a nonzero value. */
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
COSMOPOLITAN_C_START_
|
||||
|
||||
/* SHA-1 block transforms. Per the assembly sources the arguments are:
   digest/state words (at least 5), input data, and the number of
   64-byte blocks to process. */
void sha1_transform_avx2(uint32_t[hasatleast 5], const void *, unsigned);
|
||||
void sha1_transform_ni(uint32_t[hasatleast 5], const void *, unsigned);
|
||||
/* SHA-256 block transforms taking a state of at least 8 words.
   NOTE(review): presumably the same (state, data, block count) order as
   the SHA-1 routines above; confirm against their implementations. */
void sha256_transform_rorx(uint32_t[hasatleast 8], const void *, unsigned);
|
||||
void sha256_transform_ni(uint32_t[hasatleast 8], const void *, unsigned);
|
||||
|
||||
COSMOPOLITAN_C_END_
|
||||
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
|
||||
#endif /* COSMOPOLITAN_LIBC_NEXGEN32E_SHA_H_ */
|
|
@ -1,49 +1,36 @@
|
|||
/*
|
||||
* BSD LICENSE
|
||||
*
|
||||
* Copyright(c) 2014 Intel Corporation.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* - Neither the name of Intel Corporation nor the names of its
|
||||
* contributors may be used to endorse or promote products derived
|
||||
* from this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
/*
|
||||
* SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
|
||||
*
|
||||
* This implementation is based on the previous SSSE3 release:
|
||||
* Visit http://software.intel.com/en-us/articles/
|
||||
* and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
|
||||
*
|
||||
* Updates 20-byte SHA-1 record at start of 'state', from 'input', for
|
||||
* even number of 'blocks' consecutive 64-byte blocks.
|
||||
*
|
||||
* extern "C" void sha1_transform_avx2(
|
||||
* struct sha1_state *state, const uint8_t *input, int blocks );
|
||||
*/
|
||||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ │
|
||||
│ Copyright 2014 Intel Corporation │
|
||||
│ │
|
||||
│ Redistribution and use in source and binary forms, with or without │
|
||||
│ modification, are permitted provided that the following conditions │
|
||||
│ are met: │
|
||||
│ │
|
||||
│ * Redistributions of source code must retain the above copyright │
|
||||
│ notice, this list of conditions and the following disclaimer. │
|
||||
│ * Redistributions in binary form must reproduce the above copyright │
|
||||
│ notice, this list of conditions and the following disclaimer in │
|
||||
│ the documentation and/or other materials provided with the │
|
||||
│ distribution. │
|
||||
│ * Neither the name of Intel Corporation nor the names of its │
|
||||
│ contributors may be used to endorse or promote products derived │
|
||||
│ from this software without specific prior written permission. │
|
||||
│ │
|
||||
│ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS │
|
||||
│ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT │
|
||||
│ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR │
|
||||
│ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT │
|
||||
│ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, │
|
||||
│ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT │
|
||||
│ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, │
|
||||
│ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY │
|
||||
│ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT │
|
||||
│ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE │
|
||||
│ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
|
||||
.ident "\n\
|
||||
|
@ -71,7 +58,6 @@ Copyright 2014 Intel Corporation\n"
|
|||
#define REG_RTB %rbx
|
||||
#define REG_T1 %r11d
|
||||
#define xmm_mov vmovups
|
||||
#define avx2_zeroupper vzeroupper
|
||||
#define RND_F1 1
|
||||
#define RND_F2 2
|
||||
#define RND_F3 3
|
||||
|
@ -84,16 +70,13 @@ Copyright 2014 Intel Corporation\n"
|
|||
.set E, REG_E
|
||||
.set TB, REG_TB
|
||||
.set TA, REG_TA
|
||||
|
||||
.set RA, REG_RA
|
||||
.set RB, REG_RB
|
||||
.set RC, REG_RC
|
||||
.set RD, REG_RD
|
||||
.set RE, REG_RE
|
||||
|
||||
.set RTA, REG_RTA
|
||||
.set RTB, REG_RTB
|
||||
|
||||
.set T1, REG_T1
|
||||
.endm
|
||||
|
||||
|
@ -177,7 +160,6 @@ Copyright 2014 Intel Corporation\n"
|
|||
PRECALC_RESET_WY
|
||||
PRECALC_ROTATE_WY
|
||||
.endif
|
||||
|
||||
/* message scheduling pre-compute for rounds 0-15 */
|
||||
.if ((i & 7) == 0)
|
||||
/*
|
||||
|
@ -194,7 +176,6 @@ Copyright 2014 Intel Corporation\n"
|
|||
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
|
||||
.elseif ((i & 7) == 7)
|
||||
vmovdqu WY_TMP, PRECALC_WK(i&~7)
|
||||
|
||||
PRECALC_ROTATE_WY
|
||||
.endif
|
||||
.endm
|
||||
|
@ -236,7 +217,6 @@ Copyright 2014 Intel Corporation\n"
|
|||
vpxor WY_TMP2, WY_TMP, WY
|
||||
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
|
||||
vmovdqu WY_TMP, PRECALC_WK(i&~7)
|
||||
|
||||
PRECALC_ROTATE_WY
|
||||
.endif
|
||||
.endm
|
||||
|
@ -250,7 +230,6 @@ Copyright 2014 Intel Corporation\n"
|
|||
* allows more efficient vectorization
|
||||
* since w[i]=>w[i-3] dependency is broken
|
||||
*/
|
||||
|
||||
.if ((i & 7) == 0)
|
||||
/*
|
||||
* blended AVX2 and ALU instruction scheduling
|
||||
|
@ -272,14 +251,12 @@ Copyright 2014 Intel Corporation\n"
|
|||
.elseif ((i & 7) == 7)
|
||||
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
|
||||
vmovdqu WY_TMP, PRECALC_WK(i&~7)
|
||||
|
||||
PRECALC_ROTATE_WY
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro PRECALC r, s
|
||||
.set i, \r
|
||||
|
||||
.if (i < 40)
|
||||
.set K_XMM, 32*0
|
||||
.elseif (i < 80)
|
||||
|
@ -289,7 +266,6 @@ Copyright 2014 Intel Corporation\n"
|
|||
.else
|
||||
.set K_XMM, 32*3
|
||||
.endif
|
||||
|
||||
.if (i<32)
|
||||
PRECALC_00_15 \s
|
||||
.elseif (i<64)
|
||||
|
@ -307,7 +283,6 @@ Copyright 2014 Intel Corporation\n"
|
|||
.set B, TB
|
||||
.set TB, A
|
||||
.set A, T_REG
|
||||
|
||||
.set T_REG, RE
|
||||
.set RE, RD
|
||||
.set RD, RC
|
||||
|
@ -317,9 +292,8 @@ Copyright 2014 Intel Corporation\n"
|
|||
.set RA, T_REG
|
||||
.endm
|
||||
|
||||
/* Macro relies on saved ROUND_Fx */
|
||||
|
||||
.macro RND_FUN f, r
|
||||
// Macro relies on saved ROUND_Fx
|
||||
.macro RND_FUN f, r
|
||||
.if (\f == RND_F1)
|
||||
ROUND_F1 \r
|
||||
.elseif (\f == RND_F2)
|
||||
|
@ -332,11 +306,11 @@ Copyright 2014 Intel Corporation\n"
|
|||
.macro RR r
|
||||
.set round_id, (\r % 80)
|
||||
|
||||
.if (round_id == 0) /* Precalculate F for first round */
|
||||
.if (round_id == 0) # Precalculate F for first round
|
||||
.set ROUND_FUNC, RND_F1
|
||||
mov B, TB
|
||||
|
||||
rorx $(32-30), B, B /* b>>>2 */
|
||||
rorx $(32-30), B, B # b>>>2
|
||||
andn D, TB, T1
|
||||
and C, TB
|
||||
xor T1, TB
|
||||
|
@ -362,40 +336,38 @@ Copyright 2014 Intel Corporation\n"
|
|||
.macro ROUND_F1 r
|
||||
add WK(\r), E
|
||||
|
||||
andn C, A, T1 /* ~b&d */
|
||||
lea (RE,RTB), E /* Add F from the previous round */
|
||||
andn C, A, T1 # ~b&d
|
||||
lea (RE,RTB), E # Add F from the previous round
|
||||
|
||||
rorx $(32-5), A, TA /* T2 = A >>> 5 */
|
||||
rorx $(32-30),A, TB /* b>>>2 for next round */
|
||||
rorx $(32-5), A, TA # T2 = A >>> 5
|
||||
rorx $(32-30),A, TB # b>>>2 for next round
|
||||
|
||||
PRECALC (\r) /* msg scheduling for next 2 blocks */
|
||||
PRECALC (\r) # msg scheduling for next 2 blocks
|
||||
|
||||
/*
|
||||
* Calculate F for the next round
|
||||
* (b & c) ^ andn[b, d]
|
||||
*/
|
||||
and B, A /* b&c */
|
||||
xor T1, A /* F1 = (b&c) ^ (~b&d) */
|
||||
// Calculate F for the next round
|
||||
// (b & c) ^ andn[b, d]
|
||||
and B, A # b&c
|
||||
xor T1, A # F1 = (b&c) ^ (~b&d)
|
||||
|
||||
lea (RE,RTA), E /* E += A >>> 5 */
|
||||
lea (RE,RTA), E # E += A >>> 5
|
||||
.endm
|
||||
|
||||
.macro ROUND_F2 r
|
||||
add WK(\r), E
|
||||
lea (RE,RTB), E /* Add F from the previous round */
|
||||
lea (RE,RTB), E # Add F from the previous round
|
||||
|
||||
/* Calculate F for the next round */
|
||||
rorx $(32-5), A, TA /* T2 = A >>> 5 */
|
||||
rorx $(32-5), A, TA # T2 = A >>> 5
|
||||
.if ((round_id) < 79)
|
||||
rorx $(32-30), A, TB /* b>>>2 for next round */
|
||||
rorx $(32-30), A, TB # b>>>2 for next round
|
||||
.endif
|
||||
PRECALC (\r) /* msg scheduling for next 2 blocks */
|
||||
PRECALC (\r) # msg scheduling for next 2 blocks
|
||||
|
||||
.if ((round_id) < 79)
|
||||
xor B, A
|
||||
.endif
|
||||
|
||||
add TA, E /* E += A >>> 5 */
|
||||
add TA, E # E += A >>> 5
|
||||
|
||||
.if ((round_id) < 79)
|
||||
xor C, A
|
||||
|
@ -404,30 +376,28 @@ Copyright 2014 Intel Corporation\n"
|
|||
|
||||
.macro ROUND_F3 r
|
||||
add WK(\r), E
|
||||
PRECALC (\r) /* msg scheduling for next 2 blocks */
|
||||
PRECALC (\r) # msg scheduling for next 2 blocks
|
||||
|
||||
lea (RE,RTB), E /* Add F from the previous round */
|
||||
lea (RE,RTB), E # Add F from the previous round
|
||||
|
||||
mov B, T1
|
||||
or A, T1
|
||||
|
||||
rorx $(32-5), A, TA /* T2 = A >>> 5 */
|
||||
rorx $(32-30), A, TB /* b>>>2 for next round */
|
||||
rorx $(32-5), A, TA # T2 = A >>> 5
|
||||
rorx $(32-30), A, TB # b>>>2 for next round
|
||||
|
||||
/* Calculate F for the next round
|
||||
* (b and c) or (d and (b or c))
|
||||
*/
|
||||
// Calculate F for the next round
|
||||
// (b and c) or (d and (b or c))
|
||||
and C, T1
|
||||
and B, A
|
||||
or T1, A
|
||||
|
||||
add TA, E /* E += A >>> 5 */
|
||||
add TA, E # E += A >>> 5
|
||||
|
||||
.endm
|
||||
|
||||
/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
|
||||
* %1 + %2 >= %3 ? %4 : 0
|
||||
*/
|
||||
// Add constant only if (%2 > %3) condition met (uses RTA as temp)
|
||||
// %1 + %2 >= %3 ? %4 : 0
|
||||
.macro ADD_IF_GE a, b, c, d
|
||||
mov \a, RTA
|
||||
add $\d, RTA
|
||||
|
@ -435,9 +405,7 @@ Copyright 2014 Intel Corporation\n"
|
|||
cmovge RTA, \a
|
||||
.endm
|
||||
|
||||
/*
|
||||
* macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
|
||||
*/
|
||||
// Performs 80 rounds of SHA-1 for multiple blocks with s/w pipelining
|
||||
.macro SHA1_PIPELINED_MAIN_BODY
|
||||
|
||||
REGALLOC
|
||||
|
@ -451,7 +419,7 @@ Copyright 2014 Intel Corporation\n"
|
|||
mov %rsp, PRECALC_BUF
|
||||
lea (2*4*80+32)(%rsp), WK_BUF
|
||||
|
||||
# Precalc WK for first 2 blocks
|
||||
// Precalc WK for first 2 blocks
|
||||
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
|
||||
.set i, 0
|
||||
.rept 160
|
||||
|
@ -459,29 +427,27 @@ Copyright 2014 Intel Corporation\n"
|
|||
.set i, i + 1
|
||||
.endr
|
||||
|
||||
/* Go to next block if needed */
|
||||
// Go to next block if needed
|
||||
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
|
||||
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
|
||||
xchg WK_BUF, PRECALC_BUF
|
||||
|
||||
.align 32
|
||||
.L_loop:
|
||||
/*
|
||||
* code loops through more than one block
|
||||
* we use K_BASE value as a signal of a last block,
|
||||
* it is set below by: cmovae BUFFER_PTR, K_BASE
|
||||
*/
|
||||
|
||||
// code loops through more than one block
|
||||
// we use K_BASE value as a signal of a last block,
|
||||
// it is set below by: cmovae BUFFER_PTR, K_BASE
|
||||
test BLOCKS_CTR, BLOCKS_CTR
|
||||
jnz .L_begin
|
||||
.align 32
|
||||
jmp .L_end
|
||||
|
||||
.align 32
|
||||
.L_begin:
|
||||
|
||||
/*
|
||||
* Do first block
|
||||
* rounds: 0,2,4,6,8
|
||||
*/
|
||||
// process first block
|
||||
// rounds: 0,2,4,6,8
|
||||
.set j, 0
|
||||
.rept 5
|
||||
RR j
|
||||
|
@ -491,28 +457,26 @@ Copyright 2014 Intel Corporation\n"
|
|||
jmp .L_loop0
|
||||
.L_loop0:
|
||||
|
||||
/*
|
||||
* rounds:
|
||||
* 10,12,14,16,18
|
||||
* 20,22,24,26,28
|
||||
* 30,32,34,36,38
|
||||
* 40,42,44,46,48
|
||||
* 50,52,54,56,58
|
||||
*/
|
||||
// rounds
|
||||
// 10,12,14,16,18
|
||||
// 20,22,24,26,28
|
||||
// 30,32,34,36,38
|
||||
// 40,42,44,46,48
|
||||
// 50,52,54,56,58
|
||||
.rept 25
|
||||
RR j
|
||||
.set j, j+2
|
||||
.endr
|
||||
|
||||
/* Update Counter */
|
||||
// Update Counter
|
||||
sub $1, BLOCKS_CTR
|
||||
/* Move to the next block only if needed*/
|
||||
|
||||
// Move to the next block only if needed
|
||||
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
|
||||
/*
|
||||
* rounds
|
||||
* 60,62,64,66,68
|
||||
* 70,72,74,76,78
|
||||
*/
|
||||
|
||||
// rounds
|
||||
// 60,62,64,66,68
|
||||
// 70,72,74,76,78
|
||||
.rept 10
|
||||
RR j
|
||||
.set j, j+2
|
||||
|
@ -529,12 +493,9 @@ Copyright 2014 Intel Corporation\n"
|
|||
|
||||
mov TB, B
|
||||
|
||||
/* Process second block */
|
||||
/*
|
||||
* rounds
|
||||
* 0+80, 2+80, 4+80, 6+80, 8+80
|
||||
* 10+80,12+80,14+80,16+80,18+80
|
||||
*/
|
||||
// process second block
|
||||
// 0+80, 2+80, 4+80, 6+80, 8+80
|
||||
// 10+80,12+80,14+80,16+80,18+80
|
||||
|
||||
.set j, 0
|
||||
.rept 10
|
||||
|
@ -544,11 +505,10 @@ Copyright 2014 Intel Corporation\n"
|
|||
|
||||
jmp .L_loop1
|
||||
.L_loop1:
|
||||
/*
|
||||
* rounds
|
||||
* 20+80,22+80,24+80,26+80,28+80
|
||||
* 30+80,32+80,34+80,36+80,38+80
|
||||
*/
|
||||
|
||||
// rounds
|
||||
// 20+80,22+80,24+80,26+80,28+80
|
||||
// 30+80,32+80,34+80,36+80,38+80
|
||||
.rept 10
|
||||
RR j+80
|
||||
.set j, j+2
|
||||
|
@ -557,29 +517,26 @@ Copyright 2014 Intel Corporation\n"
|
|||
jmp .L_loop2
|
||||
.L_loop2:
|
||||
|
||||
/*
|
||||
* rounds
|
||||
* 40+80,42+80,44+80,46+80,48+80
|
||||
* 50+80,52+80,54+80,56+80,58+80
|
||||
*/
|
||||
// rounds
|
||||
// 40+80,42+80,44+80,46+80,48+80
|
||||
// 50+80,52+80,54+80,56+80,58+80
|
||||
.rept 10
|
||||
RR j+80
|
||||
.set j, j+2
|
||||
.endr
|
||||
|
||||
/* update counter */
|
||||
// update counter
|
||||
sub $1, BLOCKS_CTR
|
||||
/* Move to the next block only if needed*/
|
||||
|
||||
// Move to the next block only if needed
|
||||
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
|
||||
|
||||
jmp .L_loop3
|
||||
.L_loop3:
|
||||
|
||||
/*
|
||||
* rounds
|
||||
* 60+80,62+80,64+80,66+80,68+80
|
||||
* 70+80,72+80,74+80,76+80,78+80
|
||||
*/
|
||||
// rounds
|
||||
// 60+80,62+80,64+80,66+80,68+80
|
||||
// 70+80,72+80,74+80,76+80,78+80
|
||||
.rept 10
|
||||
RR j+80
|
||||
.set j, j+2
|
||||
|
@ -619,14 +576,14 @@ Copyright 2014 Intel Corporation\n"
|
|||
|
||||
.align 128
|
||||
K_XMM_AR:
|
||||
.long K1, K1, K1, K1
|
||||
.long K1, K1, K1, K1
|
||||
.long K2, K2, K2, K2
|
||||
.long K2, K2, K2, K2
|
||||
.long K3, K3, K3, K3
|
||||
.long K3, K3, K3, K3
|
||||
.long K4, K4, K4, K4
|
||||
.long K4, K4, K4, K4
|
||||
.long K1,K1,K1,K1
|
||||
.long K1,K1,K1,K1
|
||||
.long K2,K2,K2,K2
|
||||
.long K2,K2,K2,K2
|
||||
.long K3,K3,K3,K3
|
||||
.long K3,K3,K3,K3
|
||||
.long K4,K4,K4,K4
|
||||
.long K4,K4,K4,K4
|
||||
|
||||
BSWAP_SHUFB_CTL:
|
||||
.long 0x00010203
|
||||
|
@ -639,6 +596,23 @@ BSWAP_SHUFB_CTL:
|
|||
.long 0x0c0d0e0f
|
||||
.text
|
||||
|
||||
// Performs Intel® AVX2™ optimized SHA-1 update.
|
||||
//
|
||||
// This implementation is based on the previous SSSE3 release:
|
||||
// Visit http://software.intel.com/en-us/articles/ and refer
|
||||
// to improving-the-performance-of-the-secure-hash-algorithm-1/
|
||||
//
|
||||
// Updates 20-byte SHA-1 record at start of 'state', from 'input',
|
||||
// for even number of 'blocks' consecutive 64-byte blocks.
|
||||
//
|
||||
// void sha1_transform_avx2(struct sha1_state *state,
|
||||
// const uint8_t *input,
|
||||
// int blocks);
|
||||
//
|
||||
// @param %rdi points to output digest
|
||||
// @param %rsi points to input data
|
||||
// @param %rdx is number of 64-byte blocks to process
|
||||
// @see X86_HAVE(AVX2) (this routine also uses BMI `andn` and BMI2 `rorx`)
|
||||
sha1_transform_avx2:
|
||||
push %rbp
|
||||
mov %rsp,%rbp
|
||||
|
@ -648,33 +622,23 @@ sha1_transform_avx2:
|
|||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
|
||||
RESERVE_STACK = (W_SIZE*4 + 8+24)
|
||||
|
||||
/* Align stack */
|
||||
mov %rsp, %rbx
|
||||
and $~(0x20-1), %rsp
|
||||
mov %rsp,%rbx
|
||||
and $~(0x20-1),%rsp
|
||||
push %rbx
|
||||
sub $RESERVE_STACK, %rsp
|
||||
|
||||
avx2_zeroupper
|
||||
|
||||
sub $RESERVE_STACK,%rsp
|
||||
vzeroupper
|
||||
/* Setup initial values */
|
||||
mov CTX, HASH_PTR
|
||||
mov BUF, BUFFER_PTR
|
||||
|
||||
mov BUF, BUFFER_PTR2
|
||||
mov CNT, BLOCKS_CTR
|
||||
|
||||
xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
|
||||
|
||||
mov CTX,HASH_PTR
|
||||
mov BUF,BUFFER_PTR
|
||||
mov BUF,BUFFER_PTR2
|
||||
mov CNT,BLOCKS_CTR
|
||||
xmm_mov BSWAP_SHUFB_CTL(%rip),YMM_SHUFB_BSWAP
|
||||
SHA1_PIPELINED_MAIN_BODY
|
||||
|
||||
avx2_zeroupper
|
||||
|
||||
add $RESERVE_STACK, %rsp
|
||||
vzeroupper
|
||||
add $RESERVE_STACK,%rsp
|
||||
pop %rsp
|
||||
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
|
|
286
libc/nexgen32e/sha1ni.S
Normal file
286
libc/nexgen32e/sha1ni.S
Normal file
|
@ -0,0 +1,286 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ │
|
||||
│ Copyright 2015 Intel Corporation │
|
||||
│ │
|
||||
│ Redistribution and use in source and binary forms, with or without │
|
||||
│ modification, are permitted provided that the following conditions │
|
||||
│ are met: │
|
||||
│ │
|
||||
│ * Redistributions of source code must retain the above copyright │
|
||||
│ notice, this list of conditions and the following disclaimer. │
|
||||
│ * Redistributions in binary form must reproduce the above copyright │
|
||||
│ notice, this list of conditions and the following disclaimer in │
|
||||
│ the documentation and/or other materials provided with the │
|
||||
│ distribution. │
|
||||
│ * Neither the name of Intel Corporation nor the names of its │
|
||||
│ contributors may be used to endorse or promote products derived │
|
||||
│ from this software without specific prior written permission. │
|
||||
│ │
|
||||
│ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS │
|
||||
│ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT │
|
||||
│ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR │
|
||||
│ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT │
|
||||
│ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, │
|
||||
│ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT │
|
||||
│ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, │
|
||||
│ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY │
|
||||
│ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT │
|
||||
│ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE │
|
||||
│ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
|
||||
.text
|
||||
.align 32
|
||||
.ident "\n\
|
||||
Intel SHA-NI (BSD-3 License)\n\
|
||||
Copyright 2015 Intel Corporation\n\
|
||||
Sean Gulley <sean.m.gulley@intel.com>\n\
|
||||
Tim Chen <tim.c.chen@linux.intel.com>\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
#define FRAME_SIZE 32
|
||||
#define DIGEST_PTR %rdi
|
||||
#define DATA_PTR %rsi
|
||||
#define NUM_BLKS %rdx
|
||||
#define ABCD %xmm0
|
||||
#define E0 %xmm1 /* Need two E's b/c they ping pong */
|
||||
#define E1 %xmm2
|
||||
#define MSG0 %xmm3
|
||||
#define MSG1 %xmm4
|
||||
#define MSG2 %xmm5
|
||||
#define MSG3 %xmm6
|
||||
#define SHUF_MASK %xmm7
|
||||
|
||||
// Performs Intel® SHA-NI™ optimized SHA-1 update.
|
||||
//
|
||||
// The function takes a pointer to the current hash values, a
|
||||
// pointer to the input data, and a number of 64 byte blocks to
|
||||
// process. Once all blocks have been processed, the digest buffer
|
||||
// is updated with the resulting hash value. The function only
|
||||
// processes complete blocks; there is no functionality to store
|
||||
// partial blocks. All message padding and hash value
|
||||
// initialization must be done outside the update function.
|
||||
//
|
||||
// The indented lines in the loop are instructions related to
|
||||
// rounds processing. The non-indented lines are instructions
|
||||
// related to the message schedule.
|
||||
//
|
||||
// void sha1_transform_ni(uint32_t digest[static 5],
|
||||
// const void *data,
|
||||
// uint32_t numBlocks);
|
||||
//
|
||||
// @param %rdi points to digest state (read on entry, updated with result)
|
||||
// @param %rsi points to input data
|
||||
// @param %rdx is number of 64-byte blocks to process
|
||||
// @see X86_HAVE(SHA)
|
||||
sha1_transform_ni:
|
||||
// NOTE(review): the documented prototype passes numBlocks as uint32_t,
// but the shift below operates on all 64 bits of %rdx; this assumes the
// caller leaves the upper half of %rdx zero. TODO confirm against callers.
push %rbp
|
||||
mov %rsp,%rbp
|
||||
.profilable
|
||||
sub $FRAME_SIZE,%rsp # two 16-byte slots used to save E0 and ABCD per block
|
||||
shl $6,NUM_BLKS # convert to bytes
|
||||
jz .Ldone_hash # zero blocks: digest is left untouched
|
||||
add DATA_PTR,NUM_BLKS # pointer to end of data
|
||||
|
||||
// load initial hash values
|
||||
movdqa UPPER_WORD_MASK(%rip),E1 # mask with only the top dword set
|
||||
pinsrd $3,1*16(DIGEST_PTR),E0 # fifth digest word into top dword of E0
|
||||
movdqu 0*16(DIGEST_PTR),ABCD # first four digest words
|
||||
pand E1,E0 # clear the three lower dwords, which pinsrd left stale
|
||||
pshufd $0x1B,ABCD,ABCD # reverse dword order
|
||||
|
||||
movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip),SHUF_MASK
|
||||
|
||||
.Lloop0:
|
||||
// Save hash values for addition after rounds
|
||||
// (movdqa needs %rsp 16-byte aligned here; presumably guaranteed by an
// ABI-aligned stack at entry plus the push and 32-byte frame - confirm)
movdqa E0,(0*16)(%rsp)
|
||||
movdqa ABCD,(1*16)(%rsp)
|
||||
|
||||
// Rounds 0-3
|
||||
movdqu 0*16(DATA_PTR),MSG0
|
||||
pshufb SHUF_MASK,MSG0 # byte-reverse the 16 message bytes just loaded
|
||||
paddd MSG0,E0
|
||||
movdqa ABCD,E1 # keep pre-round state for the next sha1nexte
|
||||
sha1rnds4 $0,E0,ABCD # immediate is 0 for rounds 0-19, then 1, 2, 3 per 20 rounds
|
||||
|
||||
// Rounds 4-7
|
||||
movdqu 1*16(DATA_PTR),MSG1
|
||||
pshufb SHUF_MASK,MSG1
|
||||
sha1nexte MSG1,E1
|
||||
movdqa ABCD,E0
|
||||
sha1rnds4 $0,E1,ABCD
|
||||
sha1msg1 MSG1,MSG0
|
||||
|
||||
// Rounds 8-11
|
||||
movdqu 2*16(DATA_PTR),MSG2
|
||||
pshufb SHUF_MASK,MSG2
|
||||
sha1nexte MSG2,E0
|
||||
movdqa ABCD,E1
|
||||
sha1rnds4 $0,E0,ABCD
|
||||
sha1msg1 MSG2,MSG1
|
||||
pxor MSG2,MSG0
|
||||
|
||||
// Rounds 12-15
|
||||
movdqu 3*16(DATA_PTR),MSG3
|
||||
pshufb SHUF_MASK,MSG3
|
||||
sha1nexte MSG3,E1
|
||||
movdqa ABCD,E0
|
||||
sha1msg2 MSG3,MSG0
|
||||
sha1rnds4 $0,E1,ABCD
|
||||
sha1msg1 MSG3,MSG2
|
||||
pxor MSG3,MSG1
|
||||
|
||||
// Rounds 16-19
|
||||
sha1nexte MSG0,E0
|
||||
movdqa ABCD,E1
|
||||
sha1msg2 MSG0,MSG1
|
||||
sha1rnds4 $0,E0,ABCD
|
||||
sha1msg1 MSG0,MSG3
|
||||
pxor MSG0,MSG2
|
||||
|
||||
// Rounds 20-23
|
||||
sha1nexte MSG1,E1
|
||||
movdqa ABCD,E0
|
||||
sha1msg2 MSG1,MSG2
|
||||
sha1rnds4 $1,E1,ABCD
|
||||
sha1msg1 MSG1,MSG0
|
||||
pxor MSG1,MSG3
|
||||
|
||||
// Rounds 24-27
|
||||
sha1nexte MSG2,E0
|
||||
movdqa ABCD,E1
|
||||
sha1msg2 MSG2,MSG3
|
||||
sha1rnds4 $1,E0,ABCD
|
||||
sha1msg1 MSG2,MSG1
|
||||
pxor MSG2,MSG0
|
||||
|
||||
// Rounds 28-31
|
||||
sha1nexte MSG3,E1
|
||||
movdqa ABCD,E0
|
||||
sha1msg2 MSG3,MSG0
|
||||
sha1rnds4 $1,E1,ABCD
|
||||
sha1msg1 MSG3,MSG2
|
||||
pxor MSG3,MSG1
|
||||
|
||||
// Rounds 32-35
|
||||
sha1nexte MSG0,E0
|
||||
movdqa ABCD,E1
|
||||
sha1msg2 MSG0,MSG1
|
||||
sha1rnds4 $1,E0,ABCD
|
||||
sha1msg1 MSG0,MSG3
|
||||
pxor MSG0,MSG2
|
||||
|
||||
// Rounds 36-39
|
||||
sha1nexte MSG1,E1
|
||||
movdqa ABCD,E0
|
||||
sha1msg2 MSG1,MSG2
|
||||
sha1rnds4 $1,E1,ABCD
|
||||
sha1msg1 MSG1,MSG0
|
||||
pxor MSG1,MSG3
|
||||
|
||||
// Rounds 40-43
|
||||
sha1nexte MSG2,E0
|
||||
movdqa ABCD,E1
|
||||
sha1msg2 MSG2,MSG3
|
||||
sha1rnds4 $2,E0,ABCD
|
||||
sha1msg1 MSG2,MSG1
|
||||
pxor MSG2,MSG0
|
||||
|
||||
// Rounds 44-47
|
||||
sha1nexte MSG3,E1
|
||||
movdqa ABCD,E0
|
||||
sha1msg2 MSG3,MSG0
|
||||
sha1rnds4 $2,E1,ABCD
|
||||
sha1msg1 MSG3,MSG2
|
||||
pxor MSG3,MSG1
|
||||
|
||||
// Rounds 48-51
|
||||
sha1nexte MSG0,E0
|
||||
movdqa ABCD,E1
|
||||
sha1msg2 MSG0,MSG1
|
||||
sha1rnds4 $2,E0,ABCD
|
||||
sha1msg1 MSG0,MSG3
|
||||
pxor MSG0,MSG2
|
||||
|
||||
// Rounds 52-55
|
||||
sha1nexte MSG1,E1
|
||||
movdqa ABCD,E0
|
||||
sha1msg2 MSG1,MSG2
|
||||
sha1rnds4 $2,E1,ABCD
|
||||
sha1msg1 MSG1,MSG0
|
||||
pxor MSG1,MSG3
|
||||
|
||||
// Rounds 56-59
|
||||
sha1nexte MSG2,E0
|
||||
movdqa ABCD,E1
|
||||
sha1msg2 MSG2,MSG3
|
||||
sha1rnds4 $2,E0,ABCD
|
||||
sha1msg1 MSG2,MSG1
|
||||
pxor MSG2,MSG0
|
||||
|
||||
// Rounds 60-63
|
||||
sha1nexte MSG3,E1
|
||||
movdqa ABCD,E0
|
||||
sha1msg2 MSG3,MSG0
|
||||
sha1rnds4 $3,E1,ABCD
|
||||
sha1msg1 MSG3,MSG2
|
||||
pxor MSG3,MSG1
|
||||
|
||||
// Rounds 64-67
|
||||
sha1nexte MSG0,E0
|
||||
movdqa ABCD,E1
|
||||
sha1msg2 MSG0,MSG1
|
||||
sha1rnds4 $3,E0,ABCD
|
||||
sha1msg1 MSG0,MSG3
|
||||
pxor MSG0,MSG2
|
||||
|
||||
// Rounds 68-71
|
||||
sha1nexte MSG1,E1
|
||||
movdqa ABCD,E0
|
||||
sha1msg2 MSG1,MSG2
|
||||
sha1rnds4 $3,E1,ABCD
|
||||
pxor MSG1,MSG3
|
||||
|
||||
// Rounds 72-75
|
||||
sha1nexte MSG2,E0
|
||||
movdqa ABCD,E1
|
||||
sha1msg2 MSG2,MSG3
|
||||
sha1rnds4 $3,E0,ABCD
|
||||
|
||||
// Rounds 76-79
|
||||
sha1nexte MSG3,E1
|
||||
movdqa ABCD,E0
|
||||
sha1rnds4 $3,E1,ABCD
|
||||
|
||||
// Add current hash values with previously saved
|
||||
sha1nexte (0*16)(%rsp),E0
|
||||
paddd (1*16)(%rsp),ABCD
|
||||
|
||||
// Increment data pointer and loop if more to process
|
||||
add $64,DATA_PTR
|
||||
cmp NUM_BLKS,DATA_PTR # NUM_BLKS now holds the end-of-data address
|
||||
jne .Lloop0
|
||||
|
||||
// Write hash values back in the correct order
|
||||
pshufd $0x1B,ABCD,ABCD # undo the dword reversal applied at load
|
||||
movdqu ABCD,0*16(DIGEST_PTR)
|
||||
pextrd $3,E0,1*16(DIGEST_PTR) # top dword of E0 becomes the fifth digest word
|
||||
|
||||
.Ldone_hash:
|
||||
leave
|
||||
ret
|
||||
.endfn sha1_transform_ni,globl
|
||||
|
||||
// pshufb control used on each 16-byte message chunk: as a little-endian
// 128-bit value, control byte i equals 15-i, so the shuffle reverses
// all sixteen bytes.
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
|
||||
.align 16
|
||||
PSHUFFLE_BYTE_FLIP_MASK:
|
||||
.octa 0x000102030405060708090a0b0c0d0e0f
|
||||
|
||||
// Mask with only bits 96-127 (the top 32-bit lane) set; ANDed with E0
// after the fifth digest word is inserted into that lane.
.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16
|
||||
.align 16
|
||||
UPPER_WORD_MASK:
|
||||
.octa 0xFFFFFFFF000000000000000000000000
|
|
@ -50,7 +50,7 @@
|
|||
#include "libc/macros.internal.h"
|
||||
|
||||
.ident "\n\
|
||||
AVX2 SHA-256 (BSD-2 License)\n\
|
||||
AVX2 SHA2 (BSD-2 License)\n\
|
||||
Copyright 2013 Intel Corporation\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
|
@ -598,19 +598,19 @@ sha256_transform_rorx:
|
|||
|
||||
.align 16
|
||||
.Loop1:
|
||||
vpaddd K256+0*32(SRND), X0, XFER
|
||||
vpaddd kSha256x2+0*32(SRND), X0, XFER
|
||||
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
|
||||
FOUR_ROUNDS_AND_SCHED _XFER + 0*32
|
||||
|
||||
vpaddd K256+1*32(SRND), X0, XFER
|
||||
vpaddd kSha256x2+1*32(SRND), X0, XFER
|
||||
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
|
||||
FOUR_ROUNDS_AND_SCHED _XFER + 1*32
|
||||
|
||||
vpaddd K256+2*32(SRND), X0, XFER
|
||||
vpaddd kSha256x2+2*32(SRND), X0, XFER
|
||||
vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
|
||||
FOUR_ROUNDS_AND_SCHED _XFER + 2*32
|
||||
|
||||
vpaddd K256+3*32(SRND), X0, XFER
|
||||
vpaddd kSha256x2+3*32(SRND), X0, XFER
|
||||
vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
|
||||
FOUR_ROUNDS_AND_SCHED _XFER + 3*32
|
||||
|
||||
|
@ -620,11 +620,11 @@ sha256_transform_rorx:
|
|||
|
||||
.Loop2:
|
||||
## Do last 16 rounds with no scheduling
|
||||
vpaddd K256+0*32(SRND), X0, XFER
|
||||
vpaddd kSha256x2+0*32(SRND), X0, XFER
|
||||
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
|
||||
DO_4ROUNDS _XFER + 0*32
|
||||
|
||||
vpaddd K256+1*32(SRND), X1, XFER
|
||||
vpaddd kSha256x2+1*32(SRND), X1, XFER
|
||||
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
|
||||
DO_4ROUNDS _XFER + 1*32
|
||||
add $2*32, SRND
|
||||
|
@ -712,7 +712,6 @@ sha256_transform_rorx:
|
|||
.Ldone_hash:
|
||||
|
||||
mov _RSP(%rsp), %rsp
|
||||
|
||||
popq %r15
|
||||
popq %r14
|
||||
popq %r13
|
||||
|
@ -722,52 +721,38 @@ sha256_transform_rorx:
|
|||
ret
|
||||
.endfn sha256_transform_rorx,globl
|
||||
|
||||
.section .rodata.cst512.K256, "aM", @progbits, 512
|
||||
.align 64
|
||||
K256:
|
||||
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
|
||||
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
|
||||
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
|
||||
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
|
||||
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
|
||||
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
|
||||
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
|
||||
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
|
||||
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
|
||||
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
|
||||
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
|
||||
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
|
||||
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
|
||||
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
|
||||
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
|
||||
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
|
||||
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
|
||||
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
|
||||
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
|
||||
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
|
||||
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
|
||||
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
|
||||
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
|
||||
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
|
||||
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
|
||||
.rodata.cst32
|
||||
PSHUFFLE_BYTE_FLIP_MASK:
|
||||
.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
|
||||
.octa 0x0c0d0e0f08090a0b0405060700010203
|
||||
.octa 0x0c0d0e0f08090a0b0405060700010203
|
||||
|
||||
# shuffle xBxA -> 00BA
|
||||
.rodata.cst32
|
||||
_SHUF_00BA:
|
||||
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
|
||||
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
|
||||
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
|
||||
|
||||
# shuffle xDxC -> DC00
|
||||
.rodata.cst32
|
||||
_SHUF_DC00:
|
||||
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
|
||||
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
|
||||
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
|
||||
|
||||
// 512-byte, 64-byte-aligned table filled in at startup by the
// _init_kSha256x2 code below. Each 32-byte row holds one 16-byte group
// of kSha256 stored twice back to back.
.bss
|
||||
.align 64
|
||||
kSha256x2:
|
||||
.zero 512
|
||||
.endobj kSha256x2,globl
|
||||
.previous
|
||||
|
||||
// Startup initializer: walks kSha256 from its last 16-byte group to its
// first, writing every group into both halves of the matching 32-byte
// row of kSha256x2. Uses %rax, %rcx, %rdx and %xmm0.
// NOTE(review): kSha256 is defined outside this view; the loop assumes
// it holds 64 dwords and is 16-byte aligned (movaps) - confirm.
.init.start 201,_init_kSha256x2
|
||||
push $64
|
||||
pop %rcx # rcx = 64, dword index just past the end of the source
|
||||
ezlea kSha256,dx
|
||||
ezlea kSha256x2,ax
|
||||
0: movaps -16(%rdx,%rcx,4),%xmm0 # load source group ending at dword rcx
|
||||
movaps %xmm0,-16(%rax,%rcx,8) # upper half of the destination row
|
||||
movaps %xmm0,-32(%rax,%rcx,8) # lower half of the same row
|
||||
sub $4,%ecx # step back one group of four dwords
|
||||
jnz 0b # done once the index reaches zero
|
||||
.init.end 201,_init_kSha256x2
|
||||
|
|
318
libc/nexgen32e/sha256ni.S
Normal file
318
libc/nexgen32e/sha256ni.S
Normal file
|
@ -0,0 +1,318 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ │
|
||||
│ Copyright 2015 Intel Corporation │
|
||||
│ │
|
||||
│ Redistribution and use in source and binary forms, with or without │
|
||||
│ modification, are permitted provided that the following conditions │
|
||||
│ are met: │
|
||||
│ │
|
||||
│ * Redistributions of source code must retain the above copyright │
|
||||
│ notice, this list of conditions and the following disclaimer. │
|
||||
│ * Redistributions in binary form must reproduce the above copyright │
|
||||
│ notice, this list of conditions and the following disclaimer in │
|
||||
│ the documentation and/or other materials provided with the │
|
||||
│ distribution. │
|
||||
│ * Neither the name of Intel Corporation nor the names of its │
|
||||
│ contributors may be used to endorse or promote products derived │
|
||||
│ from this software without specific prior written permission. │
|
||||
│ │
|
||||
│ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS │
|
||||
│ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT │
|
||||
│ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR │
|
||||
│ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT │
|
||||
│ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, │
|
||||
│ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT │
|
||||
│ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, │
|
||||
│ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY │
|
||||
│ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT │
|
||||
│ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE │
|
||||
│ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
|
||||
.text
|
||||
.align 32
|
||||
.ident "\n\
|
||||
Intel SHA-NI (BSD-3 License)\n\
|
||||
Copyright 2015 Intel Corporation\n\
|
||||
Sean Gulley <sean.m.gulley@intel.com>\n\
|
||||
Tim Chen <tim.c.chen@linux.intel.com>\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
#define DIGEST_PTR %rdi /* 1st arg */
|
||||
#define DATA_PTR %rsi /* 2nd arg */
|
||||
#define NUM_BLKS %rdx /* 3rd arg */
|
||||
#define SHA256CONSTANTS %rax
|
||||
#define MSG %xmm0
|
||||
#define STATE0 %xmm1
|
||||
#define STATE1 %xmm2
|
||||
#define MSGTMP0 %xmm3
|
||||
#define MSGTMP1 %xmm4
|
||||
#define MSGTMP2 %xmm5
|
||||
#define MSGTMP3 %xmm6
|
||||
#define MSGTMP4 %xmm7
|
||||
#define SHUF_MASK %xmm8
|
||||
#define ABEF_SAVE %xmm9
|
||||
#define CDGH_SAVE %xmm10
|
||||
|
||||
// Performs Intel® SHA-NI™ optimized SHA-256 update.
|
||||
//
|
||||
// The function takes a pointer to the current hash values, a
|
||||
// pointer to the input data, and a number of 64 byte blocks to
|
||||
// process. Once all blocks have been processed, the digest buffer
|
||||
// is updated with the resulting hash value. The function only
|
||||
// processes complete blocks; there is no functionality to store
|
||||
// partial blocks. All message padding and hash value
|
||||
// initialization must be done outside the update function.
|
||||
//
|
||||
// The indented lines in the loop are instructions related to
|
||||
// rounds processing. The non-indented lines are instructions
|
||||
// related to the message schedule.
|
||||
//
|
||||
// void sha256_transform_ni(uint32_t digest[static 8],
|
||||
// const void *data,
|
||||
// int32_t numBlocks);
|
||||
//
|
||||
// @param %rdi points to digest state (read on entry, updated with result)
|
||||
// @param %rsi points to input data
|
||||
// @param %rdx is number of 64-byte blocks to process
|
||||
// @see X86_HAVE(SHA)
|
||||
sha256_transform_ni:
|
||||
// NOTE(review): the documented prototype passes numBlocks as int32_t,
// but the shift below operates on all 64 bits of %rdx; this assumes the
// caller leaves the upper half of %rdx zero. TODO confirm against callers.
// No stack slots are used: the per-block saved state lives in
// ABEF_SAVE and CDGH_SAVE (xmm9 and xmm10).
.leafprologue
|
||||
.profilable
|
||||
shl $6,NUM_BLKS # convert to bytes
|
||||
jz .Ldone_hash # zero blocks: digest is left untouched
|
||||
add DATA_PTR,NUM_BLKS # pointer to end of data
|
||||
|
||||
// Load initial hash values
|
||||
// Need to reorder these appropriately
|
||||
// DCBA, HGFE -> ABEF, CDGH
|
||||
movdqu 0*16(DIGEST_PTR),STATE0
|
||||
movdqu 1*16(DIGEST_PTR),STATE1
|
||||
|
||||
pshufd $0xB1,STATE0,STATE0 # CDAB
|
||||
pshufd $0x1B,STATE1,STATE1 # EFGH
|
||||
movdqa STATE0,MSGTMP4
|
||||
palignr $8,STATE1,STATE0 # ABEF
|
||||
pblendw $0xF0,MSGTMP4,STATE1 # CDGH
|
||||
|
||||
movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip),SHUF_MASK
|
||||
lea kSha256(%rip),SHA256CONSTANTS # base of the constant table added to each message group
|
||||
|
||||
.Lloop0:
|
||||
|
||||
// Save hash values for addition after rounds
|
||||
movdqa STATE0,ABEF_SAVE
|
||||
movdqa STATE1,CDGH_SAVE
|
||||
|
||||
// Rounds 0-3
|
||||
movdqu 0*16(DATA_PTR),MSG
|
||||
pshufb SHUF_MASK,MSG # byte-swap each 32-bit word of the message chunk
|
||||
movdqa MSG,MSGTMP0 # keep the raw words for the message schedule
|
||||
paddd 0*16(SHA256CONSTANTS),MSG
|
||||
sha256rnds2 STATE0,STATE1 # two rounds; takes its word+constant input implicitly from xmm0 (MSG)
|
||||
pshufd $0x0E,MSG,MSG # bring the upper two dwords down for the next two rounds
|
||||
sha256rnds2 STATE1,STATE0
|
||||
|
||||
// Rounds 4-7
|
||||
movdqu 1*16(DATA_PTR),MSG
|
||||
pshufb SHUF_MASK,MSG
|
||||
movdqa MSG,MSGTMP1
|
||||
paddd 1*16(SHA256CONSTANTS),MSG
|
||||
sha256rnds2 STATE0,STATE1
|
||||
pshufd $0x0E,MSG,MSG
|
||||
sha256rnds2 STATE1,STATE0
|
||||
sha256msg1 MSGTMP1,MSGTMP0
|
||||
|
||||
// Rounds 8-11
|
||||
movdqu 2*16(DATA_PTR),MSG
|
||||
pshufb SHUF_MASK,MSG
|
||||
movdqa MSG,MSGTMP2
|
||||
paddd 2*16(SHA256CONSTANTS),MSG
|
||||
sha256rnds2 STATE0,STATE1
|
||||
pshufd $0x0E,MSG,MSG
|
||||
sha256rnds2 STATE1,STATE0
|
||||
sha256msg1 MSGTMP2,MSGTMP1
|
||||
|
||||
// Rounds 12-15
|
||||
movdqu 3*16(DATA_PTR),MSG
|
||||
pshufb SHUF_MASK,MSG
|
||||
movdqa MSG,MSGTMP3
|
||||
paddd 3*16(SHA256CONSTANTS),MSG
|
||||
sha256rnds2 STATE0,STATE1
|
||||
movdqa MSGTMP3,MSGTMP4
|
||||
palignr $4,MSGTMP2,MSGTMP4
|
||||
paddd MSGTMP4,MSGTMP0
|
||||
sha256msg2 MSGTMP3,MSGTMP0
|
||||
pshufd $0x0E,MSG,MSG
|
||||
sha256rnds2 STATE1,STATE0
|
||||
sha256msg1 MSGTMP3,MSGTMP2
|
||||
|
||||
// Rounds 16-19
|
||||
movdqa MSGTMP0,MSG
|
||||
paddd 4*16(SHA256CONSTANTS),MSG
|
||||
sha256rnds2 STATE0,STATE1
|
||||
movdqa MSGTMP0,MSGTMP4
|
||||
palignr $4,MSGTMP3,MSGTMP4
|
||||
paddd MSGTMP4,MSGTMP1
|
||||
sha256msg2 MSGTMP0,MSGTMP1
|
||||
pshufd $0x0E,MSG,MSG
|
||||
sha256rnds2 STATE1,STATE0
|
||||
sha256msg1 MSGTMP0,MSGTMP3
|
||||
|
||||
// Rounds 20-23
|
||||
movdqa MSGTMP1,MSG
|
||||
paddd 5*16(SHA256CONSTANTS),MSG
|
||||
sha256rnds2 STATE0,STATE1
|
||||
movdqa MSGTMP1,MSGTMP4
|
||||
palignr $4,MSGTMP0,MSGTMP4
|
||||
paddd MSGTMP4,MSGTMP2
|
||||
sha256msg2 MSGTMP1,MSGTMP2
|
||||
pshufd $0x0E,MSG,MSG
|
||||
sha256rnds2 STATE1,STATE0
|
||||
sha256msg1 MSGTMP1,MSGTMP0
|
||||
|
||||
// Rounds 24-27
|
||||
movdqa MSGTMP2,MSG
|
||||
paddd 6*16(SHA256CONSTANTS),MSG
|
||||
sha256rnds2 STATE0,STATE1
|
||||
movdqa MSGTMP2,MSGTMP4
|
||||
palignr $4,MSGTMP1,MSGTMP4
|
||||
paddd MSGTMP4,MSGTMP3
|
||||
sha256msg2 MSGTMP2,MSGTMP3
|
||||
pshufd $0x0E,MSG,MSG
|
||||
sha256rnds2 STATE1,STATE0
|
||||
sha256msg1 MSGTMP2,MSGTMP1
|
||||
|
||||
// Rounds 28-31
|
||||
movdqa MSGTMP3,MSG
|
||||
paddd 7*16(SHA256CONSTANTS),MSG
|
||||
sha256rnds2 STATE0,STATE1
|
||||
movdqa MSGTMP3,MSGTMP4
|
||||
palignr $4,MSGTMP2,MSGTMP4
|
||||
paddd MSGTMP4,MSGTMP0
|
||||
sha256msg2 MSGTMP3,MSGTMP0
|
||||
pshufd $0x0E,MSG,MSG
|
||||
sha256rnds2 STATE1,STATE0
|
||||
sha256msg1 MSGTMP3,MSGTMP2
|
||||
|
||||
// Rounds 32-35
|
||||
movdqa MSGTMP0,MSG
|
||||
paddd 8*16(SHA256CONSTANTS),MSG
|
||||
sha256rnds2 STATE0,STATE1
|
||||
movdqa MSGTMP0,MSGTMP4
|
||||
palignr $4,MSGTMP3,MSGTMP4
|
||||
paddd MSGTMP4,MSGTMP1
|
||||
sha256msg2 MSGTMP0,MSGTMP1
|
||||
pshufd $0x0E,MSG,MSG
|
||||
sha256rnds2 STATE1,STATE0
|
||||
sha256msg1 MSGTMP0,MSGTMP3
|
||||
|
||||
// Rounds 36-39
|
||||
movdqa MSGTMP1,MSG
|
||||
paddd 9*16(SHA256CONSTANTS),MSG
|
||||
sha256rnds2 STATE0,STATE1
|
||||
movdqa MSGTMP1,MSGTMP4
|
||||
palignr $4,MSGTMP0,MSGTMP4
|
||||
paddd MSGTMP4,MSGTMP2
|
||||
sha256msg2 MSGTMP1,MSGTMP2
|
||||
pshufd $0x0E,MSG,MSG
|
||||
sha256rnds2 STATE1,STATE0
|
||||
sha256msg1 MSGTMP1,MSGTMP0
|
||||
|
||||
// Rounds 40-43
|
||||
movdqa MSGTMP2,MSG
|
||||
paddd 10*16(SHA256CONSTANTS),MSG
|
||||
sha256rnds2 STATE0,STATE1
|
||||
movdqa MSGTMP2,MSGTMP4
|
||||
palignr $4,MSGTMP1,MSGTMP4
|
||||
paddd MSGTMP4,MSGTMP3
|
||||
sha256msg2 MSGTMP2,MSGTMP3
|
||||
pshufd $0x0E,MSG,MSG
|
||||
sha256rnds2 STATE1,STATE0
|
||||
sha256msg1 MSGTMP2,MSGTMP1
|
||||
|
||||
// Rounds 44-47
|
||||
movdqa MSGTMP3,MSG
|
||||
paddd 11*16(SHA256CONSTANTS),MSG
|
||||
sha256rnds2 STATE0,STATE1
|
||||
movdqa MSGTMP3,MSGTMP4
|
||||
palignr $4,MSGTMP2,MSGTMP4
|
||||
paddd MSGTMP4,MSGTMP0
|
||||
sha256msg2 MSGTMP3,MSGTMP0
|
||||
pshufd $0x0E,MSG,MSG
|
||||
sha256rnds2 STATE1,STATE0
|
||||
sha256msg1 MSGTMP3,MSGTMP2
|
||||
|
||||
// Rounds 48-51
|
||||
movdqa MSGTMP0,MSG
|
||||
paddd 12*16(SHA256CONSTANTS),MSG
|
||||
sha256rnds2 STATE0,STATE1
|
||||
movdqa MSGTMP0,MSGTMP4
|
||||
palignr $4,MSGTMP3,MSGTMP4
|
||||
paddd MSGTMP4,MSGTMP1
|
||||
sha256msg2 MSGTMP0,MSGTMP1
|
||||
pshufd $0x0E,MSG,MSG
|
||||
sha256rnds2 STATE1,STATE0
|
||||
sha256msg1 MSGTMP0,MSGTMP3
|
||||
|
||||
// Rounds 52-55
|
||||
movdqa MSGTMP1,MSG
|
||||
paddd 13*16(SHA256CONSTANTS),MSG
|
||||
sha256rnds2 STATE0,STATE1
|
||||
movdqa MSGTMP1,MSGTMP4
|
||||
palignr $4,MSGTMP0,MSGTMP4
|
||||
paddd MSGTMP4,MSGTMP2
|
||||
sha256msg2 MSGTMP1,MSGTMP2
|
||||
pshufd $0x0E,MSG,MSG
|
||||
sha256rnds2 STATE1,STATE0
|
||||
|
||||
// Rounds 56-59
|
||||
movdqa MSGTMP2,MSG
|
||||
paddd 14*16(SHA256CONSTANTS),MSG
|
||||
sha256rnds2 STATE0,STATE1
|
||||
movdqa MSGTMP2,MSGTMP4
|
||||
palignr $4,MSGTMP1,MSGTMP4
|
||||
paddd MSGTMP4,MSGTMP3
|
||||
sha256msg2 MSGTMP2,MSGTMP3
|
||||
pshufd $0x0E,MSG,MSG
|
||||
sha256rnds2 STATE1,STATE0
|
||||
|
||||
// Rounds 60-63
|
||||
movdqa MSGTMP3,MSG
|
||||
paddd 15*16(SHA256CONSTANTS),MSG
|
||||
sha256rnds2 STATE0,STATE1
|
||||
pshufd $0x0E,MSG,MSG
|
||||
sha256rnds2 STATE1,STATE0
|
||||
|
||||
// Add current hash values with previously saved
|
||||
paddd ABEF_SAVE,STATE0
|
||||
paddd CDGH_SAVE,STATE1
|
||||
|
||||
// Increment data pointer and loop if more to process
|
||||
add $64,DATA_PTR
|
||||
cmp NUM_BLKS,DATA_PTR # NUM_BLKS now holds the end-of-data address
|
||||
jne .Lloop0
|
||||
|
||||
// Write hash values back in the correct order
|
||||
pshufd $0x1B,STATE0,STATE0 # FEBA
|
||||
pshufd $0xB1,STATE1,STATE1 # DCHG
|
||||
movdqa STATE0,MSGTMP4
|
||||
pblendw $0xF0,STATE1,STATE0 # DCBA
|
||||
palignr $8,MSGTMP4,STATE1 # HGFE
|
||||
|
||||
movdqu STATE0,0*16(DIGEST_PTR)
|
||||
movdqu STATE1,1*16(DIGEST_PTR)
|
||||
|
||||
.Ldone_hash:
|
||||
.leafepilogue
|
||||
.endfn sha256_transform_ni,globl
|
||||
|
||||
// pshufb control used on each 16-byte message chunk: as a little-endian
// 128-bit value its control bytes are 3,2,1,0, 7,6,5,4, ..., so the
// shuffle reverses the bytes inside each 32-bit word and keeps the
// words in place.
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK,"aM",@progbits,16
|
||||
.align 16
|
||||
PSHUFFLE_BYTE_FLIP_MASK:
|
||||
.octa 0x0c0d0e0f08090a0b0405060700010203
|
||||
.endobj PSHUFFLE_BYTE_FLIP_MASK
|
|
@ -51,7 +51,7 @@
|
|||
#include "libc/macros.internal.h"
|
||||
|
||||
.ident "\n\
|
||||
AVX2 SHA-512 (BSD-2 License)\n\
|
||||
AVX2 SHA2 (BSD-2 License)\n\
|
||||
Copyright 2013 Intel Corporation\n"
|
||||
.include "libc/disclaimer.inc"
|
||||
|
||||
|
|
|
@ -1,74 +0,0 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
|
||||
// Returns prefix length, consisting of chars not in reject.
|
||||
//
|
||||
// @param rdi is string
|
||||
// @param rsi is reject nul-terminated character set
|
||||
// @return rax is index of first byte in charset, or string length if none
|
||||
// @see strspn(), strtok_r()
|
||||
// @asyncsignalsafe
|
||||
strcspn:
|
||||
// Two strategies. A reject set of at most 15 bytes is copied into a
// zero-padded 16-byte buffer and each string byte is compared against
// all 16 lanes at once. Longer sets use the nested byte loop at 1/2/3.
push %rbp
|
||||
mov %rsp,%rbp
|
||||
.profilable
|
||||
sub $16,%rsp # 16-byte scratch buffer at -16(%rbp)
|
||||
push %rdi # keep string pointer across the call
|
||||
mov %rsi,%rdi
|
||||
call strlen # rax = length of reject set; code relies on %rsi surviving
|
||||
pop %rdi
|
||||
cmp $15,%rax
|
||||
ja 4f # more than 15 bytes: take the bytewise path
|
||||
push %rdi
|
||||
mov %rax,%rdx # byte count for the copy
|
||||
pxor %xmm0,%xmm0
|
||||
lea -16(%rbp),%rdi
|
||||
movdqa %xmm0,(%rdi) # zero the buffer so unused lanes are NUL
|
||||
call MemCpy # buffer <- reject; code relies on %rdi surviving (helper defined elsewhere)
|
||||
movdqa (%rdi),%xmm1 # xmm1 = reject set padded with NULs
|
||||
pop %rdi
|
||||
or $-1,%rax # start at -1 so the first inc gives index 0
|
||||
0: inc %rax
|
||||
movzbl (%rdi,%rax),%ecx
|
||||
movd %ecx,%xmm0
|
||||
punpcklbw %xmm0,%xmm0
|
||||
punpcklwd %xmm0,%xmm0
|
||||
pshufd $0,%xmm0,%xmm0 # current byte now replicated in all 16 lanes
|
||||
pcmpeqb %xmm1,%xmm0
|
||||
pmovmskb %xmm0,%ecx # nonzero if byte is in the set, or is NUL (matches padding)
|
||||
test %ecx,%ecx
|
||||
jz 0b
|
||||
9: leave
|
||||
ret
|
||||
1: cmp %ch,%cl # bytewise path: string byte equals this reject byte?
|
||||
je 9b
|
||||
inc %edx
|
||||
2: mov (%rsi,%rdx),%ch # next reject byte
|
||||
test %ch,%ch
|
||||
jne 1b
|
||||
inc %rax # reject set exhausted without a match: advance in string
|
||||
3: mov (%rdi,%rax),%cl
|
||||
test %cl,%cl
|
||||
je 9b # end of string: return its length
|
||||
xor %edx,%edx # restart scan of reject set
|
||||
jmp 2b
|
||||
4: xor %eax,%eax
|
||||
jmp 3b
|
||||
.endfn strcspn,globl
|
|
@ -1,51 +0,0 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/macros.internal.h"
|
||||
|
||||
// Returns length of NUL-terminated string.
|
||||
//
|
||||
// @param rdi is non-null NUL-terminated string pointer
|
||||
// @return rax is number of bytes (excluding NUL)
|
||||
// @clob ax,dx,cx,xmm3,xmm4
|
||||
// @note h/t agner fog
|
||||
// @asyncsignalsafe
|
||||
// Scans 16 bytes at a time using aligned loads. The first load starts at
// the 16-byte boundary at or below the string pointer, and bits for
// bytes that precede the string are discarded before searching for NUL.
strlen: .leafprologue
|
||||
.profilable
|
||||
mov %rdi,%rax
|
||||
mov %edi,%ecx
|
||||
and $15,%ecx # ecx = offset of string within its 16-byte chunk
|
||||
and $-16,%rax # rax = string pointer rounded down to 16 bytes
|
||||
pxor %xmm4,%xmm4 # all-zero vector to compare against
|
||||
movdqa (%rax),%xmm3
|
||||
pcmpeqb %xmm4,%xmm3
|
||||
pmovmskb %xmm3,%edx # bit i set when byte i of the chunk is NUL
|
||||
shr %cl,%edx
|
||||
shl %cl,%edx # shift pair clears bits for bytes before the string start
|
||||
bsf %edx,%edx # index of first NUL; ZF set when mask was zero
|
||||
jnz 2f
|
||||
1: lea 16(%rax),%rax # advance to next chunk
|
||||
movdqa (%rax),%xmm3
|
||||
pcmpeqb %xmm4,%xmm3
|
||||
pmovmskb %xmm3,%edx
|
||||
bsf %edx,%edx
|
||||
jz 1b # no NUL in this chunk
|
||||
2: add %rdx,%rax # rax = address of the terminating NUL
|
||||
sub %rdi,%rax # minus start address gives the length
|
||||
.leafepilogue
|
||||
.endfn strlen,globl
|
Loading…
Add table
Add a link
Reference in a new issue