Make numerous improvements

- Python static hello world now 1.8 MB
- Python static fully loaded now 10 MB
- Python HTTPS client now uses MbedTLS
- Python REPL now completes import stmts
- Increase stack size for Python for now
- Begin synthesizing posixpath and ntpath
- Restore Python \N{UNICODE NAME} support
- Restore Python NFKD symbol normalization
- Add optimized code path for Intel SHA-NI
- Get more Python unit tests passing faster
- Get Python help() pagination working on NT
- Python hashlib now supports MbedTLS PBKDF2
- Make memcpy/memmove/memcmp/bcmp/etc. faster
- Add Mersenne Twister and Vigna to LIBC_RAND
- Provide privileged __printf() for error code
- Fix zipos opendir() so that it reports ENOTDIR
- Add basic chmod() implementation for Windows NT
- Add Cosmo's best functions to Python cosmo module
- Pin function trace indent depth to that of caller
- Show memory diagram on invalid access in MODE=dbg
- Differentiate stack overflow on crash in MODE=dbg
- Add stb_truetype and tools for analyzing font files
- Upgrade to UNICODE 13 and reduce its binary footprint
- COMPILE.COM now logs resource usage of build commands
- Start implementing basic poll() support on bare metal
- Set getauxval(AT_EXECFN) to GetModuleFileName() on NT
- Add descriptions to strerror() in non-TINY build modes
- Add COUNTBRANCH() macro to help with micro-optimizations
- Make error / backtrace / asan / memory code more unbreakable
- Add fast, perfect C implementations of μ-Law and A-Law audio codecs
- Make strtol() functions consistent with other libc implementations (see the sketch after this list)
- Improve Linenoise implementation (see also github.com/jart/bestline)
- COMPILE.COM now suppresses stdout/stderr of successful build commands
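A quick illustration for the strtol() item above. This is a hedged, generic C sketch of the semantics most libc implementations agree on (endptr positioning, ERANGE clamping); the parse_port helper and its range check are illustrative only, not code from this commit.

    #include <errno.h>
    #include <limits.h>
    #include <stdlib.h>

    /* Illustrative helper, not from this commit: endptr lands on the
       first unconsumed character, and out-of-range input clamps to
       LONG_MIN/LONG_MAX with errno set to ERANGE. */
    int parse_port(const char *s, long *out) {
      char *end;
      long v;
      errno = 0;
      v = strtol(s, &end, 10);
      if (end == s || *end != '\0') return -1;   /* no digits, or trailing junk */
      if (errno == ERANGE || v < 0 || v > 65535) return -1;
      *out = v;
      return 0;
    }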
Justine Tunney 2021-09-27 22:58:51 -07:00
parent fa7b4f5bd1
commit 39bf41f4eb
806 changed files with 77494 additions and 63859 deletions

@ -14,6 +14,7 @@ struct RlDecode {
};
void rldecode(void *dest, const struct RlDecode *) hidden;
void rldecode2(void *dest, const struct RlDecode *) hidden;
const uint8_t *lz4check(const void *data) hidden;
void *lz4cpy(void *dest, const void *blockdata, size_t blocksize) hidden;
void *lz4decode(void *dest, const void *src) hidden;

@ -1,7 +1,7 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Copyright 2021 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
@ -18,36 +18,23 @@
*/
#include "libc/macros.internal.h"
// Copies memory.
//
// DEST and SRC may overlap.
//
// @param rdi is dest
// @param rsi is src
// @param rdx is number of bytes
// @return original rdi copied to rax
// @clob flags,rcx
// @asyncsignalsafe
memmove:
mov %rdi,%rax
// 𝑠𝑙𝑖𝑑𝑒
.endfn MemMove,globl,hidden
MemMove:
.leafprologue
.profilable
push %rdi
push %rsi
mov %rdx,%rcx
cmp %rsi,%rdi
jb 1f
lea -1(%rdi,%rcx),%rdi
lea -1(%rsi,%rcx),%rsi
std
1: rep movsb
cld
pop %rsi
pop %rdi
.leafepilogue
.endfn memmove,globl
.source __FILE__
.rodata
.align 64
kSha256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.endobj kSha256,globl
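memmove.S above picks its copy direction from the pointer order: it copies forward with rep movsb when the destination sits below the source, and backward (direction flag set) otherwise, which is what keeps overlapping moves safe. Below is a minimal C sketch of that strategy (the name memmove_sketch is illustrative, not the shipped routine); a strictly conforming version would compare the addresses as uintptr_t, the relational test here just mirrors the cmp %rsi,%rdi above.

    #include <stddef.h>

    void *memmove_sketch(void *dst, const void *src, size_t n) {
      unsigned char *d = dst;
      const unsigned char *s = src;
      size_t i;
      if (d < s) {                              /* dest below src: go forward */
        for (i = 0; i < n; ++i) d[i] = s[i];
      } else {                                  /* dest above src: go backward */
        for (i = n; i; --i) d[i - 1] = s[i - 1];
      }
      return dst;
    }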

@ -1,556 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
@fileoverview Cosmopolitan Memory Copying
Of all the functions in the technology industry, none are more
critical than the Kernighan & Ritchie Memory Copy API for the C
Language, 1972 model: more commonly known as memcpy(). It's the
world's most popular function, one all programmers love.
This implementation is the fastest and nearly the tiniest too.
It doesn't break when copying backwards or on misaligned data.
It's so easy that even a child could use it, and they do.
*/
#include "libc/nexgen32e/x86feature.h"
#include "libc/macros.internal.h"
// Copies memory.
//
// DEST and SRC must not overlap, unless DEST ≤ SRC.
//
// @param rdi is dest
// @param rsi is src
// @param rdx is number of bytes
// @return original rdi copied to rax
// @mode long
// @asyncsignalsafe
memcpy: mov %rdi,%rax
// 𝑠𝑙𝑖𝑑𝑒
.align 16
.endfn memcpy,globl
// Copies memory w/ minimal impact ABI.
//
// @param rdi is dest
// @param rsi is src
// @param rdx is number of bytes
// @clob flags,rcx,xmm3,xmm4
// @mode long
MemCpy: .leafprologue
.profilable
mov $.Lmemcpytab.ro.size,%ecx
cmp %rcx,%rdx
cmovb %rdx,%rcx
jmp *memcpytab(,%rcx,8)
.Lanchorpoint:
.L32r: cmp $1024,%rdx
jae .Lerms
.L32: vmovdqu -32(%rsi,%rdx),%ymm4
mov $32,%rcx
0: add $32,%rcx
vmovdqu -64(%rsi,%rcx),%ymm3
vmovdqu %ymm3,-64(%rdi,%rcx)
cmp %rcx,%rdx
ja 0b
vmovdqu %ymm4,-32(%rdi,%rdx)
vxorps %ymm4,%ymm4,%ymm4
vxorps %ymm3,%ymm3,%ymm3
jmp .L0
.L16r: cmp $1024,%rdx
jae .Lerms
.L16: movdqu -16(%rsi,%rdx),%xmm4
mov $16,%rcx
0: add $16,%rcx
movdqu -32(%rsi,%rcx),%xmm3
movdqu %xmm3,-32(%rdi,%rcx)
cmp %rcx,%rdx
ja 0b
movdqu %xmm4,-16(%rdi,%rdx)
pxor %xmm4,%xmm4
pxor %xmm3,%xmm3
jmp .L0
.L8: push %rbx
mov (%rsi),%rcx
mov -8(%rsi,%rdx),%rbx
mov %rcx,(%rdi)
mov %rbx,-8(%rdi,%rdx)
1: pop %rbx
.L0: .leafepilogue
.L4: push %rbx
mov (%rsi),%ecx
mov -4(%rsi,%rdx),%ebx
mov %ecx,(%rdi)
mov %ebx,-4(%rdi,%rdx)
jmp 1b
.L3: push %rbx
mov (%rsi),%cx
mov -2(%rsi,%rdx),%bx
mov %cx,(%rdi)
mov %bx,-2(%rdi,%rdx)
jmp 1b
.L2: mov (%rsi),%cx
mov %cx,(%rdi)
jmp .L0
.L1: mov (%rsi),%cl
mov %cl,(%rdi)
jmp .L0
.Lerms:
#ifdef TINY
cmp $1024*1024,%rdx
#else
cmp kHalfCache3(%rip),%rdx
#endif
ja .Lnts
push %rdi
push %rsi
mov %rdx,%rcx
rep movsb
pop %rsi
pop %rdi
jmp .L0
.Lnts: movdqu (%rsi),%xmm3
movdqu %xmm3,(%rdi)
lea 16(%rdi),%rcx
and $-16,%rcx
sub %rdi,%rcx
add %rcx,%rdi
add %rcx,%rsi
sub %rcx,%rdx
mov $16,%rcx
0: add $16,%rcx
movdqu -32(%rsi,%rcx),%xmm3
movntdq %xmm3,-32(%rdi,%rcx)
cmp %rcx,%rdx
ja 0b
sfence
movdqu -16(%rsi,%rdx),%xmm3
movdqu %xmm3,-16(%rdi,%rdx)
pxor %xmm3,%xmm3
jmp .L0
.endfn MemCpy,globl,hidden
.source __FILE__
.initro 300,_init_memcpy
memcpytab.ro:
.byte .L0-.Lanchorpoint
.byte .L1-.Lanchorpoint
.byte .L2-.Lanchorpoint
.byte .L3-.Lanchorpoint
.rept 4
.byte .L4-.Lanchorpoint
.endr
.rept 8
.byte .L8-.Lanchorpoint
.endr
.rept 16
.byte .L16-.Lanchorpoint
.endr
.equ .Lmemcpytab.ro.size,.-memcpytab.ro
.endobj memcpytab.ro
.if .Lmemcpytab.ro.size % 8
.error "moar jmptab"
.endif
.byte .L16-.Lanchorpoint # SSE2
.byte .L16r-.Lanchorpoint # SSE2 + ERMS
.byte .L32-.Lanchorpoint # AVX
.byte .L32r-.Lanchorpoint # AVX + ERMS
.byte 0,0,0,0
.previous
.initbss 300,_init_memcpy
memcpytab:
.rept .Lmemcpytab.ro.size
.quad 0
.endr
.quad 0
.endobj memcpytab
.previous
.init.start 300,_init_memcpy
pushpop .Lmemcpytab.ro.size,%rcx
ezlea .Lanchorpoint,dx
testb X86_HAVE(AVX)+kCpuids(%rip)
call memjmpinit
.init.end 300,_init_memcpy
/* your memcpy() 375 bytes
bionic memcpy() 1,429 bytes
glibc memcpy() 27,216 bytes
musl memcpy() 49 bytes
newlib memcpy() 300 bytes
benchmarks on intel core i7-6700 @ 3.40GHz (skylake)
includes function call overhead (unless marked otherwise)
your memcpy(𝑛) for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 297.000 35.125 35.203 92
1 35.000 35.625 35.016 93
2 27.500 17.438 17.555 185
3 21.000 11.875 12.057 270
4 16.250 8.719 8.809 369
7 5.000 4.946 5.069 641
8 7.375 4.422 4.365 745
15 4.067 2.342 2.336 1391
16 4.188 2.242 2.257 1440 «
31 8.032 1.157 1.147 2835
32 2.031 1.723 1.325 2454
63 1.000 0.589 0.589 5523
64 0.578 0.580 0.577 5630 «
127 0.638 0.377 0.320 10151
128 0.289 0.296 0.307 10605
255 0.404 0.202 0.194 16741
256 0.160 0.165 0.166 19574 «
511 0.159 0.123 0.110 29458
512 0.139 0.098 0.097 33571 «
1023 0.107 0.086 0.074 44111
1024 0.103 0.084 0.082 39489
2047 0.057 0.056 0.057 57450
2048 0.055 0.055 0.055 59269
4095 0.044 0.044 0.044 74051
4096 0.043 0.043 0.043 75300 «
8191 0.036 0.036 0.036 91301
8192 0.036 0.035 0.035 92411
16383 0.033 0.032 0.032 102163
16384 0.034 0.032 0.032 102145 « (L1)/2
32767 0.098 0.081 0.077 42271
32768 0.077 0.077 0.076 42781
65535 0.088 0.075 0.072 44973
65536 0.074 0.072 0.071 45520
131071 0.086 0.075 0.072 44869
131072 0.077 0.073 0.072 45076 « (L2)/2
262143 0.095 0.096 0.095 34116
262144 0.096 0.096 0.095 34160
524287 0.102 0.109 0.111 29359
524288 0.107 0.109 0.108 30033
1048575 0.102 0.103 0.104 31112
1048576 0.101 0.103 0.103 31605
2097151 0.104 0.103 0.109 29929
2097152 0.108 0.110 0.103 31652
4194303 0.192 0.172 0.172 18950
4194304 0.168 0.161 0.160 20311 « (L3)/2
8388607 0.339 0.329 0.344 9461 « RAM
8388608 0.384 0.369 0.341 9545
Bionic memcpy() for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 347.000 40.625 35.984 90
1 37.000 35.625 36.734 89
2 28.500 18.688 18.383 177
3 11.667 12.375 12.359 263
4 12.250 9.406 9.020 361
7 5.000 5.018 5.118 636
8 11.625 5.828 4.779 681
15 3.533 3.158 2.620 1243
16 4.688 2.742 2.884 1129 «
31 1.903 1.262 1.172 2778
32 1.344 1.113 1.125 2895
63 1.444 0.633 0.591 5513
64 0.766 0.580 0.581 5605 «
127 0.512 0.383 0.318 10229
128 0.461 0.315 0.311 10463
255 0.475 0.216 0.193 16840
256 0.371 0.236 0.199 16397 «
511 0.295 0.144 0.120 27223
512 0.240 0.151 0.126 25937 «
1023 0.142 0.101 0.088 36947
1024 0.126 0.108 0.091 35889
2047 0.088 0.074 0.072 45475
2048 0.089 0.077 0.073 44380
4095 0.081 0.065 0.064 50766
4096 0.068 0.066 0.065 50246 «
8191 0.063 0.061 0.060 54075
8192 0.065 0.061 0.061 53731
16383 0.082 0.066 0.061 53765
16384 0.067 0.063 0.062 52765 « (L1)/2
32767 0.102 0.085 0.085 38406
32768 0.086 0.085 0.085 38473
65535 0.098 0.085 0.085 38292
65536 0.086 0.085 0.085 38369
131071 0.438 0.177 0.089 36716
131072 0.092 0.090 0.093 34880 « (L2)/2
262143 0.306 0.146 0.127 25601
262144 0.126 0.168 0.127 25704
524287 0.213 0.152 0.136 23993
524288 0.132 0.159 0.133 24570
1048575 0.127 0.129 0.130 25117
1048576 0.128 0.129 0.130 25107
2097151 0.127 0.127 0.129 25199
2097152 0.127 0.136 0.134 24274
4194303 0.216 0.192 0.228 14237
4194304 0.351 0.351 0.356 9139 « (L3)/2
8388607 0.323 0.293 0.298 10903 « RAM
8388608 0.365 0.296 0.300 10844
GCC builtin (Inline REP MOVSB) for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 53.000 50.625 50.453 64
1 47.000 49.375 49.141 66
2 23.500 25.062 24.898 131
3 15.667 16.792 16.880 193
4 11.750 12.531 12.957 251
7 7.000 7.125 7.190 452
8 6.125 7.578 6.322 514
15 3.133 3.325 3.372 964
16 3.062 3.117 3.132 1038 «
31 1.645 1.601 1.620 2007
32 1.531 1.559 1.585 2051
63 0.778 0.796 0.802 4056
64 0.766 0.768 0.767 4238 «
127 0.480 0.446 0.448 7259
128 0.445 0.419 0.423 7693
255 0.239 0.239 0.236 13781
256 0.238 0.225 0.225 14466 «
511 0.127 0.133 0.132 24555
512 0.123 0.127 0.128 25377 «
1023 0.079 0.081 0.081 40346
1024 0.075 0.077 0.078 41714
2047 0.053 0.055 0.055 59575
2048 0.053 0.053 0.053 60795
4095 0.042 0.043 0.043 75843
4096 0.042 0.042 0.042 77153
8191 0.035 0.036 0.036 91518
8192 0.035 0.035 0.035 92603
16383 0.032 0.032 0.032 102407
16384 0.033 0.032 0.032 102864 « (L1)/2
32767 0.106 0.082 0.078 41486
32768 0.079 0.078 0.079 41290
65535 0.090 0.077 0.075 43565
65536 0.074 0.074 0.073 44299
131071 0.091 0.078 0.075 43196
131072 0.078 0.076 0.074 43673 « (L2)/2
262143 0.097 0.099 0.098 33192
262144 0.098 0.098 0.098 33193
524287 0.105 0.111 0.111 29212
524288 0.109 0.111 0.111 29211
1048575 0.107 0.108 0.108 30069
1048576 0.106 0.112 0.105 30886
2097151 0.105 0.103 0.103 31621
2097152 0.102 0.103 0.104 31280
4194303 0.180 0.158 0.176 18456
4194304 0.167 0.155 0.154 21098 « (L3)/2
8388607 0.538 0.576 0.557 5834 « RAM
8388608 0.750 0.579 0.552 5893
glibc memcpy() for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 139.000 90.125 84.891 38
1 83.000 82.125 84.359 39
2 61.500 46.438 45.164 72
3 41.667 32.458 31.245 104
4 32.750 26.156 24.410 133
7 20.143 16.732 16.033 203
8 13.375 8.328 6.908 471
15 8.200 6.408 5.753 565
16 4.438 3.570 3.466 938 «
31 3.258 2.891 2.786 1167
32 2.281 1.801 1.732 1878
63 1.635 1.431 1.374 2367
64 1.109 0.896 0.868 3747 «
127 0.921 0.792 0.779 4176
128 0.508 0.511 0.494 6589
255 0.451 0.407 0.402 8081
256 0.324 0.269 0.260 12498 «
511 0.249 0.218 0.212 15335
512 0.178 0.149 0.146 22297 «
1023 0.138 0.124 0.121 26947
1024 0.087 0.089 0.087 37238
2047 0.084 0.077 0.076 43046
2048 0.066 0.059 0.058 56120
4095 0.058 0.054 0.054 60706
4096 0.050 0.046 0.046 71092 «
8191 0.043 0.042 0.042 78259
8192 0.037 0.037 0.037 87409
16383 0.037 0.036 0.035 92065
16384 0.034 0.034 0.033 97942 « (L1)/2
32767 0.104 0.084 0.080 40572
32768 0.079 0.079 0.079 41055
65535 0.094 0.080 0.076 42885
65536 0.077 0.075 0.075 43423
131071 0.092 0.080 0.078 41498
131072 0.082 0.078 0.077 42350 « (L2)/2
262143 0.100 0.101 0.287 11342
262144 0.099 0.099 0.098 33177
524287 0.106 0.111 0.110 29609
524288 0.107 0.119 0.110 29608
1048575 0.104 0.105 0.106 30626
1048576 0.104 0.111 0.105 30878
2097151 0.103 0.103 0.103 31606
2097152 0.102 0.103 0.103 31644
4194303 0.174 0.160 0.165 19714
4194304 0.166 0.157 0.154 21110 « (L3)/2
8388607 0.537 0.554 0.565 5750 « RAM
8388608 0.701 0.537 0.552 5884
musl memcpy() for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 97.000 80.625 79.891 41
1 77.000 78.875 78.266 42
2 49.500 44.062 42.102 77
3 33.667 32.792 30.651 106
4 29.750 24.281 24.137 135
7 19.000 16.161 15.734 207
8 12.125 7.766 6.721 484
15 8.867 5.892 5.714 569
16 5.062 3.742 3.458 940
31 3.645 2.915 2.715 1198
32 2.156 1.723 1.663 1956
63 1.540 1.367 1.333 2440
64 1.078 0.873 0.833 3905
127 0.874 0.771 0.737 4415
128 0.617 0.487 0.481 6766
255 0.443 0.390 0.382 8504
256 0.316 0.259 0.259 12545
511 0.245 0.232 0.237 13742
512 0.174 0.159 0.208 15668
1023 0.181 0.193 0.182 17821
1024 0.155 0.123 0.114 28579
2047 0.102 0.092 0.085 38219
2048 0.064 0.073 0.070 46577
4095 0.058 0.067 0.065 50272
4096 0.049 0.055 0.055 59467
8191 0.057 0.052 0.049 66468
8192 0.053 0.050 0.051 63557
16383 0.082 0.065 0.064 50897
16384 0.066 0.065 0.061 53697 « (L1)/2
32767 0.121 0.100 0.114 28555
32768 0.093 0.091 0.114 28615
65535 0.118 0.102 0.142 22858
65536 0.108 0.274 0.097 33432
131071 0.117 0.109 0.109 29905
131072 0.110 0.195 0.113 28692 « (L2)/2
262143 0.283 0.166 0.122 26638
262144 0.130 0.144 0.123 26544
524287 0.210 0.153 0.130 25079
524288 0.126 0.128 0.123 26422
1048575 0.139 0.107 0.106 30803
1048576 0.104 0.105 0.106 30683
2097151 0.103 0.103 0.103 31564
2097152 0.102 0.103 0.103 31531
4194303 0.242 0.158 0.169 19238
4194304 0.166 0.161 0.154 21072 « (L3)/2
8388607 0.533 0.549 0.599 5422 « RAM
8388608 0.768 0.630 0.560 5801
newlib (aka. cygwin) memcpy() for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 61.000 52.875 53.141 61
1 49.000 49.875 50.328 65
2 24.500 24.812 26.727 122
3 15.667 20.125 16.943 192
4 12.750 15.281 13.090 248
7 7.000 7.375 7.431 438
8 5.875 6.422 6.377 510
15 3.267 3.375 3.447 943
16 10.062 6.945 6.386 509
31 2.548 2.488 2.545 1278
32 3.156 3.207 3.201 1016
63 1.190 1.220 1.229 2646
64 1.578 1.588 1.599 2033
127 0.717 0.690 0.685 4744
128 0.820 0.856 0.857 3795
255 0.357 0.359 0.358 9077
256 0.629 0.461 0.426 7630
511 0.260 0.219 0.204 15947
512 0.330 0.299 0.268 12113
1023 0.269 0.175 0.162 20042
1024 0.315 0.201 0.196 16633
2047 0.349 0.241 0.236 13790
2048 0.332 0.269 0.264 12295
4095 0.349 0.295 0.287 11348
4096 0.361 0.313 0.303 10748
8191 0.361 0.317 0.322 10110
8192 0.369 0.326 0.319 10201
16383 0.321 0.322 0.327 9940
16384 0.309 0.330 0.329 9878 « (L1)/2
32767 0.291 0.303 0.307 10599
32768 0.314 0.304 0.305 10667
65535 0.373 0.311 0.313 10396
65536 0.305 0.750 0.421 7729
131071 0.329 0.427 0.384 8470
131072 0.329 0.388 0.361 9020 « (L2)/2
262143 0.520 0.389 0.425 7646
262144 0.364 0.400 0.368 8843
524287 0.449 0.389 0.389 8353
524288 0.384 0.379 0.384 8466
1048575 0.436 0.397 0.401 8107
1048576 0.431 0.397 0.401 8112
2097151 0.417 0.567 0.434 7498
2097152 0.457 0.503 0.427 7621
4194303 0.328 0.348 0.368 8822
4194304 0.343 0.352 0.352 9221 « (L3)/2
8388607 0.313 0.319 0.326 9957 « RAM
8388608 0.366 0.320 0.328 9910
openbsd memcpy() for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 73.000 41.375 41.484 78
1 39.000 39.875 41.641 78
2 28.500 20.688 21.227 153
3 27.000 15.875 15.557 209
4 16.750 12.656 12.520 260
7 20.429 10.982 10.292 316
8 8.625 5.234 5.576 583
15 7.267 4.758 4.920 661
16 4.312 2.742 2.747 1183
31 4.613 2.891 2.555 1272
32 2.844 1.520 1.441 2256
63 2.397 1.268 1.328 2449
64 1.547 0.822 0.769 4226
127 1.189 0.782 0.671 4842
128 0.727 0.532 0.460 7066
255 0.631 0.463 0.414 7856
256 0.543 0.374 0.302 10775
511 0.542 0.316 0.276 11785
512 0.354 0.260 0.224 14494
1023 0.267 0.245 0.229 14201
1024 0.251 0.200 0.197 16496
2047 0.214 0.226 0.181 17941
2048 0.189 0.167 0.166 19575
4095 0.200 0.168 0.163 19957
4096 0.165 0.155 0.153 21219
8191 0.158 0.153 0.151 21578
8192 0.153 0.148 0.147 22138
16383 0.173 0.148 0.146 22319
16384 0.153 0.487 0.188 17298 « (L1)/2
32767 0.161 0.151 0.192 16893
32768 0.151 0.314 0.213 15275
65535 0.157 0.154 0.148 21969
65536 0.147 0.145 0.145 22493
131071 0.152 0.151 0.154 21145
131072 0.148 0.229 0.158 20564 « (L2)/2
262143 0.320 0.183 0.162 20031
262144 0.330 0.205 0.167 19503
524287 0.159 0.171 0.163 19913
524288 0.250 0.189 0.162 20120
1048575 0.157 0.164 0.161 20182
1048576 0.155 0.156 0.157 20672
2097151 0.161 0.158 0.157 20644
2097152 0.158 0.157 0.165 19727
4194303 0.327 0.256 0.238 13684
4194304 0.232 0.220 0.236 13749 « (L3)/2
8388607 0.721 0.689 0.586 5549 « RAM
8388608 0.943 0.569 0.593 5481 */
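One trick worth noting from the deleted memcpy.S: sizes up to 16 are dispatched through a jump table, and the 8..16 byte case (.L8) copies a possibly overlapping head and tail word instead of looping. A hedged C sketch of that idea, with an illustrative helper name:

    #include <stdint.h>
    #include <string.h>

    /* Copy 8..16 bytes using two 8-byte moves that may overlap in the
       middle, mirroring the .L8 path above. */
    static void copy8to16(void *dst, const void *src, size_t n) {
      uint64_t head, tail;
      memcpy(&head, src, 8);
      memcpy(&tail, (const char *)src + n - 8, 8);
      memcpy(dst, &head, 8);
      memcpy((char *)dst + n - 8, &tail, 8);
    }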

@ -1,33 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.source __FILE__
// Copies memory.
//
// DEST and SRC must not overlap unless DEST ≤ SRC.
//
// @param rdi is dest
// @param rsi is src
// @param rdx is number of bytes
// @return original rdi + rdx copied to rax
mempcpy:
lea (%rdi,%rdx),%rax
jmp MemCpy
.endfn mempcpy,globl
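mempcpy.S above is a two-instruction wrapper: it loads dest + n into the return register, then tail-calls the shared copy routine. Its C equivalent (illustrative name) is simply:

    #include <string.h>

    void *mempcpy_equivalent(void *dst, const void *src, size_t n) {
      return (char *)memcpy(dst, src, n) + n;
    }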

@ -1,406 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 sw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
@fileoverview Cosmopolitan Memory Setter
This sets one bit per picosecond on a $900 Skylake workstation,
which is about 110 GBps. */
#include "libc/nexgen32e/x86feature.h"
#include "libc/nexgen32e/macros.h"
#include "libc/macros.internal.h"
// Sets memory.
//
// @param rdi is dest
// @param esi is the byte to set
// @param edx is the number of bytes to set
// @return original rdi copied to rax
// @mode long
// @asyncsignalsafe
memset: mov %rdi,%rax
// 𝑠𝑙𝑖𝑑𝑒
.align 16
.endfn memset,globl
// Sets memory w/ minimal-impact ABI.
//
// @param rdi is dest
// @param esi is the byte to set
// @param edx is the number of bytes to set
// @clob flags,rcx,xmm3
// @mode long
MemSet: .leafprologue
.profilable
mov $.Lmemsettab.ro.size,%ecx
cmp %rcx,%rdx
cmovb %rdx,%rcx
jmp *memsettab(,%rcx,8)
.Lanchorpoint:
.L32r: cmp $1024,%rdx
jae .Lerms
.L32: vmovd %esi,%xmm3
vpbroadcastb %xmm3,%ymm3
mov $32,%ecx
1: lea 32(%rcx),%rcx
vmovdqu %ymm3,-64(%rdi,%rcx)
cmp %rcx,%rdx
ja 1b
vmovdqu %ymm3,-32(%rdi,%rdx)
vpxor %ymm3,%ymm3,%ymm3
jmp .L0
.L16r: cmp $1024,%rdx
jae .Lerms
.L16: movd %esi,%xmm3
pbroadcastb %xmm3
mov $16,%ecx
1: lea 16(%rcx),%rcx
movdqu %xmm3,-32(%rdi,%rcx)
cmp %rcx,%rdx
ja 1b
movdqu %xmm3,-16(%rdi,%rdx)
pxor %xmm3,%xmm3
.L0: .leafepilogue
.L8: movzbl %sil,%ecx
imul .Lb8(%rip),%rcx
mov %rcx,(%rdi)
mov %rcx,-8(%rdi,%rdx)
jmp .L0
.L4: movzbl %sil,%ecx
imul $0x01010101,%ecx,%ecx
mov %ecx,(%rdi)
mov %ecx,-4(%rdi,%rdx)
jmp .L0
.L3: mov %sil,2(%rdi)
.L2: mov %sil,1(%rdi)
.L1: mov %sil,(%rdi)
jmp .L0
.Lerms: push %rax
push %rdi
mov %esi,%eax
mov %rdx,%rcx
rep stosb
pop %rdi
pop %rax
jmp .L0
.endfn MemSet,globl,hidden
.source __FILE__
.rodata.cst8
.Lb8: .quad 0x0101010101010101
.previous
.initro 300,_init_memset
memsettab.ro:
.byte .L0 - .Lanchorpoint
.byte .L1 - .Lanchorpoint
.byte .L2 - .Lanchorpoint
.byte .L3 - .Lanchorpoint
.rept 4
.byte .L4 - .Lanchorpoint
.endr
.rept 8
.byte .L8 - .Lanchorpoint
.endr
.rept 16
.byte .L16 - .Lanchorpoint
.endr
.equ .Lmemsettab.ro.size,.-memsettab.ro
.endobj memsettab.ro
.if .Lmemsettab.ro.size % 8
.error "moar jmptab"
.endif
.byte .L16 - .Lanchorpoint # SSE2
.byte .L16r - .Lanchorpoint # SSE2 + ERMS
.byte .L32 - .Lanchorpoint # AVX2
.byte .L32r - .Lanchorpoint # AVX2 + ERMS
.byte 0,0,0,0
.previous
.initbss 300,_init_memset
memsettab:
.rept .Lmemsettab.ro.size
.quad 0
.endr
.quad 0
.endobj memsettab
.previous
.init.start 300,_init_memset
pushpop .Lmemsettab.ro.size,%rcx
ezlea .Lanchorpoint,dx
testb X86_HAVE(AVX2)+kCpuids(%rip)
call memjmpinit
.init.end 300,_init_memset
/* benchmarks on intel core i7-6700 @ 3.40GHz (skylake)
includes function call overhead (unless marked otherwise)
Your memset() for #c per n where c ≈ 0.273ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 73.000 35.125 36.141 97
1 35.000 36.375 35.984 97
2 28.500 19.938 18.820 185
3 19.000 12.458 12.651 276
4 15.750 10.719 9.566 365
7 5.000 5.411 5.730 609
8 8.375 4.953 4.697 743
15 4.200 2.408 2.407 1450
16 7.188 2.539 2.382 1465 «
31 1.129 1.206 1.183 2950
32 15.156 2.012 1.292 2702
63 4.016 0.986 0.663 5264
64 3.547 0.967 0.684 5104
127 2.087 0.562 0.338 10311
128 1.805 0.499 0.336 10393
255 0.412 0.180 0.183 19119
256 0.160 0.170 0.169 20650
511 0.162 0.134 0.108 32214
512 0.100 0.106 0.104 33507
1023 0.110 0.095 0.082 42574
1024 0.099 0.080 0.078 44944
2047 0.155 0.154 0.154 22624
2048 0.052 0.052 0.053 66266
4095 0.098 0.099 0.099 35142
4096 0.042 0.042 0.041 84250
8191 0.072 0.073 0.072 48157
8192 0.034 0.034 0.034 101332
16383 0.059 0.059 0.059 58997
16384 0.031 0.031 0.031 112972
32767 0.054 0.054 0.054 65053
32768 0.029 0.029 0.029 119433
65535 0.069 0.069 0.068 51690
65536 0.057 0.057 0.057 61434
131071 0.066 0.066 0.066 53001
131072 0.057 0.058 0.057 60716
262143 0.066 0.065 0.065 53462
262144 0.060 0.058 0.058 60104
524287 0.067 0.068 0.072 48784
524288 0.063 0.062 0.061 56957
1048575 0.068 0.068 0.069 50353
1048576 0.062 0.060 0.062 56661
2097151 0.066 0.066 0.067 52421
2097152 0.060 0.060 0.061 57672
4194303 0.072 0.067 0.067 51910
4194304 0.062 0.061 0.062 56327
8388607 0.129 0.111 0.111 31368
8388608 0.136 0.119 0.111 31519
glibc memset() for #c per n where c ≈ 0.273ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 121.000 39.125 35.547 98
1 33.000 35.875 35.172 99
2 17.500 18.312 18.070 193
3 16.333 14.542 12.411 281
4 12.250 9.344 9.215 379
7 7.571 5.732 5.453 640
8 4.625 4.641 4.623 755
15 4.467 3.158 2.478 1408
16 2.312 2.289 2.468 1414
31 2.290 1.367 1.278 2731
32 1.219 1.176 1.182 2952
63 0.905 0.696 0.656 5320
64 0.672 0.658 0.660 5285
127 1.299 0.723 0.673 5183
128 0.508 0.423 0.424 8227
255 0.490 0.428 0.417 8367
256 0.293 0.233 0.243 14349
511 0.284 0.232 0.234 14902
512 0.154 0.131 0.131 26626
1023 0.155 0.137 0.135 25839
1024 0.089 0.078 0.080 43875
2047 0.103 0.092 0.090 38672
2048 0.060 0.054 0.054 65116
4095 0.073 0.068 0.068 51405
4096 0.046 0.042 0.042 82162
8191 0.060 0.058 0.057 60739
8192 0.036 0.034 0.034 101467
16383 0.052 0.052 0.051 68594
16384 0.031 0.031 0.031 112603
32767 0.053 0.050 0.049 70850
32768 0.032 0.029 0.029 119617
65535 0.067 0.067 0.067 52015
65536 0.058 0.058 0.058 60440
131071 0.067 0.066 0.065 53518
131072 0.059 0.058 0.058 60281
262143 0.066 0.065 0.065 54005
262144 0.058 0.058 0.058 60121
524287 0.067 0.067 0.067 52349
524288 0.061 0.061 0.064 54699
1048575 0.068 0.067 0.067 51876
1048576 0.061 0.061 0.061 56775
2097151 0.068 0.068 0.068 51379
2097152 0.062 0.062 0.062 56513
4194303 0.069 0.068 0.069 50580
4194304 0.063 0.064 0.063 55751
8388607 0.120 0.118 0.120 28998
8388608 0.137 0.123 0.117 29936
GCC (Inline REP STOSB) for #c per n where c ≈ 0.273ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 413.000 434.125 441.453 8
1 431.000 436.125 438.953 8
2 223.500 224.438 224.836 16
3 149.000 150.042 623.786 6
4 108.750 109.531 110.559 32
7 62.714 63.196 63.266 55
8 56.375 56.641 56.838 61
15 30.467 30.708 30.761 113
16 24.062 24.023 24.038 145
31 14.548 14.859 14.876 235
32 9.719 9.691 9.730 359
63 7.286 7.312 7.339 476
64 3.609 3.705 3.721 938
127 1.976 2.058 2.067 1689
128 0.414 0.405 0.409 8532
255 0.890 0.907 0.911 3832
256 0.215 0.217 0.218 16039
511 0.476 0.481 0.480 7273
512 0.119 0.119 0.119 29270
1023 0.257 0.260 0.260 13409
1024 0.073 0.073 0.074 47442
2047 0.150 0.150 0.151 23189
2048 0.049 0.050 0.050 69424
4095 0.096 0.097 0.097 36142
4096 0.040 0.040 0.040 87842
8191 0.071 0.071 0.071 49061
8192 0.034 0.033 0.034 104099
16383 0.058 0.059 0.058 59697
16384 0.030 0.031 0.030 114585
32767 0.053 0.053 0.053 66161
32768 0.029 0.029 0.029 120750
65535 0.069 0.069 0.069 50520
65536 0.058 0.058 0.058 60100
131071 0.068 0.067 0.085 40964
131072 0.076 0.072 0.063 55514
262143 0.067 0.093 0.090 38681
262144 0.073 0.062 0.077 45384
524287 0.107 0.093 0.066 52689
524288 0.061 0.060 0.062 56294
1048575 0.066 0.066 0.066 52990
1048576 0.061 0.061 0.061 57248
2097151 0.067 0.075 0.067 51887
2097152 0.061 0.061 0.061 56878
4194303 0.068 0.100 0.069 50623
4194304 0.061 0.061 0.061 57195
8388607 0.117 0.121 0.119 29441
8388608 0.118 0.119 0.162 21587
Musl memset() for #c per n where c ≈ 0.273ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 49.000 35.625 35.172 99
1 33.000 34.625 35.109 99
2 17.500 17.562 18.023 194
3 20.333 14.042 12.411 281
4 11.250 9.219 9.301 375
7 11.857 6.018 5.417 644
8 4.125 4.516 4.592 760
15 4.200 2.692 2.480 1407
16 2.312 2.273 2.310 1511
31 2.097 1.786 1.342 2600
32 1.219 1.238 1.242 2811
63 0.841 0.815 0.686 5085
64 0.641 0.666 0.665 5246
127 1.000 0.718 0.690 5061
128 0.477 0.435 0.413 8451
255 0.459 0.418 0.403 8670
256 0.285 0.233 0.232 15051
511 0.256 0.230 0.228 15285
512 0.158 0.129 0.128 27170
1023 0.134 0.140 0.138 25296
1024 0.089 0.077 0.078 44891
2047 0.094 0.088 0.088 39837
2048 0.060 0.052 0.053 66075
4095 0.071 0.068 0.068 51359
4096 0.045 0.043 0.042 83178
8191 0.059 0.058 0.057 60868
8192 0.037 0.035 0.034 102662
16383 0.052 0.051 0.051 68658
16384 0.032 0.031 0.031 113568
32767 0.050 0.049 0.049 71296
32768 0.030 0.029 0.029 120029
65535 0.067 0.067 0.068 50983
65536 0.059 0.059 0.058 59665
131071 0.067 0.067 0.067 52014
131072 0.059 0.060 0.059 59211
262143 0.067 0.066 0.066 52877
262144 0.059 0.060 0.085 40900
524287 0.067 0.066 0.065 53688
524288 0.059 0.059 0.059 59112
1048575 0.066 0.066 0.066 53181
1048576 0.060 0.060 0.060 58300
2097151 0.066 0.066 0.067 52439
2097152 0.060 0.068 0.060 57924
4194303 0.069 0.067 0.080 43425
4194304 0.062 0.080 0.062 56085
8388607 0.126 0.118 0.133 26207
8388608 0.127 0.119 0.118 29643
Newlib memset() for #c per n where c ≈ 0.273ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 443.000 440.875 440.078 8
1 437.000 437.375 440.453 8
2 226.500 226.438 227.461 15
3 150.333 150.625 151.151 23
4 113.250 113.281 113.770 31
7 66.714 67.232 66.998 52
8 58.375 58.828 58.811 59
15 31.000 30.858 31.264 112
16 31.438 28.523 28.317 123
31 27.839 29.536 50.533 69
32 11.281 10.918 11.068 315
63 12.302 11.907 11.863 294
64 4.703 4.396 4.404 793
127 2.732 2.719 2.712 1287
128 0.852 0.729 0.736 4742
255 1.188 1.178 1.171 2981
256 0.652 0.416 0.381 9171
511 1.474 1.629 1.662 2099
512 0.287 0.264 0.246 14204
1023 0.873 0.934 0.947 3684
1024 0.196 0.179 0.178 19604
2047 0.544 0.545 0.626 5572
2048 0.257 0.257 0.253 13779
4095 0.426 0.427 0.430 8110
4096 0.282 0.296 0.293 11917
8191 0.374 0.370 0.371 9402
8192 0.297 0.310 0.400 8717
16383 0.346 0.345 0.433 8062
16384 0.313 0.312 0.311 11223
32767 0.334 0.332 0.332 10505
32768 0.313 0.313 0.358 9759
65535 0.335 0.327 0.330 10589
65536 0.330 0.312 0.337 10347
131071 0.350 0.339 0.355 9825
131072 0.334 0.329 0.359 9728
262143 0.346 0.352 0.357 9785
262144 0.350 0.375 0.482 7243
524287 0.348 0.346 0.360 9691
524288 0.347 0.346 0.385 9063
1048575 0.358 0.375 0.383 9114
1048576 0.355 0.382 0.388 8987
2097151 0.362 0.368 0.390 8956
2097152 0.363 0.375 0.387 9016
4194303 0.361 0.379 0.385 9073
4194304 0.366 0.376 0.385 9074
8388607 0.363 0.366 0.372 9391
8388608 0.419 0.374 0.370 9428 */
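The small-size paths in memset.S broadcast the fill byte into a machine word by multiplying with 0x0101010101010101 (the .Lb8 constant), then store an overlapping head and tail. A hedged C sketch of the 8..16 byte case, with an illustrative helper name:

    #include <stdint.h>
    #include <string.h>

    /* Fill 8..16 bytes: broadcast c to all eight byte lanes by
       multiplication, then write an overlapping head and tail word,
       as in the .L8 path above. */
    static void fill8to16(void *dst, int c, size_t n) {
      uint64_t w = (uint8_t)c * 0x0101010101010101ull;
      memcpy(dst, &w, 8);
      memcpy((char *)dst + n - 8, &w, 8);
    }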

@ -49,7 +49,8 @@ o/$(MODE)/libc/nexgen32e/tinystrncmp.ncabi.o: \
o/$(MODE)/libc/nexgen32e/errno.o: \
OVERRIDE_CFLAGS += \
$(NO_MAGIC)
$(NO_MAGIC) \
-fno-sanitize=all
LIBC_NEXGEN32E_LIBS = $(foreach x,$(LIBC_NEXGEN32E_ARTIFACTS),$($(x)))
LIBC_NEXGEN32E_SRCS = $(foreach x,$(LIBC_NEXGEN32E_ARTIFACTS),$($(x)_SRCS))

@ -1,6 +1,7 @@
#ifndef COSMOPOLITAN_LIBC_NEXGEN32E_RDTSCP_H_
#define COSMOPOLITAN_LIBC_NEXGEN32E_RDTSCP_H_
#include "libc/bits/bits.h"
#include "libc/nexgen32e/x86feature.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_

libc/nexgen32e/sha.h (new file, 13 lines)

@ -0,0 +1,13 @@
#ifndef COSMOPOLITAN_LIBC_NEXGEN32E_SHA_H_
#define COSMOPOLITAN_LIBC_NEXGEN32E_SHA_H_
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
void sha1_transform_avx2(uint32_t[hasatleast 5], const void *, unsigned);
void sha1_transform_ni(uint32_t[hasatleast 5], const void *, unsigned);
void sha256_transform_rorx(uint32_t[hasatleast 8], const void *, unsigned);
void sha256_transform_ni(uint32_t[hasatleast 8], const void *, unsigned);
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_NEXGEN32E_SHA_H_ */
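These transforms consume whole 64-byte blocks and leave padding and state initialization to the caller (the SHA-NI sources below say so explicitly). A hedged sketch of how a caller might choose between them, assuming the X86_HAVE() feature test from x86feature.h and that the AVX2/BMI2 rorx path is an acceptable fallback on the target CPU:

    #include "libc/nexgen32e/sha.h"
    #include "libc/nexgen32e/x86feature.h"

    /* Illustrative dispatcher, not part of this commit: state[] holds the
       running SHA-256 digest, data points to complete 64-byte blocks, and
       final-block padding must already have been done by the caller. */
    static void sha256_blocks(uint32_t state[8], const void *data,
                              unsigned blocks) {
      if (X86_HAVE(SHA)) {
        sha256_transform_ni(state, data, blocks);
      } else {
        sha256_transform_rorx(state, data, blocks);  /* needs AVX2+BMI2 */
      }
    }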

@ -1,49 +1,36 @@
/*
* BSD LICENSE
*
* Copyright(c) 2014 Intel Corporation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* - Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
*
* This implementation is based on the previous SSSE3 release:
* Visit http://software.intel.com/en-us/articles/
* and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
*
* Updates 20-byte SHA-1 record at start of 'state', from 'input', for
* even number of 'blocks' consecutive 64-byte blocks.
*
* extern "C" void sha1_transform_avx2(
* struct sha1_state *state, const uint8_t *input, int blocks );
*/
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2014 Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, │
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY │
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "libc/macros.internal.h"
.ident "\n\
@ -71,7 +58,6 @@ Copyright 2014 Intel Corporation\n"
#define REG_RTB %rbx
#define REG_T1 %r11d
#define xmm_mov vmovups
#define avx2_zeroupper vzeroupper
#define RND_F1 1
#define RND_F2 2
#define RND_F3 3
@ -84,16 +70,13 @@ Copyright 2014 Intel Corporation\n"
.set E, REG_E
.set TB, REG_TB
.set TA, REG_TA
.set RA, REG_RA
.set RB, REG_RB
.set RC, REG_RC
.set RD, REG_RD
.set RE, REG_RE
.set RTA, REG_RTA
.set RTB, REG_RTB
.set T1, REG_T1
.endm
@ -177,7 +160,6 @@ Copyright 2014 Intel Corporation\n"
PRECALC_RESET_WY
PRECALC_ROTATE_WY
.endif
/* message scheduling pre-compute for rounds 0-15 */
.if ((i & 7) == 0)
/*
@ -194,7 +176,6 @@ Copyright 2014 Intel Corporation\n"
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
.elseif ((i & 7) == 7)
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
@ -236,7 +217,6 @@ Copyright 2014 Intel Corporation\n"
vpxor WY_TMP2, WY_TMP, WY
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
@ -250,7 +230,6 @@ Copyright 2014 Intel Corporation\n"
* allows more efficient vectorization
* since w[i]=>w[i-3] dependency is broken
*/
.if ((i & 7) == 0)
/*
* blended AVX2 and ALU instruction scheduling
@ -272,14 +251,12 @@ Copyright 2014 Intel Corporation\n"
.elseif ((i & 7) == 7)
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
.macro PRECALC r, s
.set i, \r
.if (i < 40)
.set K_XMM, 32*0
.elseif (i < 80)
@ -289,7 +266,6 @@ Copyright 2014 Intel Corporation\n"
.else
.set K_XMM, 32*3
.endif
.if (i<32)
PRECALC_00_15 \s
.elseif (i<64)
@ -307,7 +283,6 @@ Copyright 2014 Intel Corporation\n"
.set B, TB
.set TB, A
.set A, T_REG
.set T_REG, RE
.set RE, RD
.set RD, RC
@ -317,9 +292,8 @@ Copyright 2014 Intel Corporation\n"
.set RA, T_REG
.endm
/* Macro relies on saved ROUND_Fx */
.macro RND_FUN f, r
// Macro relies on saved ROUND_Fx
.macro RND_FUN f, r
.if (\f == RND_F1)
ROUND_F1 \r
.elseif (\f == RND_F2)
@ -332,11 +306,11 @@ Copyright 2014 Intel Corporation\n"
.macro RR r
.set round_id, (\r % 80)
.if (round_id == 0) /* Precalculate F for first round */
.if (round_id == 0) # Precalculate F for first round
.set ROUND_FUNC, RND_F1
mov B, TB
rorx $(32-30), B, B /* b>>>2 */
rorx $(32-30), B, B # b>>>2
andn D, TB, T1
and C, TB
xor T1, TB
@ -362,40 +336,38 @@ Copyright 2014 Intel Corporation\n"
.macro ROUND_F1 r
add WK(\r), E
andn C, A, T1 /* ~b&d */
lea (RE,RTB), E /* Add F from the previous round */
andn C, A, T1 # ~b&d
lea (RE,RTB), E # Add F from the previous round
rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-30),A, TB /* b>>>2 for next round */
rorx $(32-5), A, TA # T2 = A >>> 5
rorx $(32-30),A, TB # b>>>2 for next round
PRECALC (\r) /* msg scheduling for next 2 blocks */
PRECALC (\r) # msg scheduling for next 2 blocks
/*
* Calculate F for the next round
* (b & c) ^ andn[b, d]
*/
and B, A /* b&c */
xor T1, A /* F1 = (b&c) ^ (~b&d) */
// Calculate F for the next round
// (b & c) ^ andn[b, d]
and B, A # b&c
xor T1, A # F1 = (b&c) ^ (~b&d)
lea (RE,RTA), E /* E += A >>> 5 */
lea (RE,RTA), E # E += A >>> 5
.endm
.macro ROUND_F2 r
add WK(\r), E
lea (RE,RTB), E /* Add F from the previous round */
lea (RE,RTB), E # Add F from the previous round
/* Calculate F for the next round */
rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-5), A, TA # T2 = A >>> 5
.if ((round_id) < 79)
rorx $(32-30), A, TB /* b>>>2 for next round */
rorx $(32-30), A, TB # b>>>2 for next round
.endif
PRECALC (\r) /* msg scheduling for next 2 blocks */
PRECALC (\r) # msg scheduling for next 2 blocks
.if ((round_id) < 79)
xor B, A
.endif
add TA, E /* E += A >>> 5 */
add TA, E # E += A >>> 5
.if ((round_id) < 79)
xor C, A
@ -404,30 +376,28 @@ Copyright 2014 Intel Corporation\n"
.macro ROUND_F3 r
add WK(\r), E
PRECALC (\r) /* msg scheduling for next 2 blocks */
PRECALC (\r) # msg scheduling for next 2 blocks
lea (RE,RTB), E /* Add F from the previous round */
lea (RE,RTB), E # Add F from the previous round
mov B, T1
or A, T1
rorx $(32-5), A, TA /* T2 = A >>> 5 */
rorx $(32-30), A, TB /* b>>>2 for next round */
rorx $(32-5), A, TA # T2 = A >>> 5
rorx $(32-30), A, TB # b>>>2 for next round
/* Calculate F for the next round
* (b and c) or (d and (b or c))
*/
// Calculate F for the next round
// (b and c) or (d and (b or c))
and C, T1
and B, A
or T1, A
add TA, E /* E += A >>> 5 */
add TA, E # E += A >>> 5
.endm
/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
* %1 + %2 >= %3 ? %4 : 0
*/
// Add constant only if (%2 > %3) condition met (uses RTA as temp)
// %1 + %2 >= %3 ? %4 : 0
.macro ADD_IF_GE a, b, c, d
mov \a, RTA
add $\d, RTA
@ -435,9 +405,7 @@ Copyright 2014 Intel Corporation\n"
cmovge RTA, \a
.endm
/*
* macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
*/
// Performs 80 rounds of SHA-1 for multiple blocks with s/w pipelining
.macro SHA1_PIPELINED_MAIN_BODY
REGALLOC
@ -451,7 +419,7 @@ Copyright 2014 Intel Corporation\n"
mov %rsp, PRECALC_BUF
lea (2*4*80+32)(%rsp), WK_BUF
# Precalc WK for first 2 blocks
// Precalc WK for first 2 blocks
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
.set i, 0
.rept 160
@ -459,29 +427,27 @@ Copyright 2014 Intel Corporation\n"
.set i, i + 1
.endr
/* Go to next block if needed */
// Go to next block if needed
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
xchg WK_BUF, PRECALC_BUF
.align 32
.L_loop:
/*
* code loops through more than one block
* we use K_BASE value as a signal of a last block,
* it is set below by: cmovae BUFFER_PTR, K_BASE
*/
// code loops through more than one block
// we use K_BASE value as a signal of a last block,
// it is set below by: cmovae BUFFER_PTR, K_BASE
test BLOCKS_CTR, BLOCKS_CTR
jnz .L_begin
.align 32
jmp .L_end
.align 32
.L_begin:
/*
* Do first block
* rounds: 0,2,4,6,8
*/
// process first block
// rounds: 0,2,4,6,8
.set j, 0
.rept 5
RR j
@ -491,28 +457,26 @@ Copyright 2014 Intel Corporation\n"
jmp .L_loop0
.L_loop0:
/*
* rounds:
* 10,12,14,16,18
* 20,22,24,26,28
* 30,32,34,36,38
* 40,42,44,46,48
* 50,52,54,56,58
*/
// rounds
// 10,12,14,16,18
// 20,22,24,26,28
// 30,32,34,36,38
// 40,42,44,46,48
// 50,52,54,56,58
.rept 25
RR j
.set j, j+2
.endr
/* Update Counter */
// Update Counter
sub $1, BLOCKS_CTR
/* Move to the next block only if needed*/
// Move to the next block only if needed
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
/*
* rounds
* 60,62,64,66,68
* 70,72,74,76,78
*/
// rounds
// 60,62,64,66,68
// 70,72,74,76,78
.rept 10
RR j
.set j, j+2
@ -529,12 +493,9 @@ Copyright 2014 Intel Corporation\n"
mov TB, B
/* Process second block */
/*
* rounds
* 0+80, 2+80, 4+80, 6+80, 8+80
* 10+80,12+80,14+80,16+80,18+80
*/
// process second block
// 0+80, 2+80, 4+80, 6+80, 8+80
// 10+80,12+80,14+80,16+80,18+80
.set j, 0
.rept 10
@ -544,11 +505,10 @@ Copyright 2014 Intel Corporation\n"
jmp .L_loop1
.L_loop1:
/*
* rounds
* 20+80,22+80,24+80,26+80,28+80
* 30+80,32+80,34+80,36+80,38+80
*/
// rounds
// 20+80,22+80,24+80,26+80,28+80
// 30+80,32+80,34+80,36+80,38+80
.rept 10
RR j+80
.set j, j+2
@ -557,29 +517,26 @@ Copyright 2014 Intel Corporation\n"
jmp .L_loop2
.L_loop2:
/*
* rounds
* 40+80,42+80,44+80,46+80,48+80
* 50+80,52+80,54+80,56+80,58+80
*/
// rounds
// 40+80,42+80,44+80,46+80,48+80
// 50+80,52+80,54+80,56+80,58+80
.rept 10
RR j+80
.set j, j+2
.endr
/* update counter */
// update counter
sub $1, BLOCKS_CTR
/* Move to the next block only if needed*/
// Move to the next block only if needed
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
jmp .L_loop3
.L_loop3:
/*
* rounds
* 60+80,62+80,64+80,66+80,68+80
* 70+80,72+80,74+80,76+80,78+80
*/
// rounds
// 60+80,62+80,64+80,66+80,68+80
// 70+80,72+80,74+80,76+80,78+80
.rept 10
RR j+80
.set j, j+2
@ -619,14 +576,14 @@ Copyright 2014 Intel Corporation\n"
.align 128
K_XMM_AR:
.long K1, K1, K1, K1
.long K1, K1, K1, K1
.long K2, K2, K2, K2
.long K2, K2, K2, K2
.long K3, K3, K3, K3
.long K3, K3, K3, K3
.long K4, K4, K4, K4
.long K4, K4, K4, K4
.long K1,K1,K1,K1
.long K1,K1,K1,K1
.long K2,K2,K2,K2
.long K2,K2,K2,K2
.long K3,K3,K3,K3
.long K3,K3,K3,K3
.long K4,K4,K4,K4
.long K4,K4,K4,K4
BSWAP_SHUFB_CTL:
.long 0x00010203
@ -639,6 +596,23 @@ BSWAP_SHUFB_CTL:
.long 0x0c0d0e0f
.text
// Performs Intel® AVX2 optimized SHA-1 update.
//
// This implementation is based on the previous SSSE3 release:
// Visit http://software.intel.com/en-us/articles/ and refer
// to improving-the-performance-of-the-secure-hash-algorithm-1/
//
// Updates 20-byte SHA-1 record at start of 'state', from 'input',
// for even number of 'blocks' consecutive 64-byte blocks.
//
// void sha1_transform_avx2(struct sha1_state *state,
// const uint8_t *input,
// int blocks);
//
// @param %rdi points to output digest
// @param %rsi points to input data
// @param %rdx is number of 64-byte blocks to process
// @see X86_HAVE(SHA)
sha1_transform_avx2:
push %rbp
mov %rsp,%rbp
@ -648,33 +622,23 @@ sha1_transform_avx2:
push %r13
push %r14
push %r15
RESERVE_STACK = (W_SIZE*4 + 8+24)
/* Align stack */
mov %rsp, %rbx
and $~(0x20-1), %rsp
mov %rsp,%rbx
and $~(0x20-1),%rsp
push %rbx
sub $RESERVE_STACK, %rsp
avx2_zeroupper
sub $RESERVE_STACK,%rsp
vzeroupper
/* Setup initial values */
mov CTX, HASH_PTR
mov BUF, BUFFER_PTR
mov BUF, BUFFER_PTR2
mov CNT, BLOCKS_CTR
xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
mov CTX,HASH_PTR
mov BUF,BUFFER_PTR
mov BUF,BUFFER_PTR2
mov CNT,BLOCKS_CTR
xmm_mov BSWAP_SHUFB_CTL(%rip),YMM_SHUFB_BSWAP
SHA1_PIPELINED_MAIN_BODY
avx2_zeroupper
add $RESERVE_STACK, %rsp
vzeroupper
add $RESERVE_STACK,%rsp
pop %rsp
pop %r15
pop %r14
pop %r13

libc/nexgen32e/sha1ni.S (new file, 286 lines)

@ -0,0 +1,286 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2015 Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, │
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY │
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "libc/macros.internal.h"
.text
.align 32
.ident "\n\
Intel SHA-NI (BSD-3 License)\n\
Copyright 2015 Intel Corporation\n\
Sean Gulley <sean.m.gulley@intel.com>\n\
Tim Chen <tim.c.chen@linux.intel.com>\n"
.include "libc/disclaimer.inc"
#define FRAME_SIZE 32
#define DIGEST_PTR %rdi
#define DATA_PTR %rsi
#define NUM_BLKS %rdx
#define ABCD %xmm0
#define E0 %xmm1 /* Need two E's b/c they ping pong */
#define E1 %xmm2
#define MSG0 %xmm3
#define MSG1 %xmm4
#define MSG2 %xmm5
#define MSG3 %xmm6
#define SHUF_MASK %xmm7
// Performs Intel® SHA-NI optimized SHA-1 update.
//
// The function takes a pointer to the current hash values, a
// pointer to the input data, and a number of 64 byte blocks to
// process. Once all blocks have been processed, the digest pointer
// is updated with the resulting hash value. The function only
// processes complete blocks, there is no functionality to store
// partial blocks. All message padding and hash value
// initialization must be done outside the update function.
//
// The indented lines in the loop are instructions related to
// rounds processing. The non-indented lines are instructions
// related to the message schedule.
//
// void sha1_transform_ni(uint32_t digest[static 5],
// const void *data,
// uint32_t numBlocks);
//
// @param %rdi points to output digest
// @param %rsi points to input data
// @param %rdx is number of 64-byte blocks to process
// @see X86_HAVE(SHA)
sha1_transform_ni:
push %rbp
mov %rsp,%rbp
.profilable
sub $FRAME_SIZE,%rsp
shl $6,NUM_BLKS # convert to bytes
jz .Ldone_hash
add DATA_PTR,NUM_BLKS # pointer to end of data
// load initial hash values
movdqa UPPER_WORD_MASK(%rip),E1
pinsrd $3,1*16(DIGEST_PTR),E0
movdqu 0*16(DIGEST_PTR),ABCD
pand E1,E0
pshufd $0x1B,ABCD,ABCD
movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip),SHUF_MASK
.Lloop0:
// Save hash values for addition after rounds
movdqa E0,(0*16)(%rsp)
movdqa ABCD,(1*16)(%rsp)
// Rounds 0-3
movdqu 0*16(DATA_PTR),MSG0
pshufb SHUF_MASK,MSG0
paddd MSG0,E0
movdqa ABCD,E1
sha1rnds4 $0,E0,ABCD
// Rounds 4-7
movdqu 1*16(DATA_PTR),MSG1
pshufb SHUF_MASK,MSG1
sha1nexte MSG1,E1
movdqa ABCD,E0
sha1rnds4 $0,E1,ABCD
sha1msg1 MSG1,MSG0
// Rounds 8-11
movdqu 2*16(DATA_PTR),MSG2
pshufb SHUF_MASK,MSG2
sha1nexte MSG2,E0
movdqa ABCD,E1
sha1rnds4 $0,E0,ABCD
sha1msg1 MSG2,MSG1
pxor MSG2,MSG0
// Rounds 12-15
movdqu 3*16(DATA_PTR),MSG3
pshufb SHUF_MASK,MSG3
sha1nexte MSG3,E1
movdqa ABCD,E0
sha1msg2 MSG3,MSG0
sha1rnds4 $0,E1,ABCD
sha1msg1 MSG3,MSG2
pxor MSG3,MSG1
// Rounds 16-19
sha1nexte MSG0,E0
movdqa ABCD,E1
sha1msg2 MSG0,MSG1
sha1rnds4 $0,E0,ABCD
sha1msg1 MSG0,MSG3
pxor MSG0,MSG2
// Rounds 20-23
sha1nexte MSG1,E1
movdqa ABCD,E0
sha1msg2 MSG1,MSG2
sha1rnds4 $1,E1,ABCD
sha1msg1 MSG1,MSG0
pxor MSG1,MSG3
// Rounds 24-27
sha1nexte MSG2,E0
movdqa ABCD,E1
sha1msg2 MSG2,MSG3
sha1rnds4 $1,E0,ABCD
sha1msg1 MSG2,MSG1
pxor MSG2,MSG0
// Rounds 28-31
sha1nexte MSG3,E1
movdqa ABCD,E0
sha1msg2 MSG3,MSG0
sha1rnds4 $1,E1,ABCD
sha1msg1 MSG3,MSG2
pxor MSG3,MSG1
// Rounds 32-35
sha1nexte MSG0,E0
movdqa ABCD,E1
sha1msg2 MSG0,MSG1
sha1rnds4 $1,E0,ABCD
sha1msg1 MSG0,MSG3
pxor MSG0,MSG2
// Rounds 36-39
sha1nexte MSG1,E1
movdqa ABCD,E0
sha1msg2 MSG1,MSG2
sha1rnds4 $1,E1,ABCD
sha1msg1 MSG1,MSG0
pxor MSG1,MSG3
// Rounds 40-43
sha1nexte MSG2,E0
movdqa ABCD,E1
sha1msg2 MSG2,MSG3
sha1rnds4 $2,E0,ABCD
sha1msg1 MSG2,MSG1
pxor MSG2,MSG0
// Rounds 44-47
sha1nexte MSG3,E1
movdqa ABCD,E0
sha1msg2 MSG3,MSG0
sha1rnds4 $2,E1,ABCD
sha1msg1 MSG3,MSG2
pxor MSG3,MSG1
// Rounds 48-51
sha1nexte MSG0,E0
movdqa ABCD,E1
sha1msg2 MSG0,MSG1
sha1rnds4 $2,E0,ABCD
sha1msg1 MSG0,MSG3
pxor MSG0,MSG2
// Rounds 52-55
sha1nexte MSG1,E1
movdqa ABCD,E0
sha1msg2 MSG1,MSG2
sha1rnds4 $2,E1,ABCD
sha1msg1 MSG1,MSG0
pxor MSG1,MSG3
// Rounds 56-59
sha1nexte MSG2,E0
movdqa ABCD,E1
sha1msg2 MSG2,MSG3
sha1rnds4 $2,E0,ABCD
sha1msg1 MSG2,MSG1
pxor MSG2,MSG0
// Rounds 60-63
sha1nexte MSG3,E1
movdqa ABCD,E0
sha1msg2 MSG3,MSG0
sha1rnds4 $3,E1,ABCD
sha1msg1 MSG3,MSG2
pxor MSG3,MSG1
// Rounds 64-67
sha1nexte MSG0,E0
movdqa ABCD,E1
sha1msg2 MSG0,MSG1
sha1rnds4 $3,E0,ABCD
sha1msg1 MSG0,MSG3
pxor MSG0,MSG2
// Rounds 68-71
sha1nexte MSG1,E1
movdqa ABCD,E0
sha1msg2 MSG1,MSG2
sha1rnds4 $3,E1,ABCD
pxor MSG1,MSG3
// Rounds 72-75
sha1nexte MSG2,E0
movdqa ABCD,E1
sha1msg2 MSG2,MSG3
sha1rnds4 $3,E0,ABCD
// Rounds 76-79
sha1nexte MSG3,E1
movdqa ABCD,E0
sha1rnds4 $3,E1,ABCD
// Add current hash values with previously saved
sha1nexte (0*16)(%rsp),E0
paddd (1*16)(%rsp),ABCD
// Increment data pointer and loop if more to process
add $64,DATA_PTR
cmp NUM_BLKS,DATA_PTR
jne .Lloop0
// Write hash values back in the correct order
pshufd $0x1B,ABCD,ABCD
movdqu ABCD,0*16(DIGEST_PTR)
pextrd $3,E0,1*16(DIGEST_PTR)
.Ldone_hash:
leave
ret
.endfn sha1_transform_ni,globl
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x000102030405060708090a0b0c0d0e0f
.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16
.align 16
UPPER_WORD_MASK:
.octa 0xFFFFFFFF000000000000000000000000
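Translating the contract documented above into C: the caller seeds the five-word state with the standard SHA-1 constants, feeds only complete 64-byte blocks, and performs the final padding itself. A hedged single-shot sketch (illustrative name, not code from this commit):

    #include "libc/nexgen32e/sha.h"

    /* FIPS 180-4 SHA-1 initial state; data must be `blocks` complete,
       already-padded 64-byte blocks. */
    static void sha1_oneshot_ni(uint32_t state[5], const void *data,
                                unsigned blocks) {
      state[0] = 0x67452301;
      state[1] = 0xEFCDAB89;
      state[2] = 0x98BADCFE;
      state[3] = 0x10325476;
      state[4] = 0xC3D2E1F0;
      sha1_transform_ni(state, data, blocks);
    }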

@ -50,7 +50,7 @@
#include "libc/macros.internal.h"
.ident "\n\
AVX2 SHA-256 (BSD-2 License)\n\
AVX2 SHA2 (BSD-2 License)\n\
Copyright 2013 Intel Corporation\n"
.include "libc/disclaimer.inc"
@ -598,19 +598,19 @@ sha256_transform_rorx:
.align 16
.Loop1:
vpaddd K256+0*32(SRND), X0, XFER
vpaddd kSha256x2+0*32(SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 0*32
vpaddd K256+1*32(SRND), X0, XFER
vpaddd kSha256x2+1*32(SRND), X0, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 1*32
vpaddd K256+2*32(SRND), X0, XFER
vpaddd kSha256x2+2*32(SRND), X0, XFER
vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 2*32
vpaddd K256+3*32(SRND), X0, XFER
vpaddd kSha256x2+3*32(SRND), X0, XFER
vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 3*32
@ -620,11 +620,11 @@ sha256_transform_rorx:
.Loop2:
## Do last 16 rounds with no scheduling
vpaddd K256+0*32(SRND), X0, XFER
vpaddd kSha256x2+0*32(SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 0*32
vpaddd K256+1*32(SRND), X1, XFER
vpaddd kSha256x2+1*32(SRND), X1, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 1*32
add $2*32, SRND
@ -712,7 +712,6 @@ sha256_transform_rorx:
.Ldone_hash:
mov _RSP(%rsp), %rsp
popq %r15
popq %r14
popq %r13
@ -722,52 +721,38 @@ sha256_transform_rorx:
ret
.endfn sha256_transform_rorx,globl
.section .rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.rodata.cst32
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
.octa 0x0c0d0e0f08090a0b0405060700010203
.octa 0x0c0d0e0f08090a0b0405060700010203
# shuffle xBxA -> 00BA
.rodata.cst32
_SHUF_00BA:
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
# shuffle xDxC -> DC00
.rodata.cst32
_SHUF_DC00:
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
.bss
.align 64
kSha256x2:
.zero 512
.endobj kSha256x2,globl
.previous
.init.start 201,_init_kSha256x2
push $64
pop %rcx
ezlea kSha256,dx
ezlea kSha256x2,ax
0: movaps -16(%rdx,%rcx,4),%xmm0
movaps %xmm0,-16(%rax,%rcx,8)
movaps %xmm0,-32(%rax,%rcx,8)
sub $4,%ecx
jnz 0b
.init.end 201,_init_kSha256x2

libc/nexgen32e/sha256ni.S (new file, 318 lines)

@ -0,0 +1,318 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2015 Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "libc/macros.internal.h"
.text
.align 32
.ident "\n\
Intel SHA-NI (BSD-3 License)\n\
Copyright 2015 Intel Corporation\n\
Sean Gulley <sean.m.gulley@intel.com>\n\
Tim Chen <tim.c.chen@linux.intel.com>\n"
.include "libc/disclaimer.inc"
#define DIGEST_PTR %rdi /* 1st arg */
#define DATA_PTR %rsi /* 2nd arg */
#define NUM_BLKS %rdx /* 3rd arg */
#define SHA256CONSTANTS %rax
#define MSG %xmm0
#define STATE0 %xmm1
#define STATE1 %xmm2
#define MSGTMP0 %xmm3
#define MSGTMP1 %xmm4
#define MSGTMP2 %xmm5
#define MSGTMP3 %xmm6
#define MSGTMP4 %xmm7
#define SHUF_MASK %xmm8
#define ABEF_SAVE %xmm9
#define CDGH_SAVE %xmm10
// Performs Intel® SHA-NI optimized SHA-256 update.
//
// The function takes a pointer to the current hash values, a
// pointer to the input data, and a number of 64 byte blocks to
// process. Once all blocks have been processed, the digest pointer
// is updated with the resulting hash value. The function only
// processes complete blocks; there is no functionality to store
// partial blocks. All message padding and hash value
// initialization must be done outside the update function.
//
// The indented lines in the loop are instructions related to
// rounds processing. The non-indented lines are instructions
// related to the message schedule.
//
// void sha256_transform_ni(uint32_t digest[static 8],
// const void *data,
// int32_t numBlocks);
//
// @param %rdi points to output digest
// @param %rsi points to input data
// @param %rdx is number of blocks to process
// @see X86_HAVE(SHA)
sha256_transform_ni:
.leafprologue
.profilable
shl $6,NUM_BLKS # convert to bytes
jz .Ldone_hash
add DATA_PTR,NUM_BLKS # pointer to end of data
// Load initial hash values
// Need to reorder these appropriately
// DCBA, HGFE -> ABEF, CDGH
movdqu 0*16(DIGEST_PTR),STATE0
movdqu 1*16(DIGEST_PTR),STATE1
pshufd $0xB1,STATE0,STATE0 # CDAB
pshufd $0x1B,STATE1,STATE1 # EFGH
movdqa STATE0,MSGTMP4
palignr $8,STATE1,STATE0 # ABEF
pblendw $0xF0,MSGTMP4,STATE1 # CDGH
movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip),SHUF_MASK
lea kSha256(%rip),SHA256CONSTANTS
.Lloop0:
// Save hash values for addition after rounds
movdqa STATE0,ABEF_SAVE
movdqa STATE1,CDGH_SAVE
// Rounds 0-3
movdqu 0*16(DATA_PTR),MSG
pshufb SHUF_MASK,MSG
movdqa MSG,MSGTMP0
paddd 0*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
// Rounds 4-7
movdqu 1*16(DATA_PTR),MSG
pshufb SHUF_MASK,MSG
movdqa MSG,MSGTMP1
paddd 1*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP1,MSGTMP0
// Rounds 8-11
movdqu 2*16(DATA_PTR),MSG
pshufb SHUF_MASK,MSG
movdqa MSG,MSGTMP2
paddd 2*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP2,MSGTMP1
// Rounds 12-15
movdqu 3*16(DATA_PTR),MSG
pshufb SHUF_MASK,MSG
movdqa MSG,MSGTMP3
paddd 3*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP3,MSGTMP4
palignr $4,MSGTMP2,MSGTMP4
paddd MSGTMP4,MSGTMP0
sha256msg2 MSGTMP3,MSGTMP0
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP3,MSGTMP2
// Rounds 16-19
movdqa MSGTMP0,MSG
paddd 4*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP0,MSGTMP4
palignr $4,MSGTMP3,MSGTMP4
paddd MSGTMP4,MSGTMP1
sha256msg2 MSGTMP0,MSGTMP1
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP0,MSGTMP3
// Rounds 20-23
movdqa MSGTMP1,MSG
paddd 5*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP1,MSGTMP4
palignr $4,MSGTMP0,MSGTMP4
paddd MSGTMP4,MSGTMP2
sha256msg2 MSGTMP1,MSGTMP2
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP1,MSGTMP0
// Rounds 24-27
movdqa MSGTMP2,MSG
paddd 6*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP2,MSGTMP4
palignr $4,MSGTMP1,MSGTMP4
paddd MSGTMP4,MSGTMP3
sha256msg2 MSGTMP2,MSGTMP3
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP2,MSGTMP1
// Rounds 28-31
movdqa MSGTMP3,MSG
paddd 7*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP3,MSGTMP4
palignr $4,MSGTMP2,MSGTMP4
paddd MSGTMP4,MSGTMP0
sha256msg2 MSGTMP3,MSGTMP0
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP3,MSGTMP2
// Rounds 32-35
movdqa MSGTMP0,MSG
paddd 8*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP0,MSGTMP4
palignr $4,MSGTMP3,MSGTMP4
paddd MSGTMP4,MSGTMP1
sha256msg2 MSGTMP0,MSGTMP1
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP0,MSGTMP3
// Rounds 36-39
movdqa MSGTMP1,MSG
paddd 9*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP1,MSGTMP4
palignr $4,MSGTMP0,MSGTMP4
paddd MSGTMP4,MSGTMP2
sha256msg2 MSGTMP1,MSGTMP2
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP1,MSGTMP0
// Rounds 40-43
movdqa MSGTMP2,MSG
paddd 10*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP2,MSGTMP4
palignr $4,MSGTMP1,MSGTMP4
paddd MSGTMP4,MSGTMP3
sha256msg2 MSGTMP2,MSGTMP3
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP2,MSGTMP1
// Rounds 44-47
movdqa MSGTMP3,MSG
paddd 11*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP3,MSGTMP4
palignr $4,MSGTMP2,MSGTMP4
paddd MSGTMP4,MSGTMP0
sha256msg2 MSGTMP3,MSGTMP0
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP3,MSGTMP2
// Rounds 48-51
movdqa MSGTMP0,MSG
paddd 12*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP0,MSGTMP4
palignr $4,MSGTMP3,MSGTMP4
paddd MSGTMP4,MSGTMP1
sha256msg2 MSGTMP0,MSGTMP1
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
sha256msg1 MSGTMP0,MSGTMP3
// Rounds 52-55
movdqa MSGTMP1,MSG
paddd 13*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP1,MSGTMP4
palignr $4,MSGTMP0,MSGTMP4
paddd MSGTMP4,MSGTMP2
sha256msg2 MSGTMP1,MSGTMP2
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
// Rounds 56-59
movdqa MSGTMP2,MSG
paddd 14*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
movdqa MSGTMP2,MSGTMP4
palignr $4,MSGTMP1,MSGTMP4
paddd MSGTMP4,MSGTMP3
sha256msg2 MSGTMP2,MSGTMP3
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
// Rounds 60-63
movdqa MSGTMP3,MSG
paddd 15*16(SHA256CONSTANTS),MSG
sha256rnds2 STATE0,STATE1
pshufd $0x0E,MSG,MSG
sha256rnds2 STATE1,STATE0
// Add current hash values with previously saved
paddd ABEF_SAVE,STATE0
paddd CDGH_SAVE,STATE1
// Increment data pointer and loop if more to process
add $64,DATA_PTR
cmp NUM_BLKS,DATA_PTR
jne .Lloop0
// Write hash values back in the correct order
pshufd $0x1B,STATE0,STATE0 # FEBA
pshufd $0xB1,STATE1,STATE1 # DCHG
movdqa STATE0,MSGTMP4
pblendw $0xF0,STATE1,STATE0 # DCBA
palignr $8,MSGTMP4,STATE1 # HGFE
movdqu STATE0,0*16(DIGEST_PTR)
movdqu STATE1,1*16(DIGEST_PTR)
.Ldone_hash:
.leafepilogue
.endfn sha256_transform_ni,globl
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK,"aM",@progbits,16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203
.endobj PSHUFFLE_BYTE_FLIP_MASK
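Because sha256_transform_ni() only consumes whole 64-byte blocks, the caller supplies the initial hash values and does the padding itself, then serializes the eight state words big-endian to obtain the usual digest bytes. A hedged usage sketch for a short message that fits in one padded block; Sha256Tiny is a hypothetical helper, not part of the library, and the call must be guarded by the X86_HAVE(SHA) check noted above:

```c
#include <stdint.h>
#include <string.h>

/* provided by sha256ni.S; only valid when the CPU supports SHA-NI */
void sha256_transform_ni(uint32_t digest[8], const void *data,
                         int32_t numBlocks);

/* standard SHA-256 initial hash values */
static const uint32_t kSha256Iv[8] = {
    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
};

/* hypothetical one-shot helper for n <= 55 bytes, so the message,
   the 0x80 terminator, and the 64-bit bit count fit in one block */
static void Sha256Tiny(const void *msg, size_t n, uint32_t digest[8]) {
  uint8_t block[64] = {0};
  memcpy(block, msg, n);
  block[n] = 0x80;                 /* append the single 1 bit          */
  uint64_t bits = (uint64_t)n * 8; /* big-endian bit count at the end  */
  for (int i = 0; i < 8; ++i) block[63 - i] = bits >> (i * 8);
  memcpy(digest, kSha256Iv, sizeof(kSha256Iv));
  sha256_transform_ni(digest, block, 1);
}
```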

View file

@@ -51,7 +51,7 @@
#include "libc/macros.internal.h"
.ident "\n\
AVX2 SHA-512 (BSD-2 License)\n\
AVX2 SHA2 (BSD-2 License)\n\
Copyright 2013 Intel Corporation\n"
.include "libc/disclaimer.inc"

View file

@@ -1,74 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
// Returns prefix length, consisting of chars not in reject.
//
// @param rdi is string
// @param rsi is reject nul-terminated character set
// @return rax is index of first byte in charset
// @see strspn(), strtok_r()
// @asyncsignalsafe
strcspn:
push %rbp
mov %rsp,%rbp
.profilable
sub $16,%rsp
push %rdi
mov %rsi,%rdi
call strlen
pop %rdi
cmp $15,%rax
ja 4f
push %rdi
mov %rax,%rdx
pxor %xmm0,%xmm0
lea -16(%rbp),%rdi
movdqa %xmm0,(%rdi)
call MemCpy
movdqa (%rdi),%xmm1
pop %rdi
or $-1,%rax
0: inc %rax
movzbl (%rdi,%rax),%ecx
movd %ecx,%xmm0
punpcklbw %xmm0,%xmm0
punpcklwd %xmm0,%xmm0
pshufd $0,%xmm0,%xmm0
pcmpeqb %xmm1,%xmm0
pmovmskb %xmm0,%ecx
test %ecx,%ecx
jz 0b
9: leave
ret
1: cmp %ch,%cl
je 9b
inc %edx
2: mov (%rsi,%rdx),%ch
test %ch,%ch
jne 1b
inc %rax
3: mov (%rdi,%rax),%cl
test %cl,%cl
je 9b
xor %edx,%edx
jmp 2b
4: xor %eax,%eax
jmp 3b
.endfn strcspn,globl
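For reference, here is what the deleted listing computes, as a plain C sketch (strcspn_sketch is an illustrative name, not the code that replaces it in this commit): the length of the initial segment of s that contains no byte from reject.

```c
#include <stddef.h>

/* portable sketch of the strcspn() contract */
size_t strcspn_sketch(const char *s, const char *reject) {
  size_t i;
  for (i = 0; s[i]; ++i) {
    for (const char *r = reject; *r; ++r) {
      if (s[i] == *r) return i; /* s[i] is the first byte in the set */
    }
  }
  return i; /* no byte of s is in reject */
}
```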

View file

@@ -1,51 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
// Returns length of NUL-terminated string.
//
// @param rdi is non-null NUL-terminated string pointer
// @return rax is number of bytes (excluding NUL)
// @clob ax,dx,cx,xmm3,xmm4
// @note h/t agner fog
// @asyncsignalsafe
strlen: .leafprologue
.profilable
mov %rdi,%rax
mov %edi,%ecx
and $15,%ecx
and $-16,%rax
pxor %xmm4,%xmm4
movdqa (%rax),%xmm3
pcmpeqb %xmm4,%xmm3
pmovmskb %xmm3,%edx
shr %cl,%edx
shl %cl,%edx
bsf %edx,%edx
jnz 2f
1: lea 16(%rax),%rax
movdqa (%rax),%xmm3
pcmpeqb %xmm4,%xmm3
pmovmskb %xmm3,%edx
bsf %edx,%edx
jz 1b
2: add %rdx,%rax
sub %rdi,%rax
.leafepilogue
.endfn strlen,globl
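The deleted listing finds the NUL sixteen bytes at a time: it aligns the pointer down to 16, compares an aligned load against zero with pcmpeqb, extracts a bit mask with pmovmskb, and shifts away the bits for bytes that precede the string so the first (possibly overlapping) load cannot give a false hit. A sketch of the same idea with SSE2 intrinsics; strlen_sse2_sketch is an illustrative stand-in, not cosmopolitan's strlen():

```c
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

size_t strlen_sse2_sketch(const char *s) {
  const __m128i zero = _mm_setzero_si128();
  uintptr_t start = (uintptr_t)s;
  uintptr_t p = start & ~(uintptr_t)15;   /* align down to 16 bytes    */
  unsigned skip = start & 15;             /* bytes before the string   */
  unsigned mask =
      _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i *)p, zero));
  mask = (mask >> skip) << skip;          /* drop false NULs before s  */
  while (!mask) {                         /* scan 16 bytes per step    */
    p += 16;
    mask = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i *)p, zero));
  }
  return p + __builtin_ctz(mask) - start; /* offset of the first NUL   */
}
```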