mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-01-31 19:43:32 +00:00
e75ffde09e
You can now build Cosmopolitan with Clang: make -j8 MODE=llvm o/llvm/examples/hello.com The assembler and linker code is now friendly to LLVM too. So it's not needed to configure Clang to use binutils under the hood. If you love LLVM then you can now use pure LLVM.
406 lines
20 KiB
ArmAsm
406 lines
20 KiB
ArmAsm
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||
│vi: set et ft=asm ts=8 sw=8 fenc=utf-8 :vi│
|
||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||
│ │
|
||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||
│ any purpose with or without fee is hereby granted, provided that the │
|
||
│ above copyright notice and this permission notice appear in all copies. │
|
||
│ │
|
||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||
@fileoverview Cosmopolitan Memory Setter
|
||
|
||
This sets one bit per picosecond on a $900 Skylake workstation,
|
||
which is about 110 GBps. */
|
||
#include "libc/nexgen32e/x86feature.h"
|
||
#include "libc/nexgen32e/macros.h"
|
||
#include "libc/macros.h"
|
||
|
||
//	Sets memory.
//
//	This entry point only records the return value and then slides
//	straight into MemSet, which is laid out directly after it in
//	this file and performs the actual stores.
//
//	@param	rdi is dest
//	@param	esi is the byte to set (the low 8 bits, i.e. sil, are used)
//	@param	rdx is the number of bytes to set
//	@return	original rdi copied to rax
//	@mode	long
//	@asyncsignalsafe
memset:	mov	%rdi,%rax		// rax = dest, the value handed back to the caller
//	𝑠𝑙𝑖𝑑𝑒
//	There is deliberately no jmp or ret here: execution runs across
//	the alignment padding below and continues at MemSet.
	.align	16
	.endfn	memset,globl
|
||
|
||
//	Sets memory w/ minimal-impact ABI.
//
//	Dispatches through the memsettab jump table using the index
//	min(rdx, .Lmemsettab.ro.size). Byte counts below that size each
//	get their own slot, so small fills reach a handler specialized
//	for their size class; every larger count shares the final slot.
//
//	Size classes, as laid out in memsettab.ro:
//
//	    n = 0        .L0    nothing to store
//	    n = 1..3     .L1/.L2/.L3 single byte stores
//	    n = 4..7     .L4    two overlapping 4-byte stores
//	    n = 8..15    .L8    two overlapping 8-byte stores
//	    n = 16..31   .L16   16-byte vector stores
//	    n beyond     .L16, .L16r, .L32 or .L32r (chosen at startup)
//
//	@param	rdi is dest
//	@param	esi is the byte to set (the low 8 bits, i.e. sil, are used)
//	@param	rdx is the number of bytes to set
//	@clob	flags,rcx,xmm3
//	@mode	long
MemSet:	.leafprologue
	.profilable
	mov	$.Lmemsettab.ro.size,%ecx	// rcx = index of the shared last slot
	cmp	%rcx,%rdx
	cmovb	%rdx,%rcx			// unsigned: rcx = min(n, last slot index)
	jmp	*memsettab(,%rcx,8)		// each slot is an 8-byte code address

//	Handler offsets in memsettab.ro are measured from this label.
.Lanchorpoint:

//	AVX2 + ERMS entry: counts of 1024 bytes or more use rep stosb,
//	anything smaller falls through into the AVX2 loop.
.L32r:	cmp	$1024,%rdx
	jae	.Lerms

//	AVX2 path. Needs n of at least 32 because the closing store
//	writes the 32 bytes that end at dest+n.
.L32:	vmovd	%esi,%xmm3
	vpbroadcastb %xmm3,%ymm3		// low byte of xmm3 copied to all 32 lanes
	mov	$32,%ecx
1:	lea	32(%rcx),%rcx			// rcx = end of the next chunk plus 32
	vmovdqu	%ymm3,-64(%rdi,%rcx)		// fill bytes [rcx-64, rcx-32)
	cmp	%rcx,%rdx
	ja	1b				// keep going while n > rcx
	vmovdqu	%ymm3,-32(%rdi,%rdx)		// final 32 bytes; may overlap the loop stores
	vpxor	%ymm3,%ymm3,%ymm3		// hand ymm3 back zeroed
	jmp	.L0

//	SSE2 + ERMS entry: same 1024 byte cutoff as .L32r.
.L16r:	cmp	$1024,%rdx
	jae	.Lerms

//	SSE2 path. Needs n of at least 16 because the closing store
//	writes the 16 bytes that end at dest+n.
.L16:	movd	%esi,%xmm3
//	NOTE(review): pbroadcastb is a project macro defined outside this
//	file; presumably it replicates the low byte across xmm3. Confirm
//	against libc/nexgen32e/macros.h.
	pbroadcastb %xmm3
	mov	$16,%ecx
1:	lea	16(%rcx),%rcx			// rcx = end of the next chunk plus 16
	movdqu	%xmm3,-32(%rdi,%rcx)		// fill bytes [rcx-32, rcx-16)
	cmp	%rcx,%rdx
	ja	1b				// keep going while n > rcx
	movdqu	%xmm3,-16(%rdi,%rdx)		// final 16 bytes; may overlap the loop stores
	pxor	%xmm3,%xmm3			// hand xmm3 back zeroed

//	Common exit for every handler; also the handler for n = 0.
.L0:	.leafepilogue

//	n = 8..15: one store at the start, one ending at dest+n.
.L8:	movzbl	%sil,%ecx
	imul	.Lb8(%rip),%rcx			// byte * 0x0101010101010101 = byte in all 8 lanes
	mov	%rcx,(%rdi)
	mov	%rcx,-8(%rdi,%rdx)
	jmp	.L0

//	n = 4..7: same trick with 4-byte stores.
.L4:	movzbl	%sil,%ecx
	imul	$0x01010101,%ecx,%ecx		// byte in all 4 lanes
	mov	%ecx,(%rdi)
	mov	%ecx,-4(%rdi,%rdx)
	jmp	.L0

//	n = 3, 2, 1: enter at the matching label and fall downward.
.L3:	mov	%sil,2(%rdi)
.L2:	mov	%sil,1(%rdi)
.L1:	mov	%sil,(%rdi)
	jmp	.L0

//	Large fills on ERMS hardware. rep stosb stores al at (rdi),
//	advancing rdi and counting rcx down to zero, so rdi is saved and
//	restored, and so is rax since al is needed for the fill byte.
//	That keeps the clobber list limited to what the header promises.
//	NOTE(review): relies on the direction flag being clear on entry,
//	which the System V ABI requires at function boundaries.
.Lerms:	push	%rax
	push	%rdi
	mov	%esi,%eax			// al = fill byte
	mov	%rdx,%rcx			// rcx = byte count
	rep stosb
	pop	%rdi
	pop	%rax
	jmp	.L0
	.endfn	MemSet,globl,hidden
	.source	__FILE__
|
||
|
||
//	Multiplier with 0x01 in each of its eight bytes. Multiplying a
//	zero-extended byte by it yields that byte repeated across a
//	whole quadword; the .L8 handler in MemSet uses it that way.
	.rodata.cst8
.Lb8:	.quad	0x0101010101010101
	.previous
|
||
|
||
//	Read-only seed data for the MemSet jump table.
//
//	Every entry is the distance from .Lanchorpoint to a handler,
//	stored in a single byte, so each handler has to sit within 255
//	bytes of the anchor. Entry i of the sized part serves a byte
//	count of exactly i.
	.initro 300,_init_memset
memsettab.ro:
	.byte	.L0 - .Lanchorpoint	// n = 0
	.byte	.L1 - .Lanchorpoint	// n = 1
	.byte	.L2 - .Lanchorpoint	// n = 2
	.byte	.L3 - .Lanchorpoint	// n = 3
	.rept	4			// n = 4..7
	.byte	.L4 - .Lanchorpoint
	.endr
	.rept	8			// n = 8..15
	.byte	.L8 - .Lanchorpoint
	.endr
	.rept	16			// n = 16..31
	.byte	.L16 - .Lanchorpoint
	.endr
	.equ	.Lmemsettab.ro.size,.-memsettab.ro
	.endobj	memsettab.ro
//	Build fails unless the sized part is a multiple of eight bytes.
	.if	.Lmemsettab.ro.size % 8
	.error	"moar jmptab"
	.endif
//	Candidate handlers for counts at or beyond the sized part, one
//	per CPU feature level. NOTE(review): which one ends up in the
//	last memsettab slot is presumably decided by memjmpinit, which
//	is defined outside this file.
	.byte	.L16 - .Lanchorpoint	# SSE2
	.byte	.L16r - .Lanchorpoint	# SSE2 + ERMS
	.byte	.L32 - .Lanchorpoint	# AVX2
	.byte	.L32r - .Lanchorpoint	# AVX2 + ERMS
//	Pads the four candidates out to eight bytes.
	.byte	0,0,0,0
	.previous
|
||
|
||
//	Jump table that MemSet indexes at runtime. It holds one 8-byte
//	code address for each byte count below .Lmemsettab.ro.size,
//	followed by one extra slot shared by all larger counts. The
//	storage is only reserved here. NOTE(review): presumably it is
//	filled in at startup by the _init_memset code further down.
	.initbss 300,_init_memset
memsettab:
	.rept	.Lmemsettab.ro.size
	.quad	0
	.endr
	.quad	0
	.endobj	memsettab
	.previous
|
||
|
||
//	Startup fragment that prepares the MemSet jump table.
//
//	It passes the table size, the address of .Lanchorpoint and the
//	outcome of an AVX2 feature test to memjmpinit.
//	NOTE(review): pushpop, ezlea, X86_HAVE and memjmpinit are all
//	defined outside this file. Presumably pushpop loads the constant
//	into rcx, ezlea loads the label address into rdx, the testb
//	leaves the AVX2 answer in the flags, and memjmpinit expands the
//	byte offsets of memsettab.ro into the pointers of memsettab.
//	Confirm against their definitions before relying on this.
	.init.start 300,_init_memset
	pushpop	.Lmemsettab.ro.size,%rcx
	ezlea	.Lanchorpoint,dx
	testb	X86_HAVE(AVX2)+kCpuids(%rip)
	call	memjmpinit
	.init.end 300,_init_memset
|
||
|
||
/* benchmarks on intel core i7-6700 @ 3.40GHz (skylake)
|
||
includes function call overhead (unless marked otherwise)
|
||
|
||
Your memset() for #c per n where c ≈ 0.273ns
|
||
N x1 x8 x64 mBps
|
||
------------------------------------------------------------
|
||
1 73.000 35.125 36.141 97
|
||
1 35.000 36.375 35.984 97
|
||
2 28.500 19.938 18.820 185
|
||
3 19.000 12.458 12.651 276
|
||
4 15.750 10.719 9.566 365
|
||
7 5.000 5.411 5.730 609
|
||
8 8.375 4.953 4.697 743
|
||
15 4.200 2.408 2.407 1450
|
||
16 7.188 2.539 2.382 1465 «
|
||
31 1.129 1.206 1.183 2950
|
||
32 15.156 2.012 1.292 2702
|
||
63 4.016 0.986 0.663 5264
|
||
64 3.547 0.967 0.684 5104
|
||
127 2.087 0.562 0.338 10311
|
||
128 1.805 0.499 0.336 10393
|
||
255 0.412 0.180 0.183 19119
|
||
256 0.160 0.170 0.169 20650
|
||
511 0.162 0.134 0.108 32214
|
||
512 0.100 0.106 0.104 33507
|
||
1023 0.110 0.095 0.082 42574
|
||
1024 0.099 0.080 0.078 44944
|
||
2047 0.155 0.154 0.154 22624
|
||
2048 0.052 0.052 0.053 66266
|
||
4095 0.098 0.099 0.099 35142
|
||
4096 0.042 0.042 0.041 84250
|
||
8191 0.072 0.073 0.072 48157
|
||
8192 0.034 0.034 0.034 101332
|
||
16383 0.059 0.059 0.059 58997
|
||
16384 0.031 0.031 0.031 112972
|
||
32767 0.054 0.054 0.054 65053
|
||
32768 0.029 0.029 0.029 119433
|
||
65535 0.069 0.069 0.068 51690
|
||
65536 0.057 0.057 0.057 61434
|
||
131071 0.066 0.066 0.066 53001
|
||
131072 0.057 0.058 0.057 60716
|
||
262143 0.066 0.065 0.065 53462
|
||
262144 0.060 0.058 0.058 60104
|
||
524287 0.067 0.068 0.072 48784
|
||
524288 0.063 0.062 0.061 56957
|
||
1048575 0.068 0.068 0.069 50353
|
||
1048576 0.062 0.060 0.062 56661
|
||
2097151 0.066 0.066 0.067 52421
|
||
2097152 0.060 0.060 0.061 57672
|
||
4194303 0.072 0.067 0.067 51910
|
||
4194304 0.062 0.061 0.062 56327
|
||
8388607 0.129 0.111 0.111 31368
|
||
8388608 0.136 0.119 0.111 31519
|
||
|
||
glibc memset() for #c per n where c ≈ 0.273ns
|
||
N x1 x8 x64 mBps
|
||
------------------------------------------------------------
|
||
1 121.000 39.125 35.547 98
|
||
1 33.000 35.875 35.172 99
|
||
2 17.500 18.312 18.070 193
|
||
3 16.333 14.542 12.411 281
|
||
4 12.250 9.344 9.215 379
|
||
7 7.571 5.732 5.453 640
|
||
8 4.625 4.641 4.623 755
|
||
15 4.467 3.158 2.478 1408
|
||
16 2.312 2.289 2.468 1414
|
||
31 2.290 1.367 1.278 2731
|
||
32 1.219 1.176 1.182 2952
|
||
63 0.905 0.696 0.656 5320
|
||
64 0.672 0.658 0.660 5285
|
||
127 1.299 0.723 0.673 5183
|
||
128 0.508 0.423 0.424 8227
|
||
255 0.490 0.428 0.417 8367
|
||
256 0.293 0.233 0.243 14349
|
||
511 0.284 0.232 0.234 14902
|
||
512 0.154 0.131 0.131 26626
|
||
1023 0.155 0.137 0.135 25839
|
||
1024 0.089 0.078 0.080 43875
|
||
2047 0.103 0.092 0.090 38672
|
||
2048 0.060 0.054 0.054 65116
|
||
4095 0.073 0.068 0.068 51405
|
||
4096 0.046 0.042 0.042 82162
|
||
8191 0.060 0.058 0.057 60739
|
||
8192 0.036 0.034 0.034 101467
|
||
16383 0.052 0.052 0.051 68594
|
||
16384 0.031 0.031 0.031 112603
|
||
32767 0.053 0.050 0.049 70850
|
||
32768 0.032 0.029 0.029 119617
|
||
65535 0.067 0.067 0.067 52015
|
||
65536 0.058 0.058 0.058 60440
|
||
131071 0.067 0.066 0.065 53518
|
||
131072 0.059 0.058 0.058 60281
|
||
262143 0.066 0.065 0.065 54005
|
||
262144 0.058 0.058 0.058 60121
|
||
524287 0.067 0.067 0.067 52349
|
||
524288 0.061 0.061 0.064 54699
|
||
1048575 0.068 0.067 0.067 51876
|
||
1048576 0.061 0.061 0.061 56775
|
||
2097151 0.068 0.068 0.068 51379
|
||
2097152 0.062 0.062 0.062 56513
|
||
4194303 0.069 0.068 0.069 50580
|
||
4194304 0.063 0.064 0.063 55751
|
||
8388607 0.120 0.118 0.120 28998
|
||
8388608 0.137 0.123 0.117 29936
|
||
|
||
GCC (Inline REP STOSB) for #c per n where c ≈ 0.273ns
|
||
N x1 x8 x64 mBps
|
||
------------------------------------------------------------
|
||
1 413.000 434.125 441.453 8
|
||
1 431.000 436.125 438.953 8
|
||
2 223.500 224.438 224.836 16
|
||
3 149.000 150.042 623.786 6
|
||
4 108.750 109.531 110.559 32
|
||
7 62.714 63.196 63.266 55
|
||
8 56.375 56.641 56.838 61
|
||
15 30.467 30.708 30.761 113
|
||
16 24.062 24.023 24.038 145
|
||
31 14.548 14.859 14.876 235
|
||
32 9.719 9.691 9.730 359
|
||
63 7.286 7.312 7.339 476
|
||
64 3.609 3.705 3.721 938
|
||
127 1.976 2.058 2.067 1689
|
||
128 0.414 0.405 0.409 8532
|
||
255 0.890 0.907 0.911 3832
|
||
256 0.215 0.217 0.218 16039
|
||
511 0.476 0.481 0.480 7273
|
||
512 0.119 0.119 0.119 29270
|
||
1023 0.257 0.260 0.260 13409
|
||
1024 0.073 0.073 0.074 47442
|
||
2047 0.150 0.150 0.151 23189
|
||
2048 0.049 0.050 0.050 69424
|
||
4095 0.096 0.097 0.097 36142
|
||
4096 0.040 0.040 0.040 87842
|
||
8191 0.071 0.071 0.071 49061
|
||
8192 0.034 0.033 0.034 104099
|
||
16383 0.058 0.059 0.058 59697
|
||
16384 0.030 0.031 0.030 114585
|
||
32767 0.053 0.053 0.053 66161
|
||
32768 0.029 0.029 0.029 120750
|
||
65535 0.069 0.069 0.069 50520
|
||
65536 0.058 0.058 0.058 60100
|
||
131071 0.068 0.067 0.085 40964
|
||
131072 0.076 0.072 0.063 55514
|
||
262143 0.067 0.093 0.090 38681
|
||
262144 0.073 0.062 0.077 45384
|
||
524287 0.107 0.093 0.066 52689
|
||
524288 0.061 0.060 0.062 56294
|
||
1048575 0.066 0.066 0.066 52990
|
||
1048576 0.061 0.061 0.061 57248
|
||
2097151 0.067 0.075 0.067 51887
|
||
2097152 0.061 0.061 0.061 56878
|
||
4194303 0.068 0.100 0.069 50623
|
||
4194304 0.061 0.061 0.061 57195
|
||
8388607 0.117 0.121 0.119 29441
|
||
8388608 0.118 0.119 0.162 21587
|
||
|
||
Musl memset() for #c per n where c ≈ 0.273ns
|
||
N x1 x8 x64 mBps
|
||
------------------------------------------------------------
|
||
1 49.000 35.625 35.172 99
|
||
1 33.000 34.625 35.109 99
|
||
2 17.500 17.562 18.023 194
|
||
3 20.333 14.042 12.411 281
|
||
4 11.250 9.219 9.301 375
|
||
7 11.857 6.018 5.417 644
|
||
8 4.125 4.516 4.592 760
|
||
15 4.200 2.692 2.480 1407
|
||
16 2.312 2.273 2.310 1511
|
||
31 2.097 1.786 1.342 2600
|
||
32 1.219 1.238 1.242 2811
|
||
63 0.841 0.815 0.686 5085
|
||
64 0.641 0.666 0.665 5246
|
||
127 1.000 0.718 0.690 5061
|
||
128 0.477 0.435 0.413 8451
|
||
255 0.459 0.418 0.403 8670
|
||
256 0.285 0.233 0.232 15051
|
||
511 0.256 0.230 0.228 15285
|
||
512 0.158 0.129 0.128 27170
|
||
1023 0.134 0.140 0.138 25296
|
||
1024 0.089 0.077 0.078 44891
|
||
2047 0.094 0.088 0.088 39837
|
||
2048 0.060 0.052 0.053 66075
|
||
4095 0.071 0.068 0.068 51359
|
||
4096 0.045 0.043 0.042 83178
|
||
8191 0.059 0.058 0.057 60868
|
||
8192 0.037 0.035 0.034 102662
|
||
16383 0.052 0.051 0.051 68658
|
||
16384 0.032 0.031 0.031 113568
|
||
32767 0.050 0.049 0.049 71296
|
||
32768 0.030 0.029 0.029 120029
|
||
65535 0.067 0.067 0.068 50983
|
||
65536 0.059 0.059 0.058 59665
|
||
131071 0.067 0.067 0.067 52014
|
||
131072 0.059 0.060 0.059 59211
|
||
262143 0.067 0.066 0.066 52877
|
||
262144 0.059 0.060 0.085 40900
|
||
524287 0.067 0.066 0.065 53688
|
||
524288 0.059 0.059 0.059 59112
|
||
1048575 0.066 0.066 0.066 53181
|
||
1048576 0.060 0.060 0.060 58300
|
||
2097151 0.066 0.066 0.067 52439
|
||
2097152 0.060 0.068 0.060 57924
|
||
4194303 0.069 0.067 0.080 43425
|
||
4194304 0.062 0.080 0.062 56085
|
||
8388607 0.126 0.118 0.133 26207
|
||
8388608 0.127 0.119 0.118 29643
|
||
|
||
Newlib memset() for #c per n where c ≈ 0.273ns
|
||
N x1 x8 x64 mBps
|
||
------------------------------------------------------------
|
||
1 443.000 440.875 440.078 8
|
||
1 437.000 437.375 440.453 8
|
||
2 226.500 226.438 227.461 15
|
||
3 150.333 150.625 151.151 23
|
||
4 113.250 113.281 113.770 31
|
||
7 66.714 67.232 66.998 52
|
||
8 58.375 58.828 58.811 59
|
||
15 31.000 30.858 31.264 112
|
||
16 31.438 28.523 28.317 123
|
||
31 27.839 29.536 50.533 69
|
||
32 11.281 10.918 11.068 315
|
||
63 12.302 11.907 11.863 294
|
||
64 4.703 4.396 4.404 793
|
||
127 2.732 2.719 2.712 1287
|
||
128 0.852 0.729 0.736 4742
|
||
255 1.188 1.178 1.171 2981
|
||
256 0.652 0.416 0.381 9171
|
||
511 1.474 1.629 1.662 2099
|
||
512 0.287 0.264 0.246 14204
|
||
1023 0.873 0.934 0.947 3684
|
||
1024 0.196 0.179 0.178 19604
|
||
2047 0.544 0.545 0.626 5572
|
||
2048 0.257 0.257 0.253 13779
|
||
4095 0.426 0.427 0.430 8110
|
||
4096 0.282 0.296 0.293 11917
|
||
8191 0.374 0.370 0.371 9402
|
||
8192 0.297 0.310 0.400 8717
|
||
16383 0.346 0.345 0.433 8062
|
||
16384 0.313 0.312 0.311 11223
|
||
32767 0.334 0.332 0.332 10505
|
||
32768 0.313 0.313 0.358 9759
|
||
65535 0.335 0.327 0.330 10589
|
||
65536 0.330 0.312 0.337 10347
|
||
131071 0.350 0.339 0.355 9825
|
||
131072 0.334 0.329 0.359 9728
|
||
262143 0.346 0.352 0.357 9785
|
||
262144 0.350 0.375 0.482 7243
|
||
524287 0.348 0.346 0.360 9691
|
||
524288 0.347 0.346 0.385 9063
|
||
1048575 0.358 0.375 0.383 9114
|
||
1048576 0.355 0.382 0.388 8987
|
||
2097151 0.362 0.368 0.390 8956
|
||
2097152 0.363 0.375 0.387 9016
|
||
4194303 0.361 0.379 0.385 9073
|
||
4194304 0.366 0.376 0.385 9074
|
||
8388607 0.363 0.366 0.372 9391
|
||
8388608 0.419 0.374 0.370 9428 */
|