mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-01-31 11:37:35 +00:00
1ff9ab95ac
This change enables Address Sanitizer systemically w/ `make MODE=dbg`. Our version of Rust's `unsafe` keyword is named `noasan`, which is used for two functions that do aligned memory chunking, like `strcpy.c`, and we need to fix the tiny DEFLATE code, but that's it; everything else is fabulous. You can have all the Fisher-Price security blankets you need. Best of all is we're now able to use the ASAN data in Blinkenlights to colorize the memory dumps. See the screenshot below of a test program: https://justine.lol/blinkenlights/asan.png which is operating on float arrays stored on the stack, with red areas indicating poisoned memory and green areas indicating valid memory.
551 lines
27 KiB
ArmAsm
551 lines
27 KiB
ArmAsm
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||
│ │
|
||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||
│ any purpose with or without fee is hereby granted, provided that the │
|
||
│ above copyright notice and this permission notice appear in all copies. │
|
||
│ │
|
||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||
@fileoverview Cosmopolitan Memory Copying
|
||
|
||
Of all the functions in the technology industry, none are more
|
||
critical than the Kernighan & Ritchie Memory Copy API for the C
|
||
Language, 1972 model: more commonly known as memcpy(). It's the
|
||
world's most popular function──one all programmers love.
|
||
|
||
This implementation is the fastest and nearly the tiniest too.
|
||
It doesn't break when copying backwards or on misaligned data.
|
||
It's so easy that even a child could use it, and they do.
|
||
*/
|
||
#include "libc/nexgen32e/x86feature.h"
|
||
#include "libc/macros.h"
|
||
|
||
/ Copies memory.
|
||
/
|
||
/ DEST and SRC must not overlap, unless DEST≤SRC.
|
||
/
|
||
/ @param rdi is dest
|
||
/ @param rsi is src
|
||
/ @param rdx is number of bytes
|
||
/ @return original rdi copied to rax
|
||
/ @mode long
|
||
/ @asyncsignalsafe
|
||
// Public entry point. Stashes the original dest pointer in rax as the
// return value. There is no ret or jmp here: execution continues
// straight into the code that follows this symbol (MemCpy).
memcpy: mov %rdi,%rax
|
||
/ 𝑠𝑙𝑖𝑑𝑒
|
||
// Pad to a 16-byte boundary so the code that follows starts aligned.
// NOTE(review): the fall-through relies on the padding being
// executable filler (presumably NOPs) -- confirm for this toolchain.
.align 16
|
||
.endfn memcpy,globl
|
||
|
||
//	Copies memory w/ minimal impact ABI.
//
//	The byte count, clamped to .Lmemcpytab.ro.size, indexes the
//	memcpytab jump table. Counts below that size land on a handler
//	specialized for their size class; every larger count shares the
//	final table slot. The small handlers load both ends of the
//	buffer before storing anything and let the two stores overlap,
//	so no per-byte loop is needed.
//
//	Every path leaves rdi, rsi and rdx as the caller passed them.
//
//	@param	rdi is dest
//	@param	rsi is src
//	@param	rdx is number of bytes
//	@clob	flags,rcx,xmm3,xmm4
//	@mode	long
MemCpy:	.leafprologue
	.profilable
//	rcx = min(rdx, table size) selects the handler slot.
	mov	$.Lmemcpytab.ro.size,%ecx
	cmp	%rcx,%rdx
	cmovb	%rdx,%rcx
	jmp	*memcpytab(,%rcx,8)
//	The jump table stores handler addresses as byte offsets from
//	this anchor.
.Lanchorpoint:
//	32-byte vector loop, with a handoff to .Lerms at 1024+ bytes.
.L32r:	cmp	$1024,%rdx
	jae	.Lerms
//	32-byte vector loop. The final 32 bytes are loaded up front and
//	stored last, so the forward loop may stop short of the end.
.L32:	vmovdqu	-32(%rsi,%rdx),%ymm4
	mov	$32,%rcx
0:	add	$32,%rcx
	vmovdqu	-64(%rsi,%rcx),%ymm3
	vmovdqu	%ymm3,-64(%rdi,%rcx)
	cmp	%rcx,%rdx
	ja	0b
	vmovdqu	%ymm4,-32(%rdi,%rdx)
//	Zero the vector scratch registers before returning.
	vxorps	%ymm4,%ymm4,%ymm4
	vxorps	%ymm3,%ymm3,%ymm3
	jmp	.L0
//	16-byte vector loop, with a handoff to .Lerms at 1024+ bytes.
.L16r:	cmp	$1024,%rdx
	jae	.Lerms
//	16-byte vector loop; same shape as .L32 using xmm registers.
.L16:	movdqu	-16(%rsi,%rdx),%xmm4
	mov	$16,%rcx
0:	add	$16,%rcx
	movdqu	-32(%rsi,%rcx),%xmm3
	movdqu	%xmm3,-32(%rdi,%rcx)
	cmp	%rcx,%rdx
	ja	0b
	movdqu	%xmm4,-16(%rdi,%rdx)
	pxor	%xmm4,%xmm4
	pxor	%xmm3,%xmm3
	jmp	.L0
//	8 to 15 bytes: first and last quadword, possibly overlapping.
//	rbx is borrowed as a second scratch register and restored at 1.
.L8:	push	%rbx
	mov	(%rsi),%rcx
	mov	-8(%rsi,%rdx),%rbx
	mov	%rcx,(%rdi)
	mov	%rbx,-8(%rdi,%rdx)
//	Shared exit for the handlers that borrowed rbx.
1:	pop	%rbx
//	Zero bytes: nothing to copy. Also the common return point.
.L0:	.leafepilogue
//	4 to 7 bytes: first and last doubleword, possibly overlapping.
.L4:	push	%rbx
	mov	(%rsi),%ecx
	mov	-4(%rsi,%rdx),%ebx
	mov	%ecx,(%rdi)
	mov	%ebx,-4(%rdi,%rdx)
	jmp	1b
//	3 bytes: two overlapping 16-bit moves.
.L3:	push	%rbx
	mov	(%rsi),%cx
	mov	-2(%rsi,%rdx),%bx
	mov	%cx,(%rdi)
	mov	%bx,-2(%rdi,%rdx)
	jmp	1b
//	2 bytes.
.L2:	mov	(%rsi),%cx
	mov	%cx,(%rdi)
	jmp	.L0
//	1 byte.
.L1:	mov	(%rsi),%cl
	mov	%cl,(%rdi)
	jmp	.L0
//	Large copies. Up to kHalfCache3 bytes go through rep movsb,
//	which advances rdi and rsi, so both are saved and restored.
//	Anything bigger takes the non-temporal path.
.Lerms:	cmp	kHalfCache3(%rip),%rdx
	ja	.Lnts
	push	%rdi
	push	%rsi
	mov	%rdx,%rcx
	rep movsb
	pop	%rsi
	pop	%rdi
	jmp	.L0
//	Non-temporal path. The pointers and the count are rebased below
//	so that the streaming stores hit 16-byte aligned addresses. The
//	caller's rdi, rsi and rdx are therefore saved here and restored
//	on exit, keeping this path within the @clob list above.
.Lnts:	push	%rdi
	push	%rsi
	push	%rdx
//	Copy the first 16 bytes unaligned, then skip ahead by 1 to 16
//	bytes so that rdi becomes 16-byte aligned.
	movdqu	(%rsi),%xmm3
	movdqu	%xmm3,(%rdi)
	lea	16(%rdi),%rcx
	and	$-16,%rcx
	sub	%rdi,%rcx
	add	%rcx,%rdi
	add	%rcx,%rsi
	sub	%rcx,%rdx
	mov	$16,%rcx
0:	add	$16,%rcx
	movdqu	-32(%rsi,%rcx),%xmm3
	movntdq	%xmm3,-32(%rdi,%rcx)
	cmp	%rcx,%rdx
	ja	0b
//	Fence the streaming stores, then finish with an ordinary
//	unaligned copy of the last 16 bytes.
	sfence
	movdqu	-16(%rsi,%rdx),%xmm3
	movdqu	%xmm3,-16(%rdi,%rdx)
	pxor	%xmm3,%xmm3
	pop	%rdx
	pop	%rsi
	pop	%rdi
	jmp	.L0
	.endfn	MemCpy,globl,hidden
|
||
.source __FILE__
|
||
|
||
// Compact read-only description of the MemCpy jump table: one byte
// per byte count, holding the handler's offset from .Lanchorpoint.
// Entry i is the handler for an i-byte copy:
//   0 -> .L0, 1 -> .L1, 2 -> .L2, 3 -> .L3,
//   4..7 -> .L4, 8..15 -> .L8, 16..31 -> .L16
// Because each offset is a single byte, every handler named here has
// to stay close to .Lanchorpoint.
// NOTE(review): .initro comes from the project macros; presumably it
// switches to a read-only init-data section -- confirm.
.initro 300,_init_memcpy
|
||
memcpytab.ro:
|
||
.byte .L0-.Lanchorpoint
|
||
.byte .L1-.Lanchorpoint
|
||
.byte .L2-.Lanchorpoint
|
||
.byte .L3-.Lanchorpoint
|
||
.rept 4
|
||
.byte .L4-.Lanchorpoint
|
||
.endr
|
||
.rept 8
|
||
.byte .L8-.Lanchorpoint
|
||
.endr
|
||
.rept 16
|
||
.byte .L16-.Lanchorpoint
|
||
.endr
|
||
// The size counts only the per-count entries above, not the bytes
// that follow .endobj. MemCpy clamps its index to this value.
.equ .Lmemcpytab.ro.size,.-memcpytab.ro
|
||
.endobj memcpytab.ro
|
||
// Fail the build unless the entry count is a multiple of eight.
.if .Lmemcpytab.ro.size % 8
|
||
.error "moar jmptab"
|
||
.endif
|
||
// Four candidate handlers for the final slot, which MemCpy uses for
// every count >= .Lmemcpytab.ro.size.
// NOTE(review): presumably memjmpinit picks one of these from the
// CPU features named in the trailing comments -- confirm there.
.byte .L16-.Lanchorpoint # SSE2
|
||
.byte .L16r-.Lanchorpoint # SSE2 + ERMS
|
||
.byte .L32-.Lanchorpoint # AVX
|
||
.byte .L32r-.Lanchorpoint # AVX + ERMS
|
||
// NOTE(review): looks like padding to a full 8 bytes -- confirm
// against what memjmpinit reads.
.byte 0,0,0,0
|
||
.previous
|
||
|
||
// Runtime jump table that MemCpy indexes with
// jmp *memcpytab(,%rcx,8): 8-byte slots, all zero in the image.
// NOTE(review): .initbss comes from the project macros; presumably
// the slots are filled at startup by memjmpinit -- confirm.
.initbss 300,_init_memcpy
|
||
memcpytab:
|
||
// One slot per byte count below .Lmemcpytab.ro.size.
.rept .Lmemcpytab.ro.size
|
||
.quad 0
|
||
.endr
|
||
// One extra slot at index .Lmemcpytab.ro.size, reached by every
// count at or above that size (MemCpy clamps its index to it).
.quad 0
|
||
.endobj memcpytab
|
||
.previous
|
||
|
||
// Startup fragment that hands the table parameters to memjmpinit.
// NOTE(review): .init.start, pushpop, ezlea and X86_HAVE are project
// macros not visible here; the notes below are inferred from their
// operands -- confirm against libc/macros.h and memjmpinit.
.init.start 300,_init_memcpy
|
||
// Presumably loads the per-count entry count into rcx.
pushpop .Lmemcpytab.ro.size,%rcx
|
||
// Presumably loads the address of .Lanchorpoint into rdx, the base
// that the byte offsets in memcpytab.ro are relative to.
ezlea .Lanchorpoint,dx
|
||
// Tests the AVX feature bit in kCpuids; the resulting flags are
// presumably consumed by memjmpinit.
testb X86_HAVE(AVX)+kCpuids(%rip)
|
||
call memjmpinit
|
||
.init.end 300,_init_memcpy
|
||
|
||
/* your memcpy() 375 bytes
|
||
bionic memcpy() 1,429 bytes
|
||
glibc memcpy() 27,216 bytes
|
||
musl memcpy() 49 bytes
|
||
newlib memcpy() 300 bytes
|
||
|
||
benchmarks on intel core i7-6700 @ 3.40GHz (skylake)
|
||
includes function call overhead (unless marked otherwise)
|
||
|
||
your memcpy(𝑛) for #c per n where c ≈ 0.293ns
|
||
N x1 x8 x64 mBps
|
||
------------------------------------------------------------
|
||
1 297.000 35.125 35.203 92
|
||
1 35.000 35.625 35.016 93
|
||
2 27.500 17.438 17.555 185
|
||
3 21.000 11.875 12.057 270
|
||
4 16.250 8.719 8.809 369
|
||
7 5.000 4.946 5.069 641
|
||
8 7.375 4.422 4.365 745
|
||
15 4.067 2.342 2.336 1391
|
||
16 4.188 2.242 2.257 1440 «
|
||
31 8.032 1.157 1.147 2835
|
||
32 2.031 1.723 1.325 2454
|
||
63 1.000 0.589 0.589 5523
|
||
64 0.578 0.580 0.577 5630 «
|
||
127 0.638 0.377 0.320 10151
|
||
128 0.289 0.296 0.307 10605
|
||
255 0.404 0.202 0.194 16741
|
||
256 0.160 0.165 0.166 19574 «
|
||
511 0.159 0.123 0.110 29458
|
||
512 0.139 0.098 0.097 33571 «
|
||
1023 0.107 0.086 0.074 44111
|
||
1024 0.103 0.084 0.082 39489
|
||
2047 0.057 0.056 0.057 57450
|
||
2048 0.055 0.055 0.055 59269
|
||
4095 0.044 0.044 0.044 74051
|
||
4096 0.043 0.043 0.043 75300 «
|
||
8191 0.036 0.036 0.036 91301
|
||
8192 0.036 0.035 0.035 92411
|
||
16383 0.033 0.032 0.032 102163
|
||
16384 0.034 0.032 0.032 102145 « (L1)/2
|
||
32767 0.098 0.081 0.077 42271
|
||
32768 0.077 0.077 0.076 42781
|
||
65535 0.088 0.075 0.072 44973
|
||
65536 0.074 0.072 0.071 45520
|
||
131071 0.086 0.075 0.072 44869
|
||
131072 0.077 0.073 0.072 45076 « (L2)/2
|
||
262143 0.095 0.096 0.095 34116
|
||
262144 0.096 0.096 0.095 34160
|
||
524287 0.102 0.109 0.111 29359
|
||
524288 0.107 0.109 0.108 30033
|
||
1048575 0.102 0.103 0.104 31112
|
||
1048576 0.101 0.103 0.103 31605
|
||
2097151 0.104 0.103 0.109 29929
|
||
2097152 0.108 0.110 0.103 31652
|
||
4194303 0.192 0.172 0.172 18950
|
||
4194304 0.168 0.161 0.160 20311 « (L3)/2
|
||
8388607 0.339 0.329 0.344 9461 « RAM
|
||
8388608 0.384 0.369 0.341 9545
|
||
|
||
Bionic memcpy() for #c per n where c ≈ 0.293ns
|
||
N x1 x8 x64 mBps
|
||
------------------------------------------------------------
|
||
1 347.000 40.625 35.984 90
|
||
1 37.000 35.625 36.734 89
|
||
2 28.500 18.688 18.383 177
|
||
3 11.667 12.375 12.359 263
|
||
4 12.250 9.406 9.020 361
|
||
7 5.000 5.018 5.118 636
|
||
8 11.625 5.828 4.779 681
|
||
15 3.533 3.158 2.620 1243
|
||
16 4.688 2.742 2.884 1129 «
|
||
31 1.903 1.262 1.172 2778
|
||
32 1.344 1.113 1.125 2895
|
||
63 1.444 0.633 0.591 5513
|
||
64 0.766 0.580 0.581 5605 «
|
||
127 0.512 0.383 0.318 10229
|
||
128 0.461 0.315 0.311 10463
|
||
255 0.475 0.216 0.193 16840
|
||
256 0.371 0.236 0.199 16397 «
|
||
511 0.295 0.144 0.120 27223
|
||
512 0.240 0.151 0.126 25937 «
|
||
1023 0.142 0.101 0.088 36947
|
||
1024 0.126 0.108 0.091 35889
|
||
2047 0.088 0.074 0.072 45475
|
||
2048 0.089 0.077 0.073 44380
|
||
4095 0.081 0.065 0.064 50766
|
||
4096 0.068 0.066 0.065 50246 «
|
||
8191 0.063 0.061 0.060 54075
|
||
8192 0.065 0.061 0.061 53731
|
||
16383 0.082 0.066 0.061 53765
|
||
16384 0.067 0.063 0.062 52765 « (L1)/2
|
||
32767 0.102 0.085 0.085 38406
|
||
32768 0.086 0.085 0.085 38473
|
||
65535 0.098 0.085 0.085 38292
|
||
65536 0.086 0.085 0.085 38369
|
||
131071 0.438 0.177 0.089 36716
|
||
131072 0.092 0.090 0.093 34880 « (L2)/2
|
||
262143 0.306 0.146 0.127 25601
|
||
262144 0.126 0.168 0.127 25704
|
||
524287 0.213 0.152 0.136 23993
|
||
524288 0.132 0.159 0.133 24570
|
||
1048575 0.127 0.129 0.130 25117
|
||
1048576 0.128 0.129 0.130 25107
|
||
2097151 0.127 0.127 0.129 25199
|
||
2097152 0.127 0.136 0.134 24274
|
||
4194303 0.216 0.192 0.228 14237
|
||
4194304 0.351 0.351 0.356 9139 « (L3)/2
|
||
8388607 0.323 0.293 0.298 10903 « RAM
|
||
8388608 0.365 0.296 0.300 10844
|
||
|
||
GCC builtin (Inline REP MOVSB) for #c per n where c ≈ 0.293ns
|
||
N x1 x8 x64 mBps
|
||
------------------------------------------------------------
|
||
1 53.000 50.625 50.453 64
|
||
1 47.000 49.375 49.141 66
|
||
2 23.500 25.062 24.898 131
|
||
3 15.667 16.792 16.880 193
|
||
4 11.750 12.531 12.957 251
|
||
7 7.000 7.125 7.190 452
|
||
8 6.125 7.578 6.322 514
|
||
15 3.133 3.325 3.372 964
|
||
16 3.062 3.117 3.132 1038 «
|
||
31 1.645 1.601 1.620 2007
|
||
32 1.531 1.559 1.585 2051
|
||
63 0.778 0.796 0.802 4056
|
||
64 0.766 0.768 0.767 4238 «
|
||
127 0.480 0.446 0.448 7259
|
||
128 0.445 0.419 0.423 7693
|
||
255 0.239 0.239 0.236 13781
|
||
256 0.238 0.225 0.225 14466 «
|
||
511 0.127 0.133 0.132 24555
|
||
512 0.123 0.127 0.128 25377 «
|
||
1023 0.079 0.081 0.081 40346
|
||
1024 0.075 0.077 0.078 41714
|
||
2047 0.053 0.055 0.055 59575
|
||
2048 0.053 0.053 0.053 60795
|
||
4095 0.042 0.043 0.043 75843
|
||
4096 0.042 0.042 0.042 77153
|
||
8191 0.035 0.036 0.036 91518
|
||
8192 0.035 0.035 0.035 92603
|
||
16383 0.032 0.032 0.032 102407
|
||
16384 0.033 0.032 0.032 102864 « (L1)/2
|
||
32767 0.106 0.082 0.078 41486
|
||
32768 0.079 0.078 0.079 41290
|
||
65535 0.090 0.077 0.075 43565
|
||
65536 0.074 0.074 0.073 44299
|
||
131071 0.091 0.078 0.075 43196
|
||
131072 0.078 0.076 0.074 43673 « (L2)/2
|
||
262143 0.097 0.099 0.098 33192
|
||
262144 0.098 0.098 0.098 33193
|
||
524287 0.105 0.111 0.111 29212
|
||
524288 0.109 0.111 0.111 29211
|
||
1048575 0.107 0.108 0.108 30069
|
||
1048576 0.106 0.112 0.105 30886
|
||
2097151 0.105 0.103 0.103 31621
|
||
2097152 0.102 0.103 0.104 31280
|
||
4194303 0.180 0.158 0.176 18456
|
||
4194304 0.167 0.155 0.154 21098 « (L3)/2
|
||
8388607 0.538 0.576 0.557 5834 « RAM
|
||
8388608 0.750 0.579 0.552 5893
|
||
|
||
glibc memcpy() for #c per n where c ≈ 0.293ns
|
||
N x1 x8 x64 mBps
|
||
------------------------------------------------------------
|
||
1 139.000 90.125 84.891 38
|
||
1 83.000 82.125 84.359 39
|
||
2 61.500 46.438 45.164 72
|
||
3 41.667 32.458 31.245 104
|
||
4 32.750 26.156 24.410 133
|
||
7 20.143 16.732 16.033 203
|
||
8 13.375 8.328 6.908 471
|
||
15 8.200 6.408 5.753 565
|
||
16 4.438 3.570 3.466 938 «
|
||
31 3.258 2.891 2.786 1167
|
||
32 2.281 1.801 1.732 1878
|
||
63 1.635 1.431 1.374 2367
|
||
64 1.109 0.896 0.868 3747 «
|
||
127 0.921 0.792 0.779 4176
|
||
128 0.508 0.511 0.494 6589
|
||
255 0.451 0.407 0.402 8081
|
||
256 0.324 0.269 0.260 12498 «
|
||
511 0.249 0.218 0.212 15335
|
||
512 0.178 0.149 0.146 22297 «
|
||
1023 0.138 0.124 0.121 26947
|
||
1024 0.087 0.089 0.087 37238
|
||
2047 0.084 0.077 0.076 43046
|
||
2048 0.066 0.059 0.058 56120
|
||
4095 0.058 0.054 0.054 60706
|
||
4096 0.050 0.046 0.046 71092 «
|
||
8191 0.043 0.042 0.042 78259
|
||
8192 0.037 0.037 0.037 87409
|
||
16383 0.037 0.036 0.035 92065
|
||
16384 0.034 0.034 0.033 97942 « (L1)/2
|
||
32767 0.104 0.084 0.080 40572
|
||
32768 0.079 0.079 0.079 41055
|
||
65535 0.094 0.080 0.076 42885
|
||
65536 0.077 0.075 0.075 43423
|
||
131071 0.092 0.080 0.078 41498
|
||
131072 0.082 0.078 0.077 42350 « (L2)/2
|
||
262143 0.100 0.101 0.287 11342
|
||
262144 0.099 0.099 0.098 33177
|
||
524287 0.106 0.111 0.110 29609
|
||
524288 0.107 0.119 0.110 29608
|
||
1048575 0.104 0.105 0.106 30626
|
||
1048576 0.104 0.111 0.105 30878
|
||
2097151 0.103 0.103 0.103 31606
|
||
2097152 0.102 0.103 0.103 31644
|
||
4194303 0.174 0.160 0.165 19714
|
||
4194304 0.166 0.157 0.154 21110 « (L3)/2
|
||
8388607 0.537 0.554 0.565 5750 « RAM
|
||
8388608 0.701 0.537 0.552 5884
|
||
|
||
musl memcpy() for #c per n where c ≈ 0.293ns
|
||
N x1 x8 x64 mBps
|
||
------------------------------------------------------------
|
||
1 97.000 80.625 79.891 41
|
||
1 77.000 78.875 78.266 42
|
||
2 49.500 44.062 42.102 77
|
||
3 33.667 32.792 30.651 106
|
||
4 29.750 24.281 24.137 135
|
||
7 19.000 16.161 15.734 207
|
||
8 12.125 7.766 6.721 484
|
||
15 8.867 5.892 5.714 569
|
||
16 5.062 3.742 3.458 940
|
||
31 3.645 2.915 2.715 1198
|
||
32 2.156 1.723 1.663 1956
|
||
63 1.540 1.367 1.333 2440
|
||
64 1.078 0.873 0.833 3905
|
||
127 0.874 0.771 0.737 4415
|
||
128 0.617 0.487 0.481 6766
|
||
255 0.443 0.390 0.382 8504
|
||
256 0.316 0.259 0.259 12545
|
||
511 0.245 0.232 0.237 13742
|
||
512 0.174 0.159 0.208 15668
|
||
1023 0.181 0.193 0.182 17821
|
||
1024 0.155 0.123 0.114 28579
|
||
2047 0.102 0.092 0.085 38219
|
||
2048 0.064 0.073 0.070 46577
|
||
4095 0.058 0.067 0.065 50272
|
||
4096 0.049 0.055 0.055 59467
|
||
8191 0.057 0.052 0.049 66468
|
||
8192 0.053 0.050 0.051 63557
|
||
16383 0.082 0.065 0.064 50897
|
||
16384 0.066 0.065 0.061 53697 « (L1)/2
|
||
32767 0.121 0.100 0.114 28555
|
||
32768 0.093 0.091 0.114 28615
|
||
65535 0.118 0.102 0.142 22858
|
||
65536 0.108 0.274 0.097 33432
|
||
131071 0.117 0.109 0.109 29905
|
||
131072 0.110 0.195 0.113 28692 « (L2)/2
|
||
262143 0.283 0.166 0.122 26638
|
||
262144 0.130 0.144 0.123 26544
|
||
524287 0.210 0.153 0.130 25079
|
||
524288 0.126 0.128 0.123 26422
|
||
1048575 0.139 0.107 0.106 30803
|
||
1048576 0.104 0.105 0.106 30683
|
||
2097151 0.103 0.103 0.103 31564
|
||
2097152 0.102 0.103 0.103 31531
|
||
4194303 0.242 0.158 0.169 19238
|
||
4194304 0.166 0.161 0.154 21072 « (L3)/2
|
||
8388607 0.533 0.549 0.599 5422 « RAM
|
||
8388608 0.768 0.630 0.560 5801
|
||
|
||
newlib (aka. cygwin) memcpy() for #c per n where c ≈ 0.293ns
|
||
N x1 x8 x64 mBps
|
||
------------------------------------------------------------
|
||
1 61.000 52.875 53.141 61
|
||
1 49.000 49.875 50.328 65
|
||
2 24.500 24.812 26.727 122
|
||
3 15.667 20.125 16.943 192
|
||
4 12.750 15.281 13.090 248
|
||
7 7.000 7.375 7.431 438
|
||
8 5.875 6.422 6.377 510
|
||
15 3.267 3.375 3.447 943
|
||
16 10.062 6.945 6.386 509
|
||
31 2.548 2.488 2.545 1278
|
||
32 3.156 3.207 3.201 1016
|
||
63 1.190 1.220 1.229 2646
|
||
64 1.578 1.588 1.599 2033
|
||
127 0.717 0.690 0.685 4744
|
||
128 0.820 0.856 0.857 3795
|
||
255 0.357 0.359 0.358 9077
|
||
256 0.629 0.461 0.426 7630
|
||
511 0.260 0.219 0.204 15947
|
||
512 0.330 0.299 0.268 12113
|
||
1023 0.269 0.175 0.162 20042
|
||
1024 0.315 0.201 0.196 16633
|
||
2047 0.349 0.241 0.236 13790
|
||
2048 0.332 0.269 0.264 12295
|
||
4095 0.349 0.295 0.287 11348
|
||
4096 0.361 0.313 0.303 10748
|
||
8191 0.361 0.317 0.322 10110
|
||
8192 0.369 0.326 0.319 10201
|
||
16383 0.321 0.322 0.327 9940
|
||
16384 0.309 0.330 0.329 9878 « (L1)/2
|
||
32767 0.291 0.303 0.307 10599
|
||
32768 0.314 0.304 0.305 10667
|
||
65535 0.373 0.311 0.313 10396
|
||
65536 0.305 0.750 0.421 7729
|
||
131071 0.329 0.427 0.384 8470
|
||
131072 0.329 0.388 0.361 9020 « (L2)/2
|
||
262143 0.520 0.389 0.425 7646
|
||
262144 0.364 0.400 0.368 8843
|
||
524287 0.449 0.389 0.389 8353
|
||
524288 0.384 0.379 0.384 8466
|
||
1048575 0.436 0.397 0.401 8107
|
||
1048576 0.431 0.397 0.401 8112
|
||
2097151 0.417 0.567 0.434 7498
|
||
2097152 0.457 0.503 0.427 7621
|
||
4194303 0.328 0.348 0.368 8822
|
||
4194304 0.343 0.352 0.352 9221 « (L3)/2
|
||
8388607 0.313 0.319 0.326 9957 « RAM
|
||
8388608 0.366 0.320 0.328 9910
|
||
|
||
openbsd memcpy() for #c per n where c ≈ 0.293ns
|
||
N x1 x8 x64 mBps
|
||
------------------------------------------------------------
|
||
1 73.000 41.375 41.484 78
|
||
1 39.000 39.875 41.641 78
|
||
2 28.500 20.688 21.227 153
|
||
3 27.000 15.875 15.557 209
|
||
4 16.750 12.656 12.520 260
|
||
7 20.429 10.982 10.292 316
|
||
8 8.625 5.234 5.576 583
|
||
15 7.267 4.758 4.920 661
|
||
16 4.312 2.742 2.747 1183
|
||
31 4.613 2.891 2.555 1272
|
||
32 2.844 1.520 1.441 2256
|
||
63 2.397 1.268 1.328 2449
|
||
64 1.547 0.822 0.769 4226
|
||
127 1.189 0.782 0.671 4842
|
||
128 0.727 0.532 0.460 7066
|
||
255 0.631 0.463 0.414 7856
|
||
256 0.543 0.374 0.302 10775
|
||
511 0.542 0.316 0.276 11785
|
||
512 0.354 0.260 0.224 14494
|
||
1023 0.267 0.245 0.229 14201
|
||
1024 0.251 0.200 0.197 16496
|
||
2047 0.214 0.226 0.181 17941
|
||
2048 0.189 0.167 0.166 19575
|
||
4095 0.200 0.168 0.163 19957
|
||
4096 0.165 0.155 0.153 21219
|
||
8191 0.158 0.153 0.151 21578
|
||
8192 0.153 0.148 0.147 22138
|
||
16383 0.173 0.148 0.146 22319
|
||
16384 0.153 0.487 0.188 17298 « (L1)/2
|
||
32767 0.161 0.151 0.192 16893
|
||
32768 0.151 0.314 0.213 15275
|
||
65535 0.157 0.154 0.148 21969
|
||
65536 0.147 0.145 0.145 22493
|
||
131071 0.152 0.151 0.154 21145
|
||
131072 0.148 0.229 0.158 20564 « (L2)/2
|
||
262143 0.320 0.183 0.162 20031
|
||
262144 0.330 0.205 0.167 19503
|
||
524287 0.159 0.171 0.163 19913
|
||
524288 0.250 0.189 0.162 20120
|
||
1048575 0.157 0.164 0.161 20182
|
||
1048576 0.155 0.156 0.157 20672
|
||
2097151 0.161 0.158 0.157 20644
|
||
2097152 0.158 0.157 0.165 19727
|
||
4194303 0.327 0.256 0.238 13684
|
||
4194304 0.232 0.220 0.236 13749 « (L3)/2
|
||
8388607 0.721 0.689 0.586 5549 « RAM
|
||
8388608 0.943 0.569 0.593 5481 */
|