mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-02-12 17:27:56 +00:00
I wanted a tiny, scriptable, Meltdown-proof way to run userspace programs and visualize how program execution impacts memory. It helps to explain how things like Actually Portable Executable work. It can show you how GCC-generated code goes about manipulating matrices, and more. I didn't feel fully comfortable with Qemu and Bochs because I'm not smart enough to understand them. I wanted something like gVisor but with much stronger levels of assurance. I wanted a single binary that'll run on all major operating systems, with an embedded GPL-barrier ZIP filesystem, that is tiny enough to transpile to JavaScript and run in browsers too. https://justine.storage.googleapis.com/emulator625.mp4
206 lines
12 KiB
ArmAsm
206 lines
12 KiB
ArmAsm
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
|
│vi: set et ft=asm ts=8 sw=8 fenc=utf-8 :vi│
|
|
╞══════════════════════════════════════════════════════════════════════════════╡
|
|
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
|
│ │
|
|
│ This program is free software; you can redistribute it and/or modify │
|
|
│ it under the terms of the GNU General Public License as published by │
|
|
│ the Free Software Foundation; version 2 of the License. │
|
|
│ │
|
|
│ This program is distributed in the hope that it will be useful, but │
|
|
│ WITHOUT ANY WARRANTY; without even the implied warranty of │
|
|
│ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU │
|
|
│ General Public License for more details. │
|
|
│ │
|
|
│ You should have received a copy of the GNU General Public License │
|
|
│ along with this program; if not, write to the Free Software │
|
|
│ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA │
|
|
│ 02110-1301 USA │
|
|
╚─────────────────────────────────────────────────────────────────────────────*/
|
|
#include "libc/macros.h"
|
|
|
|
/ Compares memory.
|
|
/
|
|
/ @param rdi first memory region
|
|
/ @param rsi second memory region
|
|
/ @param rdx byte size
|
|
/ @return unsigned char subtraction at stop index
|
|
/ @asyncsignalsafe
|
|
/* Public entry point. This is a one-instruction trampoline: an indirect
   jump through the code pointer stored in the variable __memcmp, which is
   addressed RIP-relative. The jump touches no registers and pushes nothing,
   so the caller's arguments and return address reach the target unchanged
   and the target returns directly to the original caller.
   NOTE(review): __memcmp is defined elsewhere; presumably it is set to the
   avx2 or sse2 implementation benchmarked in this file -- confirm. */
memcmp: jmp *__memcmp(%rip)
|
|
/* .endfn is a project macro, not a GAS builtin (presumably from
   libc/macros.h); it appears to close the function symbol and, via the
   `globl` argument, export it -- confirm against the macro definition. */
.endfn memcmp,globl
|
|
/* .source is likewise a project macro; it is handed the preprocessor's
   __FILE__, presumably to record the originating source path -- confirm. */
.source __FILE__
|
|
|
|
/* cosmo memcmp() avx2 for #c per n where c ≈ 0.273ns
|
|
N x1 x8 x64 mBps
|
|
------------------------------------------------------------
|
|
1 61.000 39.375 36.984 95
|
|
1 37.000 37.625 37.391 94
|
|
2 28.500 19.688 19.930 175
|
|
3 20.333 13.625 14.411 243
|
|
4 30.250 10.656 10.426 335
|
|
7 15.000 7.304 6.136 570
|
|
8 10.125 6.234 5.525 633
|
|
15 9.133 3.542 3.570 980
|
|
16 6.062 4.398 3.577 977
|
|
31 4.548 2.931 2.340 1494
|
|
32 2.594 1.520 1.492 2344
|
|
63 3.444 1.240 1.221 2864
|
|
64 1.328 0.736 0.742 4713
|
|
127 1.661 0.710 0.605 5778
|
|
128 0.820 0.452 0.396 8822
|
|
255 0.639 0.360 0.347 10080
|
|
256 0.434 0.250 0.220 15874
|
|
511 0.413 0.218 0.199 17612
|
|
512 0.201 0.176 0.138 25377
|
|
1023 0.216 0.142 0.125 28031
|
|
1024 0.132 0.097 0.096 36276
|
|
2047 0.125 0.091 0.091 38466
|
|
2048 0.093 0.079 0.075 46365
|
|
4095 0.084 0.081 0.078 44705
|
|
4096 0.069 0.069 0.069 50819
|
|
8191 0.070 0.068 0.067 51841
|
|
8192 0.063 0.062 0.062 56633
|
|
16383 0.066 0.063 0.061 56994
|
|
16384 0.059 0.058 0.058 60021
|
|
32767 0.131 0.104 0.100 34909
|
|
32768 0.120 0.084 0.079 44282
|
|
|
|
cosmo memcmp() sse2 (old cpu) for #c per n where c ≈ 0.273ns
|
|
N x1 x8 x64 mBps
|
|
------------------------------------------------------------
|
|
1 59.000 37.125 37.328 94
|
|
1 35.000 37.375 36.359 96
|
|
2 28.500 18.938 20.461 171
|
|
3 19.000 12.875 13.234 264
|
|
4 29.250 10.906 10.348 338
|
|
7 11.571 6.304 6.404 546
|
|
8 8.125 5.672 5.713 612
|
|
15 11.533 4.492 3.759 930
|
|
16 5.812 3.227 2.876 1216
|
|
31 5.516 2.367 1.797 1946
|
|
32 2.969 1.816 1.481 2361
|
|
63 3.413 0.990 0.929 3763
|
|
64 1.703 0.850 0.763 4580
|
|
127 1.614 0.531 0.533 6556
|
|
128 0.961 0.438 0.426 8205
|
|
255 0.922 0.378 0.325 10745
|
|
256 0.457 0.322 0.268 13035
|
|
511 0.331 0.253 0.216 16223
|
|
512 0.287 0.212 0.189 18460
|
|
1023 0.220 0.172 0.164 21378
|
|
1024 0.198 0.159 0.150 23357
|
|
2047 0.161 0.152 0.150 23271
|
|
2048 0.147 0.139 0.136 25732
|
|
4095 0.135 0.130 0.129 27157
|
|
4096 0.129 0.123 0.123 28499
|
|
8191 0.122 0.116 0.116 30110
|
|
8192 0.116 0.113 0.113 30863
|
|
16383 0.117 0.112 0.112 31311
|
|
16384 0.111 0.110 0.110 31802
|
|
32767 0.157 0.138 0.136 25653
|
|
32768 0.144 0.121 0.118 29590
|
|
|
|
glibc memcmp() for #c per n where c ≈ 0.273ns
|
|
N x1 x8 x64 mBps
|
|
------------------------------------------------------------
|
|
1 6875.000 39.125 35.141 100
|
|
1 33.000 35.375 35.078 100
|
|
2 138.500 20.312 18.570 188
|
|
3 26.333 13.958 12.536 279
|
|
4 53.250 12.094 9.512 368
|
|
7 13.571 5.554 5.708 613
|
|
8 19.625 5.328 5.057 691
|
|
15 6.867 3.075 2.801 1248
|
|
16 9.062 2.555 2.526 1384
|
|
31 4.484 1.319 1.313 2663
|
|
32 3.906 1.285 1.299 2691
|
|
63 2.143 0.863 0.719 4867
|
|
64 1.234 0.814 0.718 4873
|
|
127 2.071 0.493 0.428 8174
|
|
128 0.523 0.427 0.421 8310
|
|
255 0.882 0.302 0.250 13983
|
|
256 0.465 0.258 0.266 13143
|
|
511 0.417 0.189 0.164 21339
|
|
512 0.209 0.170 0.160 21862
|
|
1023 0.320 0.120 0.111 31391
|
|
1024 0.128 0.115 0.112 31106
|
|
2047 0.110 0.092 0.088 39803
|
|
2048 0.098 0.088 0.086 40837
|
|
4095 0.093 0.078 0.076 46281
|
|
4096 0.081 0.076 0.075 46400
|
|
8191 0.080 0.071 0.069 50984
|
|
8192 0.075 0.069 0.069 50970
|
|
16383 0.083 0.071 0.068 51591
|
|
16384 0.072 0.071 0.068 51736
|
|
32767 0.145 0.136 0.121 28805
|
|
32768 0.145 0.139 0.137 25469
|
|
|
|
musl memcmp() for #c per n where c ≈ 0.273ns
|
|
N x1 x8 x64 mBps
|
|
------------------------------------------------------------
|
|
1 55.000 37.625 34.484 101
|
|
1 35.000 33.625 34.203 102
|
|
2 37.500 24.562 18.648 188
|
|
3 20.333 13.625 12.766 274
|
|
4 32.750 11.531 9.527 367
|
|
7 12.714 8.482 5.828 600
|
|
8 13.125 6.234 5.330 656
|
|
15 9.000 4.892 3.391 1031
|
|
16 5.188 4.102 3.335 1048
|
|
31 4.806 2.899 2.295 1524
|
|
32 4.406 2.801 2.208 1584
|
|
63 3.794 1.808 1.689 2070
|
|
64 2.672 1.994 1.675 2088
|
|
127 1.961 1.739 1.648 2122
|
|
128 2.055 1.610 1.614 2167
|
|
255 1.463 1.381 1.401 2496
|
|
256 1.457 1.362 1.385 2525
|
|
511 1.286 1.351 1.226 2853
|
|
512 1.256 1.255 1.253 2791
|
|
1023 1.207 1.184 1.180 2964
|
|
1024 1.204 1.146 1.174 2978
|
|
2047 1.134 1.126 1.152 3036
|
|
2048 1.134 1.123 1.149 3044
|
|
4095 1.124 1.108 1.138 3074
|
|
4096 1.117 1.107 1.136 3077
|
|
8191 1.106 1.103 1.102 3174
|
|
8192 1.105 1.102 1.267 2760
|
|
16383 1.110 1.103 1.099 3182
|
|
16384 1.108 1.100 1.098 3184
|
|
32767 1.101 1.097 1.126 3105
|
|
32768 1.128 1.130 1.126 3105
|
|
|
|
newlib memcmp() for #c per n where c ≈ 0.273ns
|
|
N x1 x8 x64 mBps
|
|
------------------------------------------------------------
|
|
1 73.000 39.625 36.297 96
|
|
1 35.000 35.375 35.328 99
|
|
2 41.500 19.438 18.508 189
|
|
3 29.667 13.542 13.005 269
|
|
4 22.750 10.656 10.332 338
|
|
7 14.714 6.875 6.248 560
|
|
8 18.125 6.453 5.846 598
|
|
15 11.533 3.575 3.547 986
|
|
16 8.062 3.461 2.880 1214
|
|
31 3.839 2.931 2.689 1300
|
|
32 5.594 1.848 1.589 2200
|
|
63 3.667 2.387 2.242 1560
|
|
64 2.078 1.170 0.842 4153
|
|
127 2.228 2.111 2.126 1644
|
|
128 1.617 0.669 0.510 6858
|
|
255 2.059 1.960 1.964 1781
|
|
256 0.590 0.398 0.335 10452
|
|
511 1.841 1.814 1.811 1931
|
|
512 0.373 0.275 0.252 13860
|
|
1023 1.788 1.748 2.426 1441
|
|
1024 0.261 0.230 0.226 15474
|
|
2047 1.745 1.731 1.774 1971
|
|
2048 0.218 0.199 0.197 17741
|
|
4095 1.771 1.764 1.763 1983
|
|
4096 0.187 0.177 0.181 19353
|
|
8191 1.722 1.714 1.714 2040
|
|
8192 0.173 0.174 0.173 20252
|
|
16383 1.754 1.754 1.845 1895
|
|
16384 0.175 0.171 0.169 20692
|
|
32767 1.753 1.753 1.753 1995
|
|
32768 0.186 0.173 0.170 20510 */
|