cosmopolitan/libc/intrin/memcmp.c
Justine Tunney 226aaf3547 Improve memory safety
This commit makes numerous refinements to cosmopolitan memory handling.

The default stack size has been reduced from 2mb to 128kb. A new macro
is now provided so you can easily reconfigure the stack size to be any
value you want. Work around the breaking change by adding to your main:

    STATIC_STACK_SIZE(0x00200000);  // 2mb stack

If you're not sure how much stack you need, then you can use:

    STATIC_YOINK("stack_usage_logging");

After which you can `sort -nr o/$MODE/stack.log`. Based on the unit test
suite, nothing in the Cosmopolitan repository (except for Python) needs
a stack size greater than 30kb. There are also new macros for detecting
the size and address of the stack at runtime, e.g. GetStackAddr(). We
also now support sigaltstack() so if you want to see nice looking crash
reports whenever a stack overflow happens, you can put this in main():

    ShowCrashReports();

Under `make MODE=dbg` and `make MODE=asan` the unit testing framework
will now automatically print backtraces of memory allocations when
things like memory leaks happen. Bugs are now fixed in ASAN global
variable overrun detection. The memtrack and asan runtimes also handle
edge cases now. The new tools helped to identify a few memory leaks,
which are fixed by this change.

This change should fix an issue reported in #288 with ARG_MAX limits.
Fixing this doubled the performance of MKDEPS.COM and AR.COM yet again.
2021-10-13 17:27:13 -07:00

197 lines
8.4 KiB
C

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/bits/likely.h"
#include "libc/dce.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#define PMOVMSKB(x) __builtin_ia32_pmovmskb128(x)
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
static noinline antiquity int memcmp_sse(const unsigned char *p,
const unsigned char *q, size_t n) {
uint64_t w;
unsigned u, u0, u1, u2, u3;
if (n > 32) {
while (n > 16 + 16) {
if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
n -= 16;
p += 16;
q += 16;
} else {
u = __builtin_ctzl(u);
return p[u] - q[u];
}
}
}
if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
if (!(u = PMOVMSKB(*(xmm_t *)(p + n - 16) == *(xmm_t *)(q + n - 16)) ^
0xffff)) {
return 0;
} else {
u = __builtin_ctzl(u);
return p[n - 16 + u] - q[n - 16 + u];
}
} else {
u = __builtin_ctzl(u);
return p[u] - q[u];
}
}
microarchitecture("avx") static int memcmp_avx(const unsigned char *p,
const unsigned char *q,
size_t n) {
uint64_t w;
unsigned u;
if (n > 32) {
while (n >= 16 + 64) {
w = (uint64_t)PMOVMSKB(((xmm_t *)p)[0] == ((xmm_t *)q)[0]) << 000 |
(uint64_t)PMOVMSKB(((xmm_t *)p)[1] == ((xmm_t *)q)[1]) << 020 |
(uint64_t)PMOVMSKB(((xmm_t *)p)[2] == ((xmm_t *)q)[2]) << 040 |
(uint64_t)PMOVMSKB(((xmm_t *)p)[3] == ((xmm_t *)q)[3]) << 060;
if (w == -1) {
n -= 64;
p += 64;
q += 64;
} else {
w = __builtin_ctzll(w ^ -1);
return p[w] - q[w];
}
}
while (n > 16 + 16) {
if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
n -= 16;
p += 16;
q += 16;
} else {
u = __builtin_ctzl(u);
return p[u] - q[u];
}
}
}
if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
if (!(u = PMOVMSKB(*(xmm_t *)(p + n - 16) == *(xmm_t *)(q + n - 16)) ^
0xffff)) {
return 0;
} else {
u = __builtin_ctzl(u);
return p[n - 16 + u] - q[n - 16 + u];
}
} else {
u = __builtin_ctzl(u);
return p[u] - q[u];
}
}
/**
* Compares memory byte by byte.
*
* memcmp n=0 992 picoseconds
* memcmp n=1 1 ns/byte 738 mb/s
* memcmp n=2 661 ps/byte 1,476 mb/s
* memcmp n=3 551 ps/byte 1,771 mb/s
* memcmp n=4 248 ps/byte 3,936 mb/s
* memcmp n=5 198 ps/byte 4,920 mb/s
* memcmp n=6 165 ps/byte 5,904 mb/s
* memcmp n=7 141 ps/byte 6,889 mb/s
* memcmp n=8 124 ps/byte 7,873 mb/s
* memcmp n=9 110 ps/byte 8,857 mb/s
* memcmp n=15 44 ps/byte 22,143 mb/s
* memcmp n=16 41 ps/byte 23,619 mb/s
* memcmp n=17 77 ps/byte 12,547 mb/s
* memcmp n=31 42 ps/byte 22,881 mb/s
* memcmp n=32 41 ps/byte 23,619 mb/s
* memcmp n=33 60 ps/byte 16,238 mb/s
* memcmp n=80 53 ps/byte 18,169 mb/s
* memcmp n=128 38 ps/byte 25,194 mb/s
* memcmp n=256 32 ps/byte 30,233 mb/s
* memcmp n=16384 27 ps/byte 35,885 mb/s
* memcmp n=32768 29 ps/byte 32,851 mb/s
* memcmp n=131072 33 ps/byte 28,983 mb/s
*
* @return unsigned char subtraction at stop index
* @asyncsignalsafe
*/
int memcmp(const void *a, const void *b, size_t n) {
int c;
unsigned u;
uint32_t k, i, j;
uint64_t w, x, y;
const unsigned char *p, *q;
if ((p = a) == (q = b) || !n) return 0;
if ((c = *p - *q)) return c;
if (!IsTiny()) {
if (n <= 16) {
if (n >= 8) {
if (!(w = (x = ((uint64_t)p[0] << 000 | (uint64_t)p[1] << 010 |
(uint64_t)p[2] << 020 | (uint64_t)p[3] << 030 |
(uint64_t)p[4] << 040 | (uint64_t)p[5] << 050 |
(uint64_t)p[6] << 060 | (uint64_t)p[7] << 070)) ^
(y = ((uint64_t)q[0] << 000 | (uint64_t)q[1] << 010 |
(uint64_t)q[2] << 020 | (uint64_t)q[3] << 030 |
(uint64_t)q[4] << 040 | (uint64_t)q[5] << 050 |
(uint64_t)q[6] << 060 | (uint64_t)q[7] << 070)))) {
p += n - 8;
q += n - 8;
if (!(w = (x = ((uint64_t)p[0] << 000 | (uint64_t)p[1] << 010 |
(uint64_t)p[2] << 020 | (uint64_t)p[3] << 030 |
(uint64_t)p[4] << 040 | (uint64_t)p[5] << 050 |
(uint64_t)p[6] << 060 | (uint64_t)p[7] << 070)) ^
(y = ((uint64_t)q[0] << 000 | (uint64_t)q[1] << 010 |
(uint64_t)q[2] << 020 | (uint64_t)q[3] << 030 |
(uint64_t)q[4] << 040 | (uint64_t)q[5] << 050 |
(uint64_t)q[6] << 060 | (uint64_t)q[7] << 070)))) {
return 0;
}
}
u = __builtin_ctzll(w);
u = u & -8;
return ((x >> u) & 255) - ((y >> u) & 255);
} else if (n >= 4) {
if (!(k = (i = ((uint32_t)p[0] << 000 | (uint32_t)p[1] << 010 |
(uint32_t)p[2] << 020 | (uint32_t)p[3] << 030)) ^
(j = ((uint32_t)q[0] << 000 | (uint32_t)q[1] << 010 |
(uint32_t)q[2] << 020 | (uint32_t)q[3] << 030)))) {
p += n - 4;
q += n - 4;
if (!(k = (i = ((uint32_t)p[0] << 000 | (uint32_t)p[1] << 010 |
(uint32_t)p[2] << 020 | (uint32_t)p[3] << 030)) ^
(j = ((uint32_t)q[0] << 000 | (uint32_t)q[1] << 010 |
(uint32_t)q[2] << 020 | (uint32_t)q[3] << 030)))) {
return 0;
}
}
u = __builtin_ctzl(k);
u = u & -8;
return ((i >> u) & 255) - ((j >> u) & 255);
}
} else if (LIKELY(X86_HAVE(AVX))) {
return memcmp_avx(p, q, n);
} else {
return memcmp_sse(p, q, n);
}
}
for (; n; ++p, ++q, --n) {
if ((c = *p - *q)) {
return c;
}
}
return 0;
}