cosmopolitan/libc/intrin/memcmp.c
Justine Tunney 39bf41f4eb Make numerous improvements
- Python static hello world now 1.8mb
- Python static fully loaded now 10mb
- Python HTTPS client now uses MbedTLS
- Python REPL now completes import stmts
- Increase stack size for Python for now
- Begin synthesizing posixpath and ntpath
- Restore Python \N{UNICODE NAME} support
- Restore Python NFKD symbol normalization
- Add optimized code path for Intel SHA-NI
- Get more Python unit tests passing faster
- Get Python help() pagination working on NT
- Python hashlib now supports MbedTLS PBKDF2
- Make memcpy/memmove/memcmp/bcmp/etc. faster
- Add Mersenne Twister and Vigna to LIBC_RAND
- Provide privileged __printf() for error code
- Fix zipos opendir() so that it reports ENOTDIR
- Add basic chmod() implementation for Windows NT
- Add Cosmo's best functions to Python cosmo module
- Pin function trace indent depth to that of caller
- Show memory diagram on invalid access in MODE=dbg
- Differentiate stack overflow on crash in MODE=dbg
- Add stb_truetype and tools for analyzing font files
- Upgrade to UNICODE 13 and reduce its binary footprint
- COMPILE.COM now logs resource usage of build commands
- Start implementing basic poll() support on bare metal
- Set getauxval(AT_EXECFN) to GetModuleFileName() on NT
- Add descriptions to strerror() in non-TINY build modes
- Add COUNTBRANCH() macro to help with micro-optimizations
- Make error / backtrace / asan / memory code more unbreakable
- Add fast perfect C implementation of μ-Law and a-Law audio codecs
- Make strtol() functions consistent with other libc implementations
- Improve Linenoise implementation (see also github.com/jart/bestline)
- COMPILE.COM now suppresses stdout/stderr of successful build commands
2021-09-28 01:52:34 -07:00

210 lines
9.1 KiB
C

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/bits/likely.h"
#include "libc/dce.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
/* Sixteen-byte vector of char declared with alignment 1, so that
   dereferencing a pointer cast to this type does not assume the
   address is 16-byte aligned. */
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
/**
 * Compares memory sixteen bytes at a time using 128-bit vector compares.
 *
 * The closing step reads the first sixteen and the last sixteen bytes of
 * whatever remains (the two reads may overlap), so the caller must
 * guarantee `n >= 16`.
 *
 * @param p is first buffer
 * @param q is second buffer
 * @param n is number of bytes to compare, which must be at least 16
 * @return unsigned char subtraction at first differing index, or 0 if
 *     the buffers are equal
 */
static noinline antiquity int memcmp_sse(const unsigned char *p,
                                         const unsigned char *q, size_t n) {
  unsigned u;
  /* consume one vector per iteration until at most 32 bytes remain */
  while (n > 16 + 16) {
    /* the byte mask has one bit per equal byte, so 0xffff means all
       sixteen matched; subtracting 0xffff yields zero in that case and
       otherwise a value whose lowest set bit marks the first mismatch */
    u = __builtin_ia32_pmovmskb128(__builtin_ia32_pcmpeqb128(
            *(const xmm_t *)p, *(const xmm_t *)q)) -
        0xffff;
    if (u) {
      u = __builtin_ctzl(u);
      return p[u] - q[u];
    }
    n -= 16;
    p += 16;
    q += 16;
  }
  /* leading sixteen bytes of the remainder */
  u = __builtin_ia32_pmovmskb128(__builtin_ia32_pcmpeqb128(
          *(const xmm_t *)p, *(const xmm_t *)q)) -
      0xffff;
  if (u) {
    u = __builtin_ctzl(u);
    return p[u] - q[u];
  }
  /* trailing sixteen bytes of the remainder */
  u = __builtin_ia32_pmovmskb128(__builtin_ia32_pcmpeqb128(
          *(const xmm_t *)(p + n - 16), *(const xmm_t *)(q + n - 16))) -
      0xffff;
  if (u) {
    u = __builtin_ctzl(u);
    return p[n - 16 + u] - q[n - 16 + u];
  }
  return 0;
}
/**
 * Compares memory using 128-bit vector compares, four vectors per
 * iteration while at least 80 bytes remain, then one vector per
 * iteration, then a final pair of possibly overlapping vector compares.
 *
 * The final pair reads the first and last sixteen bytes of what remains,
 * so the caller must guarantee `n >= 16`.
 *
 * @param p is first buffer
 * @param q is second buffer
 * @param n is number of bytes to compare, which must be at least 16
 * @return unsigned char subtraction at first differing index, or 0 if
 *     the buffers are equal
 */
microarchitecture("avx") static int memcmp_avx(const unsigned char *p,
                                               const unsigned char *q,
                                               size_t n) {
  uint64_t eq64;
  unsigned miss, e0, e1, e2, e3;
  /* 64 bytes per step: gather four 16-bit equality masks into one word */
  for (; n >= 16 + 64; n -= 64, p += 64, q += 64) {
    e0 = __builtin_ia32_pmovmskb128(__builtin_ia32_pcmpeqb128(
        ((const xmm_t *)p)[0], ((const xmm_t *)q)[0]));
    e1 = __builtin_ia32_pmovmskb128(__builtin_ia32_pcmpeqb128(
        ((const xmm_t *)p)[1], ((const xmm_t *)q)[1]));
    e2 = __builtin_ia32_pmovmskb128(__builtin_ia32_pcmpeqb128(
        ((const xmm_t *)p)[2], ((const xmm_t *)q)[2]));
    e3 = __builtin_ia32_pmovmskb128(__builtin_ia32_pcmpeqb128(
        ((const xmm_t *)p)[3], ((const xmm_t *)q)[3]));
    eq64 = (uint64_t)e0 | (uint64_t)e1 << 16 | (uint64_t)e2 << 32 |
           (uint64_t)e3 << 48;
    /* a clear bit marks a differing byte; the lowest one comes first */
    if (~eq64) {
      eq64 = __builtin_ctzll(~eq64);
      return p[eq64] - q[eq64];
    }
  }
  /* 16 bytes per step until at most 32 bytes remain */
  for (; n > 16 + 16; n -= 16, p += 16, q += 16) {
    miss = __builtin_ia32_pmovmskb128(__builtin_ia32_pcmpeqb128(
               *(const xmm_t *)p, *(const xmm_t *)q)) -
           0xffff;
    if (miss) {
      miss = __builtin_ctzl(miss);
      return p[miss] - q[miss];
    }
  }
  /* leading sixteen bytes of the remainder */
  miss = __builtin_ia32_pmovmskb128(__builtin_ia32_pcmpeqb128(
             *(const xmm_t *)p, *(const xmm_t *)q)) -
         0xffff;
  if (miss) {
    miss = __builtin_ctzl(miss);
    return p[miss] - q[miss];
  }
  /* trailing sixteen bytes of the remainder */
  miss = __builtin_ia32_pmovmskb128(__builtin_ia32_pcmpeqb128(
             *(const xmm_t *)(p + n - 16), *(const xmm_t *)(q + n - 16))) -
         0xffff;
  if (!miss) {
    return 0;
  }
  miss = __builtin_ctzl(miss);
  return p[n - 16 + miss] - q[n - 16 + miss];
}
/* Assembles eight bytes into a little-endian 64-bit word, reading one
   byte at a time so no alignment is assumed. */
static inline uint64_t memcmp_load64le(const unsigned char *s) {
  return (uint64_t)s[0] << 000 | (uint64_t)s[1] << 010 |
         (uint64_t)s[2] << 020 | (uint64_t)s[3] << 030 |
         (uint64_t)s[4] << 040 | (uint64_t)s[5] << 050 |
         (uint64_t)s[6] << 060 | (uint64_t)s[7] << 070;
}

/* Assembles four bytes into a little-endian 32-bit word, reading one
   byte at a time so no alignment is assumed. */
static inline uint32_t memcmp_load32le(const unsigned char *s) {
  return (uint32_t)s[0] << 000 | (uint32_t)s[1] << 010 |
         (uint32_t)s[2] << 020 | (uint32_t)s[3] << 030;
}

/* Returns the difference between the lowest-order bytes at which two
   little-endian words differ. Requires x != y. The subtraction is done
   on int operands so a negative result never relies on narrowing an
   out-of-range unsigned value. */
static inline int memcmp_bytediff(uint64_t x, uint64_t y) {
  unsigned shift;
  shift = __builtin_ctzll(x ^ y);
  shift &= -8; /* round down to the start of the differing byte */
  return (int)((x >> shift) & 255) - (int)((y >> shift) & 255);
}

/**
 * Compares memory byte by byte.
 *
 *     memcmp n=0                         992 picoseconds
 *     memcmp n=1                           1 ns/byte            738 mb/s
 *     memcmp n=2                         661 ps/byte          1,476 mb/s
 *     memcmp n=3                         551 ps/byte          1,771 mb/s
 *     memcmp n=4                         248 ps/byte          3,936 mb/s
 *     memcmp n=5                         198 ps/byte          4,920 mb/s
 *     memcmp n=6                         165 ps/byte          5,904 mb/s
 *     memcmp n=7                         141 ps/byte          6,889 mb/s
 *     memcmp n=8                         124 ps/byte          7,873 mb/s
 *     memcmp n=9                         110 ps/byte          8,857 mb/s
 *     memcmp n=15                         44 ps/byte         22,143 mb/s
 *     memcmp n=16                         41 ps/byte         23,619 mb/s
 *     memcmp n=17                         77 ps/byte         12,547 mb/s
 *     memcmp n=31                         42 ps/byte         22,881 mb/s
 *     memcmp n=32                         41 ps/byte         23,619 mb/s
 *     memcmp n=33                         60 ps/byte         16,238 mb/s
 *     memcmp n=80                         53 ps/byte         18,169 mb/s
 *     memcmp n=128                        38 ps/byte         25,194 mb/s
 *     memcmp n=256                        32 ps/byte         30,233 mb/s
 *     memcmp n=16384                      27 ps/byte         35,885 mb/s
 *     memcmp n=32768                      29 ps/byte         32,851 mb/s
 *     memcmp n=131072                     33 ps/byte         28,983 mb/s
 *
 * @param a is first buffer
 * @param b is second buffer
 * @param n is number of bytes to compare
 * @return unsigned char subtraction at stop index
 * @asyncsignalsafe
 */
int memcmp(const void *a, const void *b, size_t n) {
  int c;
  uint32_t i, j;
  uint64_t x, y;
  const unsigned char *p, *q;
  /* identical pointers trivially compare equal */
  if ((p = a) == (q = b)) return 0;
  if (!IsTiny()) {
    if (n > 16) {
      /* the vector helpers need at least sixteen bytes */
      if (LIKELY(X86_HAVE(AVX))) {
        return memcmp_avx(p, q, n);
      } else {
        return memcmp_sse(p, q, n);
      }
    }
    if (n >= 8) {
      /* 8..16 bytes: compare the first eight, then the last eight;
         the two windows may overlap */
      x = memcmp_load64le(p);
      y = memcmp_load64le(q);
      if (x == y) {
        x = memcmp_load64le(p + n - 8);
        y = memcmp_load64le(q + n - 8);
        if (x == y) return 0;
      }
      return memcmp_bytediff(x, y);
    }
    if (n >= 4) {
      /* 4..7 bytes: same idea using four-byte windows */
      i = memcmp_load32le(p);
      j = memcmp_load32le(q);
      if (i == j) {
        i = memcmp_load32le(p + n - 4);
        j = memcmp_load32le(q + n - 4);
        if (i == j) return 0;
      }
      return memcmp_bytediff(i, j);
    }
  }
  /* tiny builds and n < 4 use the plain byte loop */
  for (; n; ++p, ++q, --n) {
    if ((c = *p - *q)) {
      return c;
    }
  }
  return 0;
}