Make memcmp() and memchr() go fast again

Readahead within the specified size is legal, even if it overlaps a page
boundary; it's the fault of the caller if that causes a segfault.
This commit is contained in:
Justine Tunney 2023-11-29 05:17:21 -08:00
parent 70155df7a9
commit ff955aaa01
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
4 changed files with 137 additions and 49 deletions

View file

@ -17,11 +17,12 @@
PERFORMANCE OF THIS SOFTWARE. PERFORMANCE OF THIS SOFTWARE.
*/ */
#include "libc/dce.h" #include "libc/dce.h"
#include "libc/intrin/asan.internal.h"
#include "libc/nexgen32e/x86feature.h" #include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h" #include "libc/str/str.h"
#ifndef __aarch64__ #ifndef __aarch64__
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16))); typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
static inline const unsigned char *memchr_pure(const unsigned char *s, static inline const unsigned char *memchr_pure(const unsigned char *s,
unsigned char c, size_t n) { unsigned char c, size_t n) {
@ -69,15 +70,8 @@ static inline const unsigned char *memchr_sse(const unsigned char *s,
void *memchr(const void *s, int c, size_t n) { void *memchr(const void *s, int c, size_t n) {
#if defined(__x86_64__) && !defined(__chibicc__) #if defined(__x86_64__) && !defined(__chibicc__)
const void *r; const void *r;
const unsigned char *p = (const unsigned char *)s; if (IsAsan()) __asan_verify(s, n);
while (n && ((intptr_t)p & 15)) { r = memchr_sse(s, c, n);
if (*p == (unsigned char)c) {
return (void *)p;
}
++p;
--n;
}
r = memchr_sse(p, c, n);
return (void *)r; return (void *)r;
#else #else
return (void *)memchr_pure(s, c, n); return (void *)memchr_pure(s, c, n);

View file

@ -21,8 +21,88 @@
#include "libc/nexgen32e/x86feature.h" #include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h" #include "libc/str/str.h"
#define PMOVMSKB(x) __builtin_ia32_pmovmskb128(x)
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1))); typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
#if defined(__x86_64__) && !defined(__chibicc__)
/**
 * Compares two byte ranges using 16-byte vector loads.
 *
 * Every step loads a full 16 bytes from both inputs, and the final step
 * loads the 16 bytes that end exactly at `p + n` and `q + n`; the caller
 * must therefore pass `n >= 16`. The last two loads may overlap each
 * other, which is harmless since overlapping bytes were already found
 * to be equal.
 *
 * @param p is the first byte range
 * @param q is the second byte range
 * @param n is the number of bytes to compare
 * @return zero if all `n` bytes match, otherwise the first mismatching
 *     byte of `p` minus the corresponding byte of `q`
 */
static dontinline antiquity int memcmp_sse(const unsigned char *p,
                                           const unsigned char *q, size_t n) {
  unsigned diff;

  // Consume one vector at a time until at most two vectors' worth of
  // bytes are left for the head/tail comparison below.
  for (; n > 16 + 16; n -= 16, p += 16, q += 16) {
    // Each mask bit is set for an equal byte; xor flips it so that set
    // bits mark mismatches.
    diff = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff;
    if (diff) {
      diff = __builtin_ctzl(diff);  // index of lowest mismatching byte
      return p[diff] - q[diff];
    }
  }

  // Leading 16 bytes of what remains.
  diff = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff;
  if (diff) {
    diff = __builtin_ctzl(diff);
    return p[diff] - q[diff];
  }

  // Trailing 16 bytes, anchored to the end of the range.
  diff = PMOVMSKB(*(xmm_t *)(p + n - 16) == *(xmm_t *)(q + n - 16)) ^ 0xffff;
  if (!diff) {
    return 0;
  }
  diff = __builtin_ctzl(diff);
  return p[n - 16 + diff] - q[n - 16 + diff];
}
/**
 * Compares two byte ranges, checking 64 bytes per step on long inputs.
 *
 * Inputs of 80 bytes or more are first scanned four 16-byte vectors at
 * a time, then one vector at a time, and finally the head and tail
 * vectors of the remainder are compared. The tail load covers the 16
 * bytes ending exactly at `p + n` and `q + n`, so the caller must pass
 * `n >= 16`.
 *
 * NOTE(review): `_Microarchitecture("avx")` presumably builds this
 * function for AVX-capable targets only; confirm against the macro's
 * definition before calling it without a feature check.
 *
 * @param p is the first byte range
 * @param q is the second byte range
 * @param n is the number of bytes to compare
 * @return zero if all `n` bytes match, otherwise the first mismatching
 *     byte of `p` minus the corresponding byte of `q`
 */
_Microarchitecture("avx") static int memcmp_avx(const unsigned char *p,
                                                const unsigned char *q,
                                                size_t n) {
  unsigned diff;
  uint64_t same;

  // Wide loop: pack four 16-bit equality masks into one 64-bit word,
  // where bit i is set when byte i of both inputs is equal.
  for (; n >= 16 + 64; n -= 64, p += 64, q += 64) {
    xmm_t *a = (xmm_t *)p;
    xmm_t *b = (xmm_t *)q;
    same = (uint64_t)PMOVMSKB(a[0] == b[0]) << 0 |
           (uint64_t)PMOVMSKB(a[1] == b[1]) << 16 |
           (uint64_t)PMOVMSKB(a[2] == b[2]) << 32 |
           (uint64_t)PMOVMSKB(a[3] == b[3]) << 48;
    if (same != ~(uint64_t)0) {
      // Invert so set bits mark mismatches, then take the lowest one.
      same = __builtin_ctzll(~same);
      return p[same] - q[same];
    }
  }

  // Narrow loop: one vector per step until at most 32 bytes remain.
  for (; n > 16 + 16; n -= 16, p += 16, q += 16) {
    diff = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff;
    if (diff) {
      diff = __builtin_ctzl(diff);
      return p[diff] - q[diff];
    }
  }

  // Leading 16 bytes of what remains.
  diff = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff;
  if (diff) {
    diff = __builtin_ctzl(diff);
    return p[diff] - q[diff];
  }

  // Trailing 16 bytes, anchored to the end of the range; may overlap
  // the head load, which has already been found equal.
  diff = PMOVMSKB(*(xmm_t *)(p + n - 16) == *(xmm_t *)(q + n - 16)) ^ 0xffff;
  if (!diff) {
    return 0;
  }
  diff = __builtin_ctzl(diff);
  return p[n - 16 + diff] - q[n - 16 + diff];
}
#endif /* __x86_64__ */
/** /**
* Compares memory byte by byte. * Compares memory byte by byte.
* *
@ -57,21 +137,64 @@ typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
*/ */
int memcmp(const void *a, const void *b, size_t n) { int memcmp(const void *a, const void *b, size_t n) {
int c; int c;
#if defined(__x86_64__) && !defined(__chibicc__)
unsigned u;
uint32_t k, i, j;
uint64_t w, x, y;
#endif
const unsigned char *p, *q; const unsigned char *p, *q;
if ((p = a) == (q = b) || !n) return 0; if ((p = a) == (q = b) || !n) return 0;
if ((c = *p - *q)) return c; if ((c = *p - *q)) return c;
#if defined(__x86_64__) && !defined(__chibicc__) #if defined(__x86_64__) && !defined(__chibicc__)
unsigned u; if (!IsTiny()) {
while (n >= 16 && (((uintptr_t)p & 0xfff) <= 0x1000 - 16 && if (n <= 16) {
((uintptr_t)q & 0xfff) <= 0x1000 - 16)) { if (n >= 8) {
if (!(u = __builtin_ia32_pmovmskb128(*(xmm_t *)p == *(xmm_t *)q) ^ if (!(w = (x = ((uint64_t)p[0] << 000 | (uint64_t)p[1] << 010 |
0xffff)) { (uint64_t)p[2] << 020 | (uint64_t)p[3] << 030 |
n -= 16; (uint64_t)p[4] << 040 | (uint64_t)p[5] << 050 |
p += 16; (uint64_t)p[6] << 060 | (uint64_t)p[7] << 070)) ^
q += 16; (y = ((uint64_t)q[0] << 000 | (uint64_t)q[1] << 010 |
(uint64_t)q[2] << 020 | (uint64_t)q[3] << 030 |
(uint64_t)q[4] << 040 | (uint64_t)q[5] << 050 |
(uint64_t)q[6] << 060 | (uint64_t)q[7] << 070)))) {
p += n - 8;
q += n - 8;
if (!(w = (x = ((uint64_t)p[0] << 000 | (uint64_t)p[1] << 010 |
(uint64_t)p[2] << 020 | (uint64_t)p[3] << 030 |
(uint64_t)p[4] << 040 | (uint64_t)p[5] << 050 |
(uint64_t)p[6] << 060 | (uint64_t)p[7] << 070)) ^
(y = ((uint64_t)q[0] << 000 | (uint64_t)q[1] << 010 |
(uint64_t)q[2] << 020 | (uint64_t)q[3] << 030 |
(uint64_t)q[4] << 040 | (uint64_t)q[5] << 050 |
(uint64_t)q[6] << 060 | (uint64_t)q[7] << 070)))) {
return 0;
}
}
u = __builtin_ctzll(w);
u = u & -8;
return ((x >> u) & 255) - ((y >> u) & 255);
} else if (n >= 4) {
if (!(k = (i = ((uint32_t)p[0] << 000 | (uint32_t)p[1] << 010 |
(uint32_t)p[2] << 020 | (uint32_t)p[3] << 030)) ^
(j = ((uint32_t)q[0] << 000 | (uint32_t)q[1] << 010 |
(uint32_t)q[2] << 020 | (uint32_t)q[3] << 030)))) {
p += n - 4;
q += n - 4;
if (!(k = (i = ((uint32_t)p[0] << 000 | (uint32_t)p[1] << 010 |
(uint32_t)p[2] << 020 | (uint32_t)p[3] << 030)) ^
(j = ((uint32_t)q[0] << 000 | (uint32_t)q[1] << 010 |
(uint32_t)q[2] << 020 | (uint32_t)q[3] << 030)))) {
return 0;
}
}
u = __builtin_ctzl(k);
u = u & -8;
return ((i >> u) & 255) - ((j >> u) & 255);
}
} else if (LIKELY(X86_HAVE(AVX))) {
return memcmp_avx(p, q, n);
} else { } else {
u = __builtin_ctzl(u); return memcmp_sse(p, q, n);
return p[u] - q[u];
} }
} }
#endif /* __x86_64__ */ #endif /* __x86_64__ */

View file

@ -16,24 +16,10 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE. PERFORMANCE OF THIS SOFTWARE.
*/ */
#include "libc/runtime/runtime.h"
#include "libc/runtime/sysconf.h"
#include "libc/str/str.h" #include "libc/str/str.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/prot.h"
#include "libc/testlib/testlib.h" #include "libc/testlib/testlib.h"
TEST(memchr, test) { TEST(memchr, test) {
const char *s = "hello"; const char *s = "hello";
ASSERT_EQ(s + 1, memchr(s, 'e', 5)); ASSERT_EQ(s + 1, memchr(s, 'e', 5));
} }
// Page-boundary stress test for memchr().
//
// Maps two pages and revokes all access to the second one. The string
// "12345678" is then copied so that its terminating NUL lands on the very
// last byte of the first (accessible) page. The search is given a length
// of 79, which extends well into the PROT_NONE page, and is expected to
// return the address of that NUL.
TEST(memchr, pageOverlapTorture) {
long pagesz = sysconf(_SC_PAGESIZE);
// NOTE(review): the mmap() result is used without a failure check.
char *map = mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
// Second page becomes the inaccessible guard region.
ASSERT_SYS(0, 0, mprotect(map + pagesz, pagesz, PROT_NONE));
// 8 characters + NUL occupy the final 9 bytes of the first page.
strcpy(map + pagesz - 9, "12345678");
// Expected hit is the NUL at the last accessible byte.
EXPECT_EQ(map + pagesz - 1, memchr(map + pagesz - 9, 0, 79));
EXPECT_SYS(0, 0, munmap(map, pagesz * 2));
}

View file

@ -113,21 +113,6 @@ TEST(memcmp, fuzz) {
} }
} }
// Page-boundary stress test for memcmp().
//
// Creates two independent two-page mappings, each with its second page
// made inaccessible. The strings "12345678" and "12345679" are copied so
// that each terminating NUL is the last byte of its first page. The
// comparison is given a length of 79, which extends into the PROT_NONE
// pages, and is expected to report the first buffer as smaller because
// the strings differ in their eighth character ('8' vs '9').
TEST(memcmp, pageOverlapTorture) {
long pagesz = sysconf(_SC_PAGESIZE);
// NOTE(review): neither mmap() result is checked for failure.
char *map = mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
char *map2 = mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
// Second page of each mapping becomes the inaccessible guard region.
ASSERT_SYS(0, 0, mprotect(map + pagesz, pagesz, PROT_NONE));
ASSERT_SYS(0, 0, mprotect(map2 + pagesz, pagesz, PROT_NONE));
// 8 characters + NUL occupy the final 9 bytes of each first page.
strcpy(map + pagesz - 9, "12345678");
strcpy(map2 + pagesz - 9, "12345679");
EXPECT_LT(memcmp(map + pagesz - 9, map2 + pagesz - 9, 79), 0);
// Unmap in reverse order of creation.
EXPECT_SYS(0, 0, munmap(map2, pagesz * 2));
EXPECT_SYS(0, 0, munmap(map, pagesz * 2));
}
int buncmp(const void *, const void *, size_t) asm("bcmp"); int buncmp(const void *, const void *, size_t) asm("bcmp");
int funcmp(const void *, const void *, size_t) asm("memcmp"); int funcmp(const void *, const void *, size_t) asm("memcmp");