mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-01-31 03:27:39 +00:00
Make memcmp() and memchr() go fast again
Readahead within the specified size is legal, even if it overlaps a page boundary; it's the fault of the caller if that causes a segfault.
This commit is contained in:
parent
70155df7a9
commit
ff955aaa01
4 changed files with 137 additions and 49 deletions
|
@ -17,11 +17,12 @@
|
|||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/dce.h"
|
||||
#include "libc/intrin/asan.internal.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/str/str.h"
|
||||
#ifndef __aarch64__
|
||||
|
||||
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16)));
|
||||
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
||||
|
||||
static inline const unsigned char *memchr_pure(const unsigned char *s,
|
||||
unsigned char c, size_t n) {
|
||||
|
@ -69,15 +70,8 @@ static inline const unsigned char *memchr_sse(const unsigned char *s,
|
|||
void *memchr(const void *s, int c, size_t n) {
|
||||
#if defined(__x86_64__) && !defined(__chibicc__)
|
||||
const void *r;
|
||||
const unsigned char *p = (const unsigned char *)s;
|
||||
while (n && ((intptr_t)p & 15)) {
|
||||
if (*p == (unsigned char)c) {
|
||||
return (void *)p;
|
||||
}
|
||||
++p;
|
||||
--n;
|
||||
}
|
||||
r = memchr_sse(p, c, n);
|
||||
if (IsAsan()) __asan_verify(s, n);
|
||||
r = memchr_sse(s, c, n);
|
||||
return (void *)r;
|
||||
#else
|
||||
return (void *)memchr_pure(s, c, n);
|
||||
|
|
|
@ -21,8 +21,88 @@
|
|||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/str/str.h"
|
||||
|
||||
#define PMOVMSKB(x) __builtin_ia32_pmovmskb128(x)
|
||||
|
||||
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
||||
|
||||
#if defined(__x86_64__) && !defined(__chibicc__)
|
||||
|
||||
static dontinline antiquity int memcmp_sse(const unsigned char *p,
|
||||
const unsigned char *q, size_t n) {
|
||||
unsigned u;
|
||||
if (n > 32) {
|
||||
while (n > 16 + 16) {
|
||||
if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
|
||||
n -= 16;
|
||||
p += 16;
|
||||
q += 16;
|
||||
} else {
|
||||
u = __builtin_ctzl(u);
|
||||
return p[u] - q[u];
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
|
||||
if (!(u = PMOVMSKB(*(xmm_t *)(p + n - 16) == *(xmm_t *)(q + n - 16)) ^
|
||||
0xffff)) {
|
||||
return 0;
|
||||
} else {
|
||||
u = __builtin_ctzl(u);
|
||||
return p[n - 16 + u] - q[n - 16 + u];
|
||||
}
|
||||
} else {
|
||||
u = __builtin_ctzl(u);
|
||||
return p[u] - q[u];
|
||||
}
|
||||
}
|
||||
|
||||
_Microarchitecture("avx") static int memcmp_avx(const unsigned char *p,
|
||||
const unsigned char *q,
|
||||
size_t n) {
|
||||
uint64_t w;
|
||||
unsigned u;
|
||||
if (n > 32) {
|
||||
while (n >= 16 + 64) {
|
||||
w = (uint64_t)PMOVMSKB(((xmm_t *)p)[0] == ((xmm_t *)q)[0]) << 000 |
|
||||
(uint64_t)PMOVMSKB(((xmm_t *)p)[1] == ((xmm_t *)q)[1]) << 020 |
|
||||
(uint64_t)PMOVMSKB(((xmm_t *)p)[2] == ((xmm_t *)q)[2]) << 040 |
|
||||
(uint64_t)PMOVMSKB(((xmm_t *)p)[3] == ((xmm_t *)q)[3]) << 060;
|
||||
if (w == -1) {
|
||||
n -= 64;
|
||||
p += 64;
|
||||
q += 64;
|
||||
} else {
|
||||
w = __builtin_ctzll(w ^ -1);
|
||||
return p[w] - q[w];
|
||||
}
|
||||
}
|
||||
while (n > 16 + 16) {
|
||||
if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
|
||||
n -= 16;
|
||||
p += 16;
|
||||
q += 16;
|
||||
} else {
|
||||
u = __builtin_ctzl(u);
|
||||
return p[u] - q[u];
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
|
||||
if (!(u = PMOVMSKB(*(xmm_t *)(p + n - 16) == *(xmm_t *)(q + n - 16)) ^
|
||||
0xffff)) {
|
||||
return 0;
|
||||
} else {
|
||||
u = __builtin_ctzl(u);
|
||||
return p[n - 16 + u] - q[n - 16 + u];
|
||||
}
|
||||
} else {
|
||||
u = __builtin_ctzl(u);
|
||||
return p[u] - q[u];
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
/**
|
||||
* Compares memory byte by byte.
|
||||
*
|
||||
|
@ -57,21 +137,64 @@ typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
|||
*/
|
||||
int memcmp(const void *a, const void *b, size_t n) {
|
||||
int c;
|
||||
#if defined(__x86_64__) && !defined(__chibicc__)
|
||||
unsigned u;
|
||||
uint32_t k, i, j;
|
||||
uint64_t w, x, y;
|
||||
#endif
|
||||
const unsigned char *p, *q;
|
||||
if ((p = a) == (q = b) || !n) return 0;
|
||||
if ((c = *p - *q)) return c;
|
||||
#if defined(__x86_64__) && !defined(__chibicc__)
|
||||
unsigned u;
|
||||
while (n >= 16 && (((uintptr_t)p & 0xfff) <= 0x1000 - 16 &&
|
||||
((uintptr_t)q & 0xfff) <= 0x1000 - 16)) {
|
||||
if (!(u = __builtin_ia32_pmovmskb128(*(xmm_t *)p == *(xmm_t *)q) ^
|
||||
0xffff)) {
|
||||
n -= 16;
|
||||
p += 16;
|
||||
q += 16;
|
||||
if (!IsTiny()) {
|
||||
if (n <= 16) {
|
||||
if (n >= 8) {
|
||||
if (!(w = (x = ((uint64_t)p[0] << 000 | (uint64_t)p[1] << 010 |
|
||||
(uint64_t)p[2] << 020 | (uint64_t)p[3] << 030 |
|
||||
(uint64_t)p[4] << 040 | (uint64_t)p[5] << 050 |
|
||||
(uint64_t)p[6] << 060 | (uint64_t)p[7] << 070)) ^
|
||||
(y = ((uint64_t)q[0] << 000 | (uint64_t)q[1] << 010 |
|
||||
(uint64_t)q[2] << 020 | (uint64_t)q[3] << 030 |
|
||||
(uint64_t)q[4] << 040 | (uint64_t)q[5] << 050 |
|
||||
(uint64_t)q[6] << 060 | (uint64_t)q[7] << 070)))) {
|
||||
p += n - 8;
|
||||
q += n - 8;
|
||||
if (!(w = (x = ((uint64_t)p[0] << 000 | (uint64_t)p[1] << 010 |
|
||||
(uint64_t)p[2] << 020 | (uint64_t)p[3] << 030 |
|
||||
(uint64_t)p[4] << 040 | (uint64_t)p[5] << 050 |
|
||||
(uint64_t)p[6] << 060 | (uint64_t)p[7] << 070)) ^
|
||||
(y = ((uint64_t)q[0] << 000 | (uint64_t)q[1] << 010 |
|
||||
(uint64_t)q[2] << 020 | (uint64_t)q[3] << 030 |
|
||||
(uint64_t)q[4] << 040 | (uint64_t)q[5] << 050 |
|
||||
(uint64_t)q[6] << 060 | (uint64_t)q[7] << 070)))) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
u = __builtin_ctzll(w);
|
||||
u = u & -8;
|
||||
return ((x >> u) & 255) - ((y >> u) & 255);
|
||||
} else if (n >= 4) {
|
||||
if (!(k = (i = ((uint32_t)p[0] << 000 | (uint32_t)p[1] << 010 |
|
||||
(uint32_t)p[2] << 020 | (uint32_t)p[3] << 030)) ^
|
||||
(j = ((uint32_t)q[0] << 000 | (uint32_t)q[1] << 010 |
|
||||
(uint32_t)q[2] << 020 | (uint32_t)q[3] << 030)))) {
|
||||
p += n - 4;
|
||||
q += n - 4;
|
||||
if (!(k = (i = ((uint32_t)p[0] << 000 | (uint32_t)p[1] << 010 |
|
||||
(uint32_t)p[2] << 020 | (uint32_t)p[3] << 030)) ^
|
||||
(j = ((uint32_t)q[0] << 000 | (uint32_t)q[1] << 010 |
|
||||
(uint32_t)q[2] << 020 | (uint32_t)q[3] << 030)))) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
u = __builtin_ctzl(k);
|
||||
u = u & -8;
|
||||
return ((i >> u) & 255) - ((j >> u) & 255);
|
||||
}
|
||||
} else if (LIKELY(X86_HAVE(AVX))) {
|
||||
return memcmp_avx(p, q, n);
|
||||
} else {
|
||||
u = __builtin_ctzl(u);
|
||||
return p[u] - q[u];
|
||||
return memcmp_sse(p, q, n);
|
||||
}
|
||||
}
|
||||
#endif /* __x86_64__ */
|
||||
|
|
|
@ -16,24 +16,10 @@
|
|||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/runtime/runtime.h"
|
||||
#include "libc/runtime/sysconf.h"
|
||||
#include "libc/str/str.h"
|
||||
#include "libc/sysv/consts/map.h"
|
||||
#include "libc/sysv/consts/prot.h"
|
||||
#include "libc/testlib/testlib.h"
|
||||
|
||||
TEST(memchr, test) {
|
||||
const char *s = "hello";
|
||||
ASSERT_EQ(s + 1, memchr(s, 'e', 5));
|
||||
}
|
||||
|
||||
TEST(memchr, pageOverlapTorture) {
|
||||
long pagesz = sysconf(_SC_PAGESIZE);
|
||||
char *map = mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
|
||||
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
|
||||
ASSERT_SYS(0, 0, mprotect(map + pagesz, pagesz, PROT_NONE));
|
||||
strcpy(map + pagesz - 9, "12345678");
|
||||
EXPECT_EQ(map + pagesz - 1, memchr(map + pagesz - 9, 0, 79));
|
||||
EXPECT_SYS(0, 0, munmap(map, pagesz * 2));
|
||||
}
|
||||
|
|
|
@ -113,21 +113,6 @@ TEST(memcmp, fuzz) {
|
|||
}
|
||||
}
|
||||
|
||||
TEST(memcmp, pageOverlapTorture) {
|
||||
long pagesz = sysconf(_SC_PAGESIZE);
|
||||
char *map = mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
|
||||
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
|
||||
char *map2 = mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
|
||||
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
|
||||
ASSERT_SYS(0, 0, mprotect(map + pagesz, pagesz, PROT_NONE));
|
||||
ASSERT_SYS(0, 0, mprotect(map2 + pagesz, pagesz, PROT_NONE));
|
||||
strcpy(map + pagesz - 9, "12345678");
|
||||
strcpy(map2 + pagesz - 9, "12345679");
|
||||
EXPECT_LT(memcmp(map + pagesz - 9, map2 + pagesz - 9, 79), 0);
|
||||
EXPECT_SYS(0, 0, munmap(map2, pagesz * 2));
|
||||
EXPECT_SYS(0, 0, munmap(map, pagesz * 2));
|
||||
}
|
||||
|
||||
int buncmp(const void *, const void *, size_t) asm("bcmp");
|
||||
int funcmp(const void *, const void *, size_t) asm("memcmp");
|
||||
|
||||
|
|
Loading…
Reference in a new issue