/*
 * Review notes for this patch. This is an explanatory preamble only; the
 * patch text that follows it is unchanged.
 *
 * NOTE(review): the patch text below has its line breaks collapsed (file
 * headers, hunk headers and hunk bodies share the same physical lines), so
 * it presumably needs its newlines restored before it can be applied.
 * Confirm against the original commit.
 *
 * libc/intrin/memchr.c
 *   - Adds an include of libc/intrin/asan.internal.h.
 *   - Changes the xmm_t vector typedef from __aligned__(16) to
 *     __aligned__(1).
 *   - In memchr(), on the x86-64 non-chibicc path, removes the
 *     byte-at-a-time loop that advanced p until it was 16-byte aligned.
 *     The new code calls __asan_verify(s, n) when IsAsan() is true and then
 *     passes s directly to memchr_sse(). The body of memchr_sse() is not
 *     part of this patch.
 *
 * libc/intrin/memcmp.c
 *   - Adds a PMOVMSKB(x) macro that expands to
 *     __builtin_ia32_pmovmskb128(x).
 *   - Adds memcmp_sse(p, q, n). While n > 32 it compares 16 bytes per
 *     iteration; on a mismatch it returns p[u] - q[u] where u is the index
 *     of the lowest set bit of the inverted equality mask. After the loop
 *     it unconditionally compares the 16 bytes at p and then the 16 bytes
 *     at p + n - 16 (the two loads may overlap). The only call site in this
 *     patch is reached when n > 16.
 *   - Adds memcmp_avx(p, q, n), declared with _Microarchitecture("avx").
 *     It has the same 16-byte loop and the same two-load tail as
 *     memcmp_sse(), preceded by a loop that runs while n >= 16 + 64 and
 *     compares four 16-byte vectors per iteration. The four 16-bit masks
 *     are packed into one 64-bit word at bit offsets 0, 16, 32 and 48
 *     (written in octal as 000, 020, 040, 060).
 *   - In memcmp(), moves the x86-64 local declarations to the top of the
 *     function and replaces the previous loop. The previous loop compared
 *     16 bytes at a time only while n >= 16 and both (address & 0xfff)
 *     values were <= 0x1000 - 16. The replacement, taken when IsTiny() is
 *     false, does the following:
 *       n <= 16 and n >= 8: builds 64-bit words from the first 8 bytes of
 *         each buffer with byte 0 in the lowest bits. If they are equal it
 *         repeats with the last 8 bytes (p + n - 8). It returns 0 when both
 *         pairs match; otherwise it returns the difference of the
 *         lowest-indexed differing byte, located with
 *         __builtin_ctzll(w) & -8.
 *       n <= 16 and n >= 4: the same scheme with 32-bit words and
 *         __builtin_ctzl().
 *       n < 4: nothing is returned inside this block; control continues
 *         past the closing #endif into code that this hunk does not show.
 *       n > 16: calls memcmp_avx() when X86_HAVE(AVX) is true, otherwise
 *         memcmp_sse().
 *     The context lines show that memcmp() has already returned before
 *     this point when the pointers are equal, when n is zero, or when the
 *     first bytes differ.
 *
 * test/libc/intrin/memchr_test.c
 *   - Removes the includes of runtime.h, sysconf.h, map.h and prot.h.
 *   - Removes TEST(memchr, pageOverlapTorture). That test mapped two pages,
 *     made the second one PROT_NONE, copied "12345678" to the last 9 bytes
 *     of the first page and expected memchr(map + pagesz - 9, 0, 79) to
 *     return map + pagesz - 1.
 *
 * test/libc/intrin/memcmp_test.c
 *   - Removes TEST(memcmp, pageOverlapTorture). That test did the same
 *     with two mappings holding "12345678" and "12345679" and expected
 *     memcmp(..., 79) to be negative.
 *
 * NOTE(review): both removed tests passed a length (79) that extends into
 * the PROT_NONE page. This patch removes the alignment loop from memchr()
 * and the page-offset guard from memcmp(). Whether the new code is still
 * expected to tolerate such calls cannot be determined from this patch
 * alone and should be confirmed.
 */
diff --git a/libc/intrin/memchr.c b/libc/intrin/memchr.c index ad5d3414c..206bd84e0 100644 --- a/libc/intrin/memchr.c +++ b/libc/intrin/memchr.c @@ -17,11 +17,12 @@ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/dce.h" +#include "libc/intrin/asan.internal.h" #include "libc/nexgen32e/x86feature.h" #include "libc/str/str.h" #ifndef __aarch64__ -typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16))); +typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1))); static inline const unsigned char *memchr_pure(const unsigned char *s, unsigned char c, size_t n) { @@ -69,15 +70,8 @@ static inline const unsigned char *memchr_sse(const unsigned char *s, void *memchr(const void *s, int c, size_t n) { #if defined(__x86_64__) && !defined(__chibicc__) const void *r; - const unsigned char *p = (const unsigned char *)s; - while (n && ((intptr_t)p & 15)) { - if (*p == (unsigned char)c) { - return (void *)p; - } - ++p; - --n; - } - r = memchr_sse(p, c, n); + if (IsAsan()) __asan_verify(s, n); + r = memchr_sse(s, c, n); return (void *)r; #else return (void *)memchr_pure(s, c, n); diff --git a/libc/intrin/memcmp.c b/libc/intrin/memcmp.c index 8688a2527..f871d8223 100644 --- a/libc/intrin/memcmp.c +++ b/libc/intrin/memcmp.c @@ -21,8 +21,88 @@ #include "libc/nexgen32e/x86feature.h" #include "libc/str/str.h" +#define PMOVMSKB(x) __builtin_ia32_pmovmskb128(x) + typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1))); +#if defined(__x86_64__) && !defined(__chibicc__) + +static dontinline antiquity int memcmp_sse(const unsigned char *p, + const unsigned char *q, size_t n) { + unsigned u; + if (n > 32) { + while (n > 16 + 16) { + if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) { + n -= 16; + p += 16; + q += 16; + } else { + u = __builtin_ctzl(u); + return p[u] - q[u]; + } + } + } + if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) { + if (!(u = 
PMOVMSKB(*(xmm_t *)(p + n - 16) == *(xmm_t *)(q + n - 16)) ^ + 0xffff)) { + return 0; + } else { + u = __builtin_ctzl(u); + return p[n - 16 + u] - q[n - 16 + u]; + } + } else { + u = __builtin_ctzl(u); + return p[u] - q[u]; + } +} + +_Microarchitecture("avx") static int memcmp_avx(const unsigned char *p, + const unsigned char *q, + size_t n) { + uint64_t w; + unsigned u; + if (n > 32) { + while (n >= 16 + 64) { + w = (uint64_t)PMOVMSKB(((xmm_t *)p)[0] == ((xmm_t *)q)[0]) << 000 | + (uint64_t)PMOVMSKB(((xmm_t *)p)[1] == ((xmm_t *)q)[1]) << 020 | + (uint64_t)PMOVMSKB(((xmm_t *)p)[2] == ((xmm_t *)q)[2]) << 040 | + (uint64_t)PMOVMSKB(((xmm_t *)p)[3] == ((xmm_t *)q)[3]) << 060; + if (w == -1) { + n -= 64; + p += 64; + q += 64; + } else { + w = __builtin_ctzll(w ^ -1); + return p[w] - q[w]; + } + } + while (n > 16 + 16) { + if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) { + n -= 16; + p += 16; + q += 16; + } else { + u = __builtin_ctzl(u); + return p[u] - q[u]; + } + } + } + if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) { + if (!(u = PMOVMSKB(*(xmm_t *)(p + n - 16) == *(xmm_t *)(q + n - 16)) ^ + 0xffff)) { + return 0; + } else { + u = __builtin_ctzl(u); + return p[n - 16 + u] - q[n - 16 + u]; + } + } else { + u = __builtin_ctzl(u); + return p[u] - q[u]; + } +} + +#endif /* __x86_64__ */ + /** * Compares memory byte by byte. 
* @@ -57,21 +137,64 @@ typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1))); */ int memcmp(const void *a, const void *b, size_t n) { int c; +#if defined(__x86_64__) && !defined(__chibicc__) + unsigned u; + uint32_t k, i, j; + uint64_t w, x, y; +#endif const unsigned char *p, *q; if ((p = a) == (q = b) || !n) return 0; if ((c = *p - *q)) return c; #if defined(__x86_64__) && !defined(__chibicc__) - unsigned u; - while (n >= 16 && (((uintptr_t)p & 0xfff) <= 0x1000 - 16 && - ((uintptr_t)q & 0xfff) <= 0x1000 - 16)) { - if (!(u = __builtin_ia32_pmovmskb128(*(xmm_t *)p == *(xmm_t *)q) ^ - 0xffff)) { - n -= 16; - p += 16; - q += 16; + if (!IsTiny()) { + if (n <= 16) { + if (n >= 8) { + if (!(w = (x = ((uint64_t)p[0] << 000 | (uint64_t)p[1] << 010 | + (uint64_t)p[2] << 020 | (uint64_t)p[3] << 030 | + (uint64_t)p[4] << 040 | (uint64_t)p[5] << 050 | + (uint64_t)p[6] << 060 | (uint64_t)p[7] << 070)) ^ + (y = ((uint64_t)q[0] << 000 | (uint64_t)q[1] << 010 | + (uint64_t)q[2] << 020 | (uint64_t)q[3] << 030 | + (uint64_t)q[4] << 040 | (uint64_t)q[5] << 050 | + (uint64_t)q[6] << 060 | (uint64_t)q[7] << 070)))) { + p += n - 8; + q += n - 8; + if (!(w = (x = ((uint64_t)p[0] << 000 | (uint64_t)p[1] << 010 | + (uint64_t)p[2] << 020 | (uint64_t)p[3] << 030 | + (uint64_t)p[4] << 040 | (uint64_t)p[5] << 050 | + (uint64_t)p[6] << 060 | (uint64_t)p[7] << 070)) ^ + (y = ((uint64_t)q[0] << 000 | (uint64_t)q[1] << 010 | + (uint64_t)q[2] << 020 | (uint64_t)q[3] << 030 | + (uint64_t)q[4] << 040 | (uint64_t)q[5] << 050 | + (uint64_t)q[6] << 060 | (uint64_t)q[7] << 070)))) { + return 0; + } + } + u = __builtin_ctzll(w); + u = u & -8; + return ((x >> u) & 255) - ((y >> u) & 255); + } else if (n >= 4) { + if (!(k = (i = ((uint32_t)p[0] << 000 | (uint32_t)p[1] << 010 | + (uint32_t)p[2] << 020 | (uint32_t)p[3] << 030)) ^ + (j = ((uint32_t)q[0] << 000 | (uint32_t)q[1] << 010 | + (uint32_t)q[2] << 020 | (uint32_t)q[3] << 030)))) { + p += n - 4; + q += n - 4; + if (!(k = (i = 
((uint32_t)p[0] << 000 | (uint32_t)p[1] << 010 | + (uint32_t)p[2] << 020 | (uint32_t)p[3] << 030)) ^ + (j = ((uint32_t)q[0] << 000 | (uint32_t)q[1] << 010 | + (uint32_t)q[2] << 020 | (uint32_t)q[3] << 030)))) { + return 0; + } + } + u = __builtin_ctzl(k); + u = u & -8; + return ((i >> u) & 255) - ((j >> u) & 255); + } + } else if (LIKELY(X86_HAVE(AVX))) { + return memcmp_avx(p, q, n); } else { - u = __builtin_ctzl(u); - return p[u] - q[u]; + return memcmp_sse(p, q, n); } } #endif /* __x86_64__ */ diff --git a/test/libc/intrin/memchr_test.c b/test/libc/intrin/memchr_test.c index a2d32b3bc..d3296f7c3 100644 --- a/test/libc/intrin/memchr_test.c +++ b/test/libc/intrin/memchr_test.c @@ -16,24 +16,10 @@ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/runtime/runtime.h" -#include "libc/runtime/sysconf.h" #include "libc/str/str.h" -#include "libc/sysv/consts/map.h" -#include "libc/sysv/consts/prot.h" #include "libc/testlib/testlib.h" TEST(memchr, test) { const char *s = "hello"; ASSERT_EQ(s + 1, memchr(s, 'e', 5)); } - -TEST(memchr, pageOverlapTorture) { - long pagesz = sysconf(_SC_PAGESIZE); - char *map = mmap(0, pagesz * 2, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - ASSERT_SYS(0, 0, mprotect(map + pagesz, pagesz, PROT_NONE)); - strcpy(map + pagesz - 9, "12345678"); - EXPECT_EQ(map + pagesz - 1, memchr(map + pagesz - 9, 0, 79)); - EXPECT_SYS(0, 0, munmap(map, pagesz * 2)); -} diff --git a/test/libc/intrin/memcmp_test.c b/test/libc/intrin/memcmp_test.c index 602337ab3..e606db663 100644 --- a/test/libc/intrin/memcmp_test.c +++ b/test/libc/intrin/memcmp_test.c @@ -113,21 +113,6 @@ TEST(memcmp, fuzz) { } } -TEST(memcmp, pageOverlapTorture) { - long pagesz = sysconf(_SC_PAGESIZE); - char *map = mmap(0, pagesz * 2, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - char *map2 = mmap(0, pagesz 
* 2, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - ASSERT_SYS(0, 0, mprotect(map + pagesz, pagesz, PROT_NONE)); - ASSERT_SYS(0, 0, mprotect(map2 + pagesz, pagesz, PROT_NONE)); - strcpy(map + pagesz - 9, "12345678"); - strcpy(map2 + pagesz - 9, "12345679"); - EXPECT_LT(memcmp(map + pagesz - 9, map2 + pagesz - 9, 79), 0); - EXPECT_SYS(0, 0, munmap(map2, pagesz * 2)); - EXPECT_SYS(0, 0, munmap(map, pagesz * 2)); -} - int buncmp(const void *, const void *, size_t) asm("bcmp"); int funcmp(const void *, const void *, size_t) asm("memcmp");