From e4d6eb382adb07ec3becd445fb0676e7b7e77b92 Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Mon, 30 Sep 2024 05:54:34 -0700 Subject: [PATCH] Make memchr() and memccpy() faster --- libc/intrin/memchr.c | 39 ++++++++++++---------- libc/str/memccpy.c | 15 +++++---- test/libc/str/memccpy_test.c | 65 ++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+), 24 deletions(-) diff --git a/libc/intrin/memchr.c b/libc/intrin/memchr.c index 6680c5292..fbe1ad409 100644 --- a/libc/intrin/memchr.c +++ b/libc/intrin/memchr.c @@ -19,10 +19,10 @@ #include "libc/dce.h" #include "libc/nexgen32e/x86feature.h" #include "libc/str/str.h" +#include "third_party/aarch64/arm_neon.internal.h" +#include "third_party/intel/immintrin.internal.h" #ifndef __aarch64__ -typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1))); - static inline const unsigned char *memchr_pure(const unsigned char *s, unsigned char c, size_t n) { size_t i; @@ -35,22 +35,27 @@ static inline const unsigned char *memchr_pure(const unsigned char *s, } #if defined(__x86_64__) && !defined(__chibicc__) -static __vex const unsigned char *memchr_sse(const unsigned char *s, - unsigned char c, size_t n) { - size_t i; - unsigned m; - xmm_t v, t = {c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c}; - for (; n >= 16; n -= 16, s += 16) { - v = *(const xmm_t *)s; - m = __builtin_ia32_pmovmskb128(v == t); - if (m) { - m = __builtin_ctzll(m); - return s + m; - } +static const char *memchr_sse(const char *s, char c, size_t n) { + const char *e = s + n; + __m128i t = _mm_set1_epi8(c); + unsigned m, k = (uintptr_t)s & 15; + m = _mm_movemask_epi8( + _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)((uintptr_t)s & -16)), t)); + m >>= k; + if (m) { + s += __builtin_ctz(m); + if (s < e) + return s; + return 0; } - for (i = 0; i < n; ++i) { - if (s[i] == c) { - return s + i; + for (s += 16 - k; s < e; s += 16) { + m = _mm_movemask_epi8( + _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)s), t)); + if (m) { + s += 
__builtin_ctz(m);
+      if (s < e)
+        return s;
+      return 0;
     }
   }
   return 0;
diff --git a/libc/str/memccpy.c b/libc/str/memccpy.c
index e910f0929..2d42920a6 100644
--- a/libc/str/memccpy.c
+++ b/libc/str/memccpy.c
@@ -45,13 +45,14 @@
  * @asyncsignalsafe
  */
 void *memccpy(void *dst, const void *src, int c, size_t n) {
-  char *d;
-  size_t i;
-  const char *s;
-  for (d = dst, s = src, i = 0; i < n; ++i) {
-    if (((d[i] = s[i]) & 255) == (c & 255)) {
-      return d + i + 1;
-    }
+  // relies on memchr() giving the readahead safety guarantees that
+  // cosmopolitan's implementation offers; other memchr()s may not
+  const char *hit = memchr(src, c, n);
+  if (hit) {
+    size_t len = (size_t)(hit - (const char *)src) + 1;
+    memmove(dst, src, len);
+    return (char *)dst + len;
   }
+  memmove(dst, src, n);
   return 0;
 }
diff --git a/test/libc/str/memccpy_test.c b/test/libc/str/memccpy_test.c
index 5b54c189f..aebe301bb 100644
--- a/test/libc/str/memccpy_test.c
+++ b/test/libc/str/memccpy_test.c
@@ -16,10 +16,18 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
 │ PERFORMANCE OF THIS SOFTWARE.
│
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/assert.h"
+#include "libc/calls/calls.h"
+#include "libc/intrin/safemacros.h"
 #include "libc/mem/mem.h"
+#include "libc/runtime/runtime.h"
+#include "libc/runtime/sysconf.h"
 #include "libc/stdio/rand.h"
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
+#include "libc/sysv/consts/map.h"
+#include "libc/sysv/consts/prot.h"
+#include "libc/testlib/benchmark.h"
 #include "libc/testlib/ezbench.h"
 #include "libc/testlib/testlib.h"
 
@@ -50,6 +58,40 @@ TEST(memccpy, testZeroLength_doesNothing) {
   EXPECT_EQ(NULL, memccpy(buf, "hi", '\0', 0));
 }
 
+TEST(memccpy, fuzz) {  // memccpy() vs memccpy_pure() on guard-page buffers
+  int pagesz = sysconf(_SC_PAGESIZE);
+  char *map1 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
+                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+  npassert(map1 != MAP_FAILED);
+  npassert(!mprotect(map1 + pagesz, pagesz, PROT_NONE));  // guard page
+  char *map2 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
+                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+  npassert(map2 != MAP_FAILED);
+  npassert(!mprotect(map2 + pagesz, pagesz, PROT_NONE));  // guard page
+  char *map3 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
+                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+  npassert(map3 != MAP_FAILED);
+  npassert(!mprotect(map3 + pagesz, pagesz, PROT_NONE));  // guard page
+  for (int dsize = 1; dsize < 128; ++dsize) {
+    char *volatile dst1 = map1 + pagesz - dsize;  // ends at map1's guard
+    char *volatile dst2 = map2 + pagesz - dsize;  // ends at map2's guard
+    for (int i = 0; i < dsize; ++i)
+      dst1[i] = dst2[i] = rand();  // identical starting contents
+    for (int ssize = 1; ssize < dsize * 2; ++ssize) {
+      char *volatile src = map3 + pagesz - (ssize + 1);  // NUL abuts guard
+      for (int i = 0; i < ssize; ++i)
+        src[i] = max(rand() & 255, 1);  // payload bytes are never NUL
+      src[ssize] = 0;
+      char *r1 = memccpy_pure(dst1, src, 0, dsize);  // rebased onto dst2 below
+      ASSERT_EQ(r1 ? dst2 + (r1 - dst1) : NULL, memccpy(dst2, src, 0, dsize));
+      ASSERT_EQ(0, memcmp(dst1, dst2, dsize));  // copies must match bytewise
+    }
+  }
+  npassert(!munmap(map3, pagesz * 2));
+  npassert(!munmap(map2, pagesz * 2));
+  npassert(!munmap(map1, pagesz * 2));
+}
+
 TEST(memccpy, memcpy) {
   unsigned n, n1, n2;
   char *b1, *b2, 
*b3, *e1, *e2;
@@ -78,3 +120,26 @@ TEST(memccpy, memcpy) {
     free(b1);
   }
 }
+
+#define N 4096
+
+BENCH(memccpy, bench) {
+  char dst[N];
+  char src[N + 1];
+  // time memccpy() with the NUL placed at each power-of-two offset
+  printf("\n");
+  for (int n = 1; n <= N; n <<= 1) {
+    for (int k = 0; k != n; ++k)
+      src[k] = max(rand() & 255, 1);
+    src[n] = 0;
+    BENCHMARK(100, n, X(memccpy(dst, src, 0, V(N))));
+  }
+  // same workload through memccpy_pure() for comparison
+  printf("\n");
+  for (int n = 1; n <= N; n <<= 1) {
+    for (int k = 0; k != n; ++k)
+      src[k] = max(rand() & 255, 1);
+    src[n] = 0;
+    BENCHMARK(100, n, X(memccpy_pure(dst, src, 0, V(N))));
+  }
+}