Make memchr() and memccpy() faster

This commit is contained in:
Justine Tunney 2024-09-30 05:54:34 -07:00
parent fef24d622a
commit e4d6eb382a
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
3 changed files with 95 additions and 24 deletions

View file

@ -19,10 +19,10 @@
#include "libc/dce.h" #include "libc/dce.h"
#include "libc/nexgen32e/x86feature.h" #include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h" #include "libc/str/str.h"
#include "third_party/aarch64/arm_neon.internal.h"
#include "third_party/intel/immintrin.internal.h"
#ifndef __aarch64__ #ifndef __aarch64__
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
static inline const unsigned char *memchr_pure(const unsigned char *s, static inline const unsigned char *memchr_pure(const unsigned char *s,
unsigned char c, size_t n) { unsigned char c, size_t n) {
size_t i; size_t i;
@ -35,22 +35,27 @@ static inline const unsigned char *memchr_pure(const unsigned char *s,
} }
#if defined(__x86_64__) && !defined(__chibicc__) #if defined(__x86_64__) && !defined(__chibicc__)
static __vex const unsigned char *memchr_sse(const unsigned char *s, static const char *memchr_sse(const char *s, char c, size_t n) {
unsigned char c, size_t n) { const char *e = s + n;
size_t i; __m128i t = _mm_set1_epi8(c);
unsigned m; unsigned m, k = (uintptr_t)s & 15;
xmm_t v, t = {c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c}; m = _mm_movemask_epi8(
for (; n >= 16; n -= 16, s += 16) { _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)((uintptr_t)s & -16)), t));
v = *(const xmm_t *)s; m >>= k;
m = __builtin_ia32_pmovmskb128(v == t);
if (m) { if (m) {
m = __builtin_ctzll(m); s += __builtin_ctz(m);
return s + m; if (s < e)
return s;
return 0;
} }
} for (s += 16 - k; s < e; s += 16) {
for (i = 0; i < n; ++i) { m = _mm_movemask_epi8(
if (s[i] == c) { _mm_cmpeq_epi8(_mm_load_si128((const __m128i *)s), t));
return s + i; if (m) {
s += __builtin_ctz(m);
if (s < e)
return s;
return 0;
} }
} }
return 0; return 0;

View file

@ -45,13 +45,14 @@
* @asyncsignalsafe * @asyncsignalsafe
*/ */
void *memccpy(void *dst, const void *src, int c, size_t n) { void *memccpy(void *dst, const void *src, int c, size_t n) {
char *d; const char *p;
size_t i; // this memchr() call is only correct if your memchr() implementation
const char *s; // offers the same readahead safety guarantees as cosmopolitan's does
for (d = dst, s = src, i = 0; i < n; ++i) { if ((p = memchr(src, c, n))) {
if (((d[i] = s[i]) & 255) == (c & 255)) { size_t m = p + 1 - (const char *)src;
return d + i + 1; memmove(dst, src, m);
} return (char *)dst + m;
} }
memmove(dst, src, n);
return 0; return 0;
} }

View file

@ -16,10 +16,18 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE. PERFORMANCE OF THIS SOFTWARE.
*/ */
#include "libc/assert.h"
#include "libc/calls/calls.h"
#include "libc/intrin/safemacros.h"
#include "libc/mem/mem.h" #include "libc/mem/mem.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/sysconf.h"
#include "libc/stdio/rand.h" #include "libc/stdio/rand.h"
#include "libc/stdio/stdio.h" #include "libc/stdio/stdio.h"
#include "libc/str/str.h" #include "libc/str/str.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/prot.h"
#include "libc/testlib/benchmark.h"
#include "libc/testlib/ezbench.h" #include "libc/testlib/ezbench.h"
#include "libc/testlib/testlib.h" #include "libc/testlib/testlib.h"
@ -50,6 +58,40 @@ TEST(memccpy, testZeroLength_doesNothing) {
EXPECT_EQ(NULL, memccpy(buf, "hi", '\0', 0)); EXPECT_EQ(NULL, memccpy(buf, "hi", '\0', 0));
} }
// Differential fuzz test: memccpy() must behave exactly like memccpy_pure().
//
// Three two-page mappings are created and the second page of each is made
// inaccessible. Buffers are placed so that they end exactly at the protected
// page, which turns any access past the end of a buffer into a fault.
//
//   map1 - destination written by memccpy_pure()
//   map2 - destination written by memccpy()
//   map3 - source string shared by both calls
TEST(memccpy, fuzz) {
  int pagesz = sysconf(_SC_PAGESIZE);
  char *map1 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
  npassert(map1 != MAP_FAILED);
  npassert(!mprotect(map1 + pagesz, pagesz, PROT_NONE));
  char *map2 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
  npassert(map2 != MAP_FAILED);
  npassert(!mprotect(map2 + pagesz, pagesz, PROT_NONE));
  char *map3 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
  npassert(map3 != MAP_FAILED);
  npassert(!mprotect(map3 + pagesz, pagesz, PROT_NONE));
  for (int dsize = 1; dsize < 128; ++dsize) {
    // The two destinations must live in different mappings; if they alias,
    // both implementations write the same bytes and every check below is
    // trivially true.
    char *volatile dst1 = map1 + pagesz - dsize;
    char *volatile dst2 = map2 + pagesz - dsize;
    // start both destinations from identical random contents
    for (int i = 0; i < dsize; ++i)
      dst1[i] = dst2[i] = rand();
    for (int ssize = 1; ssize < dsize * 2; ++ssize) {
      // ssize nonzero bytes plus a terminating NUL, ending at the guard page
      char *volatile src = map3 + pagesz - (ssize + 1);
      for (int i = 0; i < ssize; ++i)
        src[i] = max(rand() & 255, 1);
      src[ssize] = 0;
      char *end1 = (char *)memccpy_pure(dst1, src, 0, dsize);
      char *end2 = (char *)memccpy(dst2, src, 0, dsize);
      // The returned pointers refer to different buffers, so compare them
      // as offsets from each buffer's start; -1 stands for a NULL return.
      long off1 = end1 ? (long)(end1 - dst1) : -1;
      long off2 = end2 ? (long)(end2 - dst2) : -1;
      ASSERT_EQ(off1, off2);
      ASSERT_EQ(0, memcmp(dst1, dst2, dsize));
    }
  }
  npassert(!munmap(map3, pagesz * 2));
  npassert(!munmap(map2, pagesz * 2));
  npassert(!munmap(map1, pagesz * 2));
}
TEST(memccpy, memcpy) { TEST(memccpy, memcpy) {
unsigned n, n1, n2; unsigned n, n1, n2;
char *b1, *b2, *b3, *e1, *e2; char *b1, *b2, *b3, *e1, *e2;
@ -78,3 +120,26 @@ TEST(memccpy, memcpy) {
free(b1); free(b1);
} }
} }
#define N 4096

// Benchmarks memccpy() and then memccpy_pure() over source strings whose
// length doubles from 1 up to N. Each string consists of n random nonzero
// bytes with a NUL stored at index n; the search byte is 0 and the copy
// limit passed to both functions is N.
BENCH(memccpy, bench) {
  char dst[N];
  char src[N + 1];

  // first table: memccpy()
  printf("\n");
  for (int n = 1; n <= N; n *= 2) {
    int k = 0;
    while (k < n) {
      // clamp to at least 1 so no zero byte appears before index n
      src[k] = max(rand() & 255, 1);
      ++k;
    }
    src[n] = 0;
    BENCHMARK(100, n, X(memccpy(dst, src, 0, V(N))));
  }

  // second table: memccpy_pure(), with freshly generated inputs
  printf("\n");
  for (int n = 1; n <= N; n *= 2) {
    int k = 0;
    while (k < n) {
      src[k] = max(rand() & 255, 1);
      ++k;
    }
    src[n] = 0;
    BENCHMARK(100, n, X(memccpy_pure(dst, src, 0, V(N))));
  }
}