mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-02-07 06:53:33 +00:00
Make memchr() and memccpy() faster
This commit is contained in:
parent
fef24d622a
commit
e4d6eb382a
3 changed files with 95 additions and 24 deletions
|
@ -19,10 +19,10 @@
|
||||||
#include "libc/dce.h"
|
#include "libc/dce.h"
|
||||||
#include "libc/nexgen32e/x86feature.h"
|
#include "libc/nexgen32e/x86feature.h"
|
||||||
#include "libc/str/str.h"
|
#include "libc/str/str.h"
|
||||||
|
#include "third_party/aarch64/arm_neon.internal.h"
|
||||||
|
#include "third_party/intel/immintrin.internal.h"
|
||||||
#ifndef __aarch64__
|
#ifndef __aarch64__
|
||||||
|
|
||||||
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
|
||||||
|
|
||||||
static inline const unsigned char *memchr_pure(const unsigned char *s,
|
static inline const unsigned char *memchr_pure(const unsigned char *s,
|
||||||
unsigned char c, size_t n) {
|
unsigned char c, size_t n) {
|
||||||
size_t i;
|
size_t i;
|
||||||
|
@ -35,22 +35,27 @@ static inline const unsigned char *memchr_pure(const unsigned char *s,
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__x86_64__) && !defined(__chibicc__)
|
#if defined(__x86_64__) && !defined(__chibicc__)
|
||||||
static __vex const unsigned char *memchr_sse(const unsigned char *s,
|
static const char *memchr_sse(const char *s, char c, size_t n) {
|
||||||
unsigned char c, size_t n) {
|
const char *e = s + n;
|
||||||
size_t i;
|
__m128i t = _mm_set1_epi8(c);
|
||||||
unsigned m;
|
unsigned m, k = (uintptr_t)s & 15;
|
||||||
xmm_t v, t = {c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c};
|
m = _mm_movemask_epi8(
|
||||||
for (; n >= 16; n -= 16, s += 16) {
|
_mm_cmpeq_epi8(_mm_load_si128((const __m128i *)((uintptr_t)s & -16)), t));
|
||||||
v = *(const xmm_t *)s;
|
m >>= k;
|
||||||
m = __builtin_ia32_pmovmskb128(v == t);
|
|
||||||
if (m) {
|
if (m) {
|
||||||
m = __builtin_ctzll(m);
|
s += __builtin_ctz(m);
|
||||||
return s + m;
|
if (s < e)
|
||||||
|
return s;
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
for (s += 16 - k; s < e; s += 16) {
|
||||||
for (i = 0; i < n; ++i) {
|
m = _mm_movemask_epi8(
|
||||||
if (s[i] == c) {
|
_mm_cmpeq_epi8(_mm_load_si128((const __m128i *)s), t));
|
||||||
return s + i;
|
if (m) {
|
||||||
|
s += __builtin_ctz(m);
|
||||||
|
if (s < e)
|
||||||
|
return s;
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -45,13 +45,14 @@
|
||||||
* @asyncsignalsafe
|
* @asyncsignalsafe
|
||||||
*/
|
*/
|
||||||
void *memccpy(void *dst, const void *src, int c, size_t n) {
|
void *memccpy(void *dst, const void *src, int c, size_t n) {
|
||||||
char *d;
|
const char *p;
|
||||||
size_t i;
|
// this memchr() call is only correct if your memchr() implementation
|
||||||
const char *s;
|
// offers the same readahead safety guarantees as cosmopolitan's does
|
||||||
for (d = dst, s = src, i = 0; i < n; ++i) {
|
if ((p = memchr(src, c, n))) {
|
||||||
if (((d[i] = s[i]) & 255) == (c & 255)) {
|
size_t m = p + 1 - (const char *)src;
|
||||||
return d + i + 1;
|
memmove(dst, src, m);
|
||||||
}
|
return (char *)dst + m;
|
||||||
}
|
}
|
||||||
|
memmove(dst, src, n);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,10 +16,18 @@
|
||||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||||
|
#include "libc/assert.h"
|
||||||
|
#include "libc/calls/calls.h"
|
||||||
|
#include "libc/intrin/safemacros.h"
|
||||||
#include "libc/mem/mem.h"
|
#include "libc/mem/mem.h"
|
||||||
|
#include "libc/runtime/runtime.h"
|
||||||
|
#include "libc/runtime/sysconf.h"
|
||||||
#include "libc/stdio/rand.h"
|
#include "libc/stdio/rand.h"
|
||||||
#include "libc/stdio/stdio.h"
|
#include "libc/stdio/stdio.h"
|
||||||
#include "libc/str/str.h"
|
#include "libc/str/str.h"
|
||||||
|
#include "libc/sysv/consts/map.h"
|
||||||
|
#include "libc/sysv/consts/prot.h"
|
||||||
|
#include "libc/testlib/benchmark.h"
|
||||||
#include "libc/testlib/ezbench.h"
|
#include "libc/testlib/ezbench.h"
|
||||||
#include "libc/testlib/testlib.h"
|
#include "libc/testlib/testlib.h"
|
||||||
|
|
||||||
|
@ -50,6 +58,40 @@ TEST(memccpy, testZeroLength_doesNothing) {
|
||||||
EXPECT_EQ(NULL, memccpy(buf, "hi", '\0', 0));
|
EXPECT_EQ(NULL, memccpy(buf, "hi", '\0', 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(memccpy, fuzz) {
|
||||||
|
int pagesz = sysconf(_SC_PAGESIZE);
|
||||||
|
char *map1 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
|
||||||
|
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
|
||||||
|
npassert(map1 != MAP_FAILED);
|
||||||
|
npassert(!mprotect(map1 + pagesz, pagesz, PROT_NONE));
|
||||||
|
char *map2 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
|
||||||
|
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
|
||||||
|
npassert(map2 != MAP_FAILED);
|
||||||
|
npassert(!mprotect(map2 + pagesz, pagesz, PROT_NONE));
|
||||||
|
char *map3 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
|
||||||
|
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
|
||||||
|
npassert(map3 != MAP_FAILED);
|
||||||
|
npassert(!mprotect(map3 + pagesz, pagesz, PROT_NONE));
|
||||||
|
for (int dsize = 1; dsize < 128; ++dsize) {
|
||||||
|
char *volatile dst1 = map1 + pagesz - dsize;
|
||||||
|
char *volatile dst2 = map1 + pagesz - dsize;
|
||||||
|
for (int i = 0; i < dsize; ++i)
|
||||||
|
dst1[i] = dst2[i] = rand();
|
||||||
|
for (int ssize = 1; ssize < dsize * 2; ++ssize) {
|
||||||
|
char *volatile src = map3 + pagesz - (ssize + 1);
|
||||||
|
for (int i = 0; i < ssize; ++i)
|
||||||
|
src[i] = max(rand() & 255, 1);
|
||||||
|
src[ssize] = 0;
|
||||||
|
ASSERT_EQ(memccpy_pure(dst1, src, 0, dsize),
|
||||||
|
memccpy(dst2, src, 0, dsize));
|
||||||
|
ASSERT_EQ(0, memcmp(dst1, dst2, dsize));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
npassert(!munmap(map3, pagesz * 2));
|
||||||
|
npassert(!munmap(map2, pagesz * 2));
|
||||||
|
npassert(!munmap(map1, pagesz * 2));
|
||||||
|
}
|
||||||
|
|
||||||
TEST(memccpy, memcpy) {
|
TEST(memccpy, memcpy) {
|
||||||
unsigned n, n1, n2;
|
unsigned n, n1, n2;
|
||||||
char *b1, *b2, *b3, *e1, *e2;
|
char *b1, *b2, *b3, *e1, *e2;
|
||||||
|
@ -78,3 +120,26 @@ TEST(memccpy, memcpy) {
|
||||||
free(b1);
|
free(b1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define N 4096
|
||||||
|
|
||||||
|
BENCH(memccpy, bench) {
|
||||||
|
char dst[N];
|
||||||
|
char src[N + 1];
|
||||||
|
|
||||||
|
printf("\n");
|
||||||
|
for (int n = 1; n <= N; n *= 2) {
|
||||||
|
for (int i = 0; i < n; ++i)
|
||||||
|
src[i] = max(rand() & 255, 1);
|
||||||
|
src[n] = 0;
|
||||||
|
BENCHMARK(100, n, X(memccpy(dst, src, 0, V(N))));
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("\n");
|
||||||
|
for (int n = 1; n <= N; n *= 2) {
|
||||||
|
for (int i = 0; i < n; ++i)
|
||||||
|
src[i] = max(rand() & 255, 1);
|
||||||
|
src[n] = 0;
|
||||||
|
BENCHMARK(100, n, X(memccpy_pure(dst, src, 0, V(N))));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue