cosmopolitan/libc/intrin/memmove.c
Justine Tunney 791f79fcb3
Make improvements
- We now serialize the file descriptor table when spawning / executing
  processes on Windows. This means you can now inherit more stuff than
  just standard i/o. It's needed by bash, which duplicates the console
  to file descriptor #255. We also now do a better job serializing the
  environment variables, so you're less likely to encounter E2BIG when
  using your bash shell. We also no longer coerce environ to uppercase

- execve() on Windows now remotely controls its parent process to make
  them spawn a replacement for itself. Then it'll be able to terminate
  immediately once the spawn succeeds, without having to linger around
  for the lifetime as a shell process for proxying the exit code. When
  the process worker thread running in the parent sees the child die, it's
  given a handle to the new child, to replace it in the process table.

- execve() and posix_spawn() on Windows will now provide CreateProcess
  an explicit handle list. This allows us to remove handle locks which
  enables better fork/spawn concurrency, with seriously correct thread
  safety. Other codebases like Go use the same technique. On the other
  hand fork() still favors the conventional WIN32 inheritance approach
  which can be a little bit messy, but is *controlled* by guaranteeing
  perfectly clean slates at both the spawning and execution boundaries

- sigset_t is now 64 bits. Having it be 128 bits was a mistake because
  there's no reason to use that and it's only supported by FreeBSD. By
  using the system word size, signal mask manipulation on Windows goes
  very fast. Furthermore @asyncsignalsafe funcs have been rewritten on
  Windows to take advantage of signal masking, now that it's much more
  pleasant to use.

- All the overlapped i/o code on Windows has been rewritten for pretty
  good signal and cancelation safety. We're now able to ensure overlap
  data structures are cleaned up so long as you don't longjmp() out of
  a signal handler that interrupted an i/o operation. Latencies
  are also improved thanks to the removal of lots of "busy wait" code.
  Waits should be optimal for everything except poll(), which shall be
  the last and final demon we slay in the win32 i/o horror show.

- getrusage() on Windows is now able to report RUSAGE_CHILDREN as well
  as RUSAGE_SELF, thanks to aggregation in the process manager thread.
2023-10-08 08:59:53 -07:00

343 lines
12 KiB
C

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2021 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/assert.h"
#include "libc/dce.h"
#include "libc/intrin/asan.internal.h"
#include "libc/nexgen32e/nexgen32e.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#ifndef __aarch64__
// 16-byte vector of long longs declared with 1-byte alignment, so that it
// can be loaded from and stored to addresses of any alignment.
typedef long long xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
// Same 16-byte vector but declared 16-byte aligned; memmove() uses it as
// the destination pointer type for its non-temporal (movntdq) stores.
typedef long long xmm_a __attribute__((__vector_size__(16), __aligned__(16)));
/**
* Copies memory.
*
* Copies n bytes from src to dst. The two regions are allowed to
* overlap; every path below either loads all the bytes it is about to
* overwrite before storing, or walks the buffers in the direction that
* reads source bytes before they get clobbered.
*
* Strategy, as implemented below:
*
* - On x86-64 (non-chibicc) when IsTiny() is true: sizes up to 16 bytes
*   use a pair of possibly overlapping head/tail word copies, and larger
*   sizes use `rep movsb`, run backward with the direction flag set when
*   dst is above src.
* - Otherwise sizes 0..127 go through a switch in which each case loads
*   all of its source bytes into temporaries before storing any of them.
* - Sizes of 128 and up are copied in 32-byte chunks, backward when dst
*   is above src and forward otherwise, followed by a tail of fewer than
*   32 bytes. On x86-64, when n < kHalfCache3 (or kHalfCache3 is zero)
*   the chunks use ordinary stores, or `rep movsb` replaces the loop if
*   n >= 900, X86_HAVE(ERMS) holds and IsAsan() is false. Otherwise the
*   chunks use non-temporal stores followed by an sfence.
*
* memmove n=0 661 picoseconds
* memmove n=1 661 ps/byte 1,476 mb/s
* memmove n=2 330 ps/byte 2,952 mb/s
* memmove n=3 330 ps/byte 2,952 mb/s
* memmove n=4 165 ps/byte 5,904 mb/s
* memmove n=7 141 ps/byte 6,888 mb/s
* memmove n=8 82 ps/byte 11 GB/s
* memmove n=15 44 ps/byte 21 GB/s
* memmove n=16 41 ps/byte 23 GB/s
* memmove n=31 32 ps/byte 29 GB/s
* memmove n=32 31 ps/byte 30 GB/s
* memmove n=63 21 ps/byte 45 GB/s
* memmove n=64 15 ps/byte 61 GB/s
* memmove n=127 13 ps/byte 73 GB/s
* memmove n=128 31 ps/byte 30 GB/s
* memmove n=255 20 ps/byte 45 GB/s
* memmove n=256 19 ps/byte 49 GB/s
* memmove n=511 16 ps/byte 56 GB/s
* memmove n=512 17 ps/byte 54 GB/s
* memmove n=1023 18 ps/byte 52 GB/s
* memmove n=1024 13 ps/byte 72 GB/s
* memmove n=2047 9 ps/byte 96 GB/s
* memmove n=2048 9 ps/byte 98 GB/s
* memmove n=4095 8 ps/byte 112 GB/s
* memmove n=4096 8 ps/byte 109 GB/s
* memmove n=8191 7 ps/byte 124 GB/s
* memmove n=8192 7 ps/byte 125 GB/s
* memmove n=16383 7 ps/byte 134 GB/s
* memmove n=16384 7 ps/byte 134 GB/s
* memmove n=32767 13 ps/byte 72 GB/s
* memmove n=32768 13 ps/byte 72 GB/s
* memmove n=65535 13 ps/byte 68 GB/s
* memmove n=65536 14 ps/byte 67 GB/s
* memmove n=131071 14 ps/byte 65 GB/s
* memmove n=131072 14 ps/byte 64 GB/s
* memmove n=262143 15 ps/byte 63 GB/s
* memmove n=262144 15 ps/byte 63 GB/s
* memmove n=524287 15 ps/byte 61 GB/s
* memmove n=524288 15 ps/byte 61 GB/s
* memmove n=1048575 15 ps/byte 61 GB/s
* memmove n=1048576 15 ps/byte 61 GB/s
* memmove n=2097151 19 ps/byte 48 GB/s
* memmove n=2097152 27 ps/byte 35 GB/s
* memmove n=4194303 28 ps/byte 33 GB/s
* memmove n=4194304 28 ps/byte 33 GB/s
* memmove n=8388607 28 ps/byte 33 GB/s
* memmove n=8388608 28 ps/byte 33 GB/s
*
* DST and SRC may overlap.
*
* @param dst is destination
* @param src is memory to copy
* @param n is number of bytes to copy
* @return dst
* @asyncsignalsafe
*/
void *memmove(void *dst, const void *src, size_t n) {
char *d;
size_t i;
const char *s;
uint64_t a, b;
xmm_t v, w, x, y, V, W, X, Y;
d = dst;
s = src;
#if defined(__x86_64__) && !defined(__chibicc__)
// Compact implementation used when IsTiny() is true.
// NOTE(review): IsTiny() presumably denotes a size-optimized build
// mode -- confirm against libc/dce.h.
if (IsTiny()) {
uint16_t w1, w2;
uint32_t l1, l2;
uint64_t q1, q2;
if (n <= 16) {
// Each branch copies the first and the last K bytes of the buffer
// (K = 8, 4 or 2). The two pieces may overlap each other, which is
// how sizes between K and 2*K are covered. Both pieces are loaded
// before either is stored, so overlapping src/dst is harmless.
if (n >= 8) {
__builtin_memcpy(&q1, s, 8);
__builtin_memcpy(&q2, s + n - 8, 8);
__builtin_memcpy(d, &q1, 8);
__builtin_memcpy(d + n - 8, &q2, 8);
} else if (n >= 4) {
__builtin_memcpy(&l1, s, 4);
__builtin_memcpy(&l2, s + n - 4, 4);
__builtin_memcpy(d, &l1, 4);
__builtin_memcpy(d + n - 4, &l2, 4);
} else if (n >= 2) {
__builtin_memcpy(&w1, s, 2);
__builtin_memcpy(&w2, s + n - 2, 2);
__builtin_memcpy(d, &w1, 2);
__builtin_memcpy(d + n - 2, &w2, 2);
} else if (n) {
// n == 1; n == 0 falls through and copies nothing.
*d = *s;
}
} else {
if (d <= s) {
// Destination is at or below source: a forward byte copy reads each
// source byte before it can be overwritten. The "=m"/"m" operands
// describe the n-byte regions the asm writes and reads.
asm("rep movsb"
: "+D"(d), "+S"(s), "+c"(n), "=m"(*(char(*)[n])dst)
: "m"(*(char(*)[n])src));
} else {
// Destination is above source: start at the last byte of each buffer
// and copy backward. `std` sets the direction flag so `rep movsb`
// decrements the pointers; `cld` clears it again afterwards.
d += n - 1;
s += n - 1;
asm("std\n\t"
"rep movsb\n\t"
"cld"
: "+D"(d), "+S"(s), "+c"(n), "=m"(*(char(*)[n])dst)
: "m"(*(char(*)[n])src));
}
}
return dst;
}
#endif
// Sizes 0..127 are handled by dedicated cases. Every case performs all
// of its loads before any of its stores, which makes it overlap-safe
// without needing to compare d and s.
switch (n) {
case 0:
return d;
case 1:
*d = *s;
return d;
case 2:
__builtin_memcpy(&a, s, 2);
__builtin_memcpy(d, &a, 2);
return d;
case 3:
// Two overlapping 2-byte pieces: bytes 0-1 and bytes 1-2.
__builtin_memcpy(&a, s, 2);
__builtin_memcpy(&b, s + 1, 2);
__builtin_memcpy(d, &a, 2);
__builtin_memcpy(d + 1, &b, 2);
return d;
case 4:
__builtin_memcpy(&a, s, 4);
__builtin_memcpy(d, &a, 4);
return d;
case 5 ... 7:
// First 4 bytes plus last 4 bytes; the pieces overlap in the middle.
__builtin_memcpy(&a, s, 4);
__builtin_memcpy(&b, s + n - 4, 4);
__builtin_memcpy(d, &a, 4);
__builtin_memcpy(d + n - 4, &b, 4);
return d;
case 8:
__builtin_memcpy(&a, s, 8);
__builtin_memcpy(d, &a, 8);
return d;
case 9 ... 15:
// First 8 bytes plus last 8 bytes; the pieces overlap in the middle.
__builtin_memcpy(&a, s, 8);
__builtin_memcpy(&b, s + n - 8, 8);
__builtin_memcpy(d, &a, 8);
__builtin_memcpy(d + n - 8, &b, 8);
return d;
case 16:
// Single 16-byte vector; xmm_t has 1-byte alignment so any address works.
*(xmm_t *)d = *(xmm_t *)s;
return d;
case 17 ... 32:
// First 16 bytes plus last 16 bytes.
v = *(xmm_t *)s;
w = *(xmm_t *)(s + n - 16);
*(xmm_t *)d = v;
*(xmm_t *)(d + n - 16) = w;
return d;
case 33 ... 64:
// First 32 bytes plus last 32 bytes, as four 16-byte vectors.
v = *(xmm_t *)s;
w = *(xmm_t *)(s + 16);
x = *(xmm_t *)(s + n - 32);
y = *(xmm_t *)(s + n - 16);
*(xmm_t *)d = v;
*(xmm_t *)(d + 16) = w;
*(xmm_t *)(d + n - 32) = x;
*(xmm_t *)(d + n - 16) = y;
return d;
case 65 ... 127:
// First 64 bytes plus last 64 bytes, as eight 16-byte vectors.
v = *(xmm_t *)s;
w = *(xmm_t *)(s + 16);
x = *(xmm_t *)(s + 32);
y = *(xmm_t *)(s + 48);
V = *(xmm_t *)(s + n - 64);
W = *(xmm_t *)(s + n - 48);
X = *(xmm_t *)(s + n - 32);
Y = *(xmm_t *)(s + n - 16);
*(xmm_t *)d = v;
*(xmm_t *)(d + 16) = w;
*(xmm_t *)(d + 32) = x;
*(xmm_t *)(d + 48) = y;
*(xmm_t *)(d + n - 64) = V;
*(xmm_t *)(d + n - 48) = W;
*(xmm_t *)(d + n - 32) = X;
*(xmm_t *)(d + n - 16) = Y;
return d;
default:
// n >= 128 from here on. Copying a buffer onto itself is a no-op.
if (d == s) return d;
#if defined(__x86_64__) && !defined(__chibicc__)
// NOTE(review): kHalfCache3 is presumably half the L3 cache size, with
// zero meaning "unknown" -- confirm against libc/nexgen32e. Copies
// below that threshold use ordinary stores or `rep movsb`.
if (n < kHalfCache3 || !kHalfCache3) {
if (d > s) {
// Destination above source: go from high addresses to low so source
// bytes are read before overlapping destination stores hit them.
// NOTE(review): IsAsan() presumably reports an AddressSanitizer build
// and X86_HAVE(ERMS) the "enhanced rep movsb" CPU feature -- confirm.
if (IsAsan() || n < 900 || !X86_HAVE(ERMS)) {
// Peel 32 bytes off the end per iteration. When the loop exits,
// fewer than 32 bytes remain at the front, i.e. [0, n), and d/s
// still point at the start, ready for the shared tail code below.
do {
n -= 32;
v = *(const xmm_t *)(s + n);
w = *(const xmm_t *)(s + n + 16);
*(xmm_t *)(d + n) = v;
*(xmm_t *)(d + n + 16) = w;
} while (n >= 32);
} else {
// Backward `rep movsb`: the pointer registers start at the last byte
// of each buffer and the direction flag is set for the copy only.
// All n bytes are copied here, hence the immediate return.
asm("std\n\t"
"rep movsb\n\t"
"cld"
: "=D"(d), "=S"(s), "+c"(n), "=m"(*(char(*)[n])d)
: "0"(d + n - 1), "1"(s + n - 1), "m"(*(char(*)[n])s));
return dst;
}
} else {
// Destination below source: go from low addresses to high.
if (IsAsan() || n < 900 || !X86_HAVE(ERMS)) {
i = 0;
// Copy 32 bytes per iteration while at least 32 more remain.
do {
v = *(const xmm_t *)(s + i);
w = *(const xmm_t *)(s + i + 16);
*(xmm_t *)(d + i) = v;
*(xmm_t *)(d + i + 16) = w;
} while ((i += 32) + 32 <= n);
// Rebase onto the uncopied remainder (fewer than 32 bytes) so the
// shared tail code below can finish the job.
d += i;
s += i;
n -= i;
} else {
// Forward `rep movsb` copies all n bytes, hence the immediate return.
asm("rep movsb"
: "+D"(d), "+S"(s), "+c"(n), "=m"(*(char(*)[n])d)
: "m"(*(char(*)[n])s));
return dst;
}
}
} else {
// Copies of at least kHalfCache3 bytes: use non-temporal stores
// (movntdq). The store address is cast to the 16-byte aligned xmm_a
// type, so each direction first copies single bytes until the
// destination side is 16-byte aligned.
if (d > s) {
// Backward: trim bytes off the end until d + n is 16-byte aligned.
while ((uintptr_t)(d + n) & 15) {
--n;
d[n] = s[n];
}
// Then peel 32 bytes off the end per iteration. Loads stay unaligned
// (xmm_t); only the stores require alignment.
do {
n -= 32;
v = *(const xmm_t *)(s + n);
w = *(const xmm_t *)(s + n + 16);
__builtin_ia32_movntdq((xmm_a *)(d + n), v);
__builtin_ia32_movntdq((xmm_a *)(d + n + 16), w);
} while (n >= 32);
} else {
i = 0;
// Forward: copy leading bytes until d + i is 16-byte aligned.
while ((uintptr_t)(d + i) & 15) {
d[i] = s[i];
++i;
}
do {
v = *(const xmm_t *)(s + i);
w = *(const xmm_t *)(s + i + 16);
__builtin_ia32_movntdq((xmm_a *)(d + i), v);
__builtin_ia32_movntdq((xmm_a *)(d + i + 16), w);
} while ((i += 32) + 32 <= n);
// Rebase onto the uncopied remainder (fewer than 32 bytes).
d += i;
s += i;
n -= i;
}
// Store fence issued after the non-temporal stores, before the
// ordinary stores of the tail and the return.
asm("sfence");
}
#else
// Portable variant of the 32-byte chunk loops above, without the
// `rep movsb` and non-temporal store paths.
if (d > s) {
// Backward: leaves fewer than 32 bytes at the front, in [0, n).
do {
n -= 32;
v = *(const xmm_t *)(s + n);
w = *(const xmm_t *)(s + n + 16);
*(xmm_t *)(d + n) = v;
*(xmm_t *)(d + n + 16) = w;
} while (n >= 32);
} else {
i = 0;
// Forward: leaves fewer than 32 bytes at the end; d/s/n are rebased.
do {
v = *(const xmm_t *)(s + i);
w = *(const xmm_t *)(s + i + 16);
*(xmm_t *)(d + i) = v;
*(xmm_t *)(d + i + 16) = w;
} while ((i += 32) + 32 <= n);
d += i;
s += i;
n -= i;
}
#endif
// Shared tail: d, s and n now describe the fewer-than-32 bytes that the
// chunk loops did not copy. Same head-plus-tail trick as the small
// cases above, with both pieces loaded before either is stored.
if (n) {
if (n >= 16) {
v = *(const xmm_t *)s;
w = *(const xmm_t *)(s + n - 16);
*(xmm_t *)d = v;
*(xmm_t *)(d + n - 16) = w;
} else if (n >= 8) {
__builtin_memcpy(&a, s, 8);
__builtin_memcpy(&b, s + n - 8, 8);
__builtin_memcpy(d, &a, 8);
__builtin_memcpy(d + n - 8, &b, 8);
} else if (n >= 4) {
__builtin_memcpy(&a, s, 4);
__builtin_memcpy(&b, s + n - 4, 4);
__builtin_memcpy(d, &a, 4);
__builtin_memcpy(d + n - 4, &b, 4);
} else if (n >= 2) {
__builtin_memcpy(&a, s, 2);
__builtin_memcpy(&b, s + n - 2, 2);
__builtin_memcpy(d, &a, 2);
__builtin_memcpy(d + n - 2, &b, 2);
} else {
// Exactly one byte left.
*d = *s;
}
}
// d may have been advanced by the forward loops, so return the
// caller's original pointer.
return dst;
}
}
// NOTE(review): presumably exports memcpy as a weak alias of memmove, so
// that both names resolve to this overlap-safe implementation -- confirm
// against the definition of the __weak_reference macro.
__weak_reference(memmove, memcpy);
#endif /* __aarch64__ */