Make improvements

- Invent openatemp() API
- Invent O_UNLINK open flag
- Introduce getenv_secure() API
- Remove `git pull` from cosmocc
- Fix utimes() when path is NULL
- Fix mktemp() to never return NULL
- Fix utimensat() UTIME_OMIT on XNU
- Improve utimensat() code for RHEL5
- Turn `argv[0]` C:/ to /C/ on Windows
- Introduce tmpnam() and tmpnam_r() APIs
- Fix more const issues with internal APIs
- Permit utimes() on WIN32 in O_RDONLY mode
- Fix fdopendir() to check fd is a directory
- Fix recent crash regression in landlock make
- Fix futimens(AT_FDCWD, NULL) to return EBADF
- Use workaround so `make -j` doesn't fork bomb
- Rename dontdiscard to __wur (just like glibc)
- Fix st_size for WIN32 symlinks containing UTF-8
- Introduce stdio ext APIs needed by GNU coreutils
- Fix lstat() on WIN32 for symlinks to directories
- Move some constants from normalize.inc to limits.h
- Fix segv with memchr() and memcmp() overlapping page
- Implement POSIX fflush() behavior for reader streams
- Implement AT_SYMLINK_NOFOLLOW for utimensat() on WIN32
- Don't change read-only status of existing files on WIN32
- Correctly handle `0x[^[:xdigit:]]` case in strtol() functions
This commit is contained in:
Justine Tunney 2023-09-06 03:54:42 -07:00
parent 8596e83cce
commit f531acc8f9
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
297 changed files with 1920 additions and 1681 deletions

View file

@ -22,113 +22,34 @@
#include "libc/str/str.h"
#ifndef __aarch64__
/* Shorthand for the compiler's 128-bit byte-mask builtin. The code in this
   file feeds it the result of a vector `==` and XORs the returned value
   with 0xffff, i.e. it is used as a 16-bit mask with one bit per byte
   lane. */
#define PMOVMSKB(x) __builtin_ia32_pmovmskb128(x)
/* Sixteen-byte vector of char. The alignment is declared as 1 so that a
   pointer to this type may be formed from any byte address. */
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
#if defined(__x86_64__) && !defined(__chibicc__)
static dontinline antiquity int memcmp_sse(const unsigned char *p,
const unsigned char *q, size_t n) {
unsigned u;
if (n > 32) {
while (n > 16 + 16) {
if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
n -= 16;
p += 16;
q += 16;
} else {
u = __builtin_ctzl(u);
return p[u] - q[u];
}
}
}
if (!(u = PMOVMSKB(*(xmm_t *)p == *(xmm_t *)q) ^ 0xffff)) {
if (!(u = PMOVMSKB(*(xmm_t *)(p + n - 16) == *(xmm_t *)(q + n - 16)) ^
0xffff)) {
return 0;
} else {
u = __builtin_ctzl(u);
return p[n - 16 + u] - q[n - 16 + u];
}
} else {
u = __builtin_ctzl(u);
return p[u] - q[u];
}
}
/**
 * Compares two buffers, taking 64 bytes per step while input is plentiful.
 *
 * Three phases run in order: (1) while at least 80 bytes remain, four
 * 16-byte lanes are compared at once and their masks are merged into a
 * single 64-bit word; (2) while more than 32 bytes remain, one vector is
 * compared per step; (3) the remainder is covered by a load at the
 * current position and a load anchored at `n - 16`, which may overlap
 * bytes that were already examined.
 *
 * @param p is the first buffer
 * @param q is the second buffer
 * @param n is the byte count; the tail load at `n - 16` implies that
 *     callers pass at least 16
 * @return zero if all `n` bytes match, otherwise the difference between
 *     the first pair of bytes that disagree
 */
_Microarchitecture("avx") static int memcmp_avx(const unsigned char *p,
                                                const unsigned char *q,
                                                size_t n) {
  unsigned diff;
  uint64_t miss;
  const xmm_t *pv, *qv;
  const unsigned char *ptail, *qtail;

  // 64-byte strides; stopping at 80 leaves at least 16 bytes for the tail
  for (; n >= 64 + 16; n -= 64, p += 64, q += 64) {
    pv = (const xmm_t *)p;
    qv = (const xmm_t *)q;
    // a set bit in `miss` marks a byte lane that differs
    miss = ~((uint64_t)PMOVMSKB(pv[0] == qv[0]) << 0 |
             (uint64_t)PMOVMSKB(pv[1] == qv[1]) << 16 |
             (uint64_t)PMOVMSKB(pv[2] == qv[2]) << 32 |
             (uint64_t)PMOVMSKB(pv[3] == qv[3]) << 48);
    if (miss) {
      miss = __builtin_ctzll(miss);  // index of the first differing byte
      return p[miss] - q[miss];
    }
  }

  // 16-byte strides until at most two vectors' worth of input remains
  for (; n > 32; n -= 16, p += 16, q += 16) {
    diff = PMOVMSKB(*(const xmm_t *)p == *(const xmm_t *)q) ^ 0xffff;
    if (diff) {
      diff = __builtin_ctzl(diff);
      return p[diff] - q[diff];
    }
  }

  // leading vector of the remainder
  diff = PMOVMSKB(*(const xmm_t *)p == *(const xmm_t *)q) ^ 0xffff;
  if (diff) {
    diff = __builtin_ctzl(diff);
    return p[diff] - q[diff];
  }

  // trailing vector, ending exactly at the last byte
  ptail = p + n - 16;
  qtail = q + n - 16;
  diff = PMOVMSKB(*(const xmm_t *)ptail == *(const xmm_t *)qtail) ^ 0xffff;
  if (!diff) {
    return 0;
  }
  diff = __builtin_ctzl(diff);
  return ptail[diff] - qtail[diff];
}
#endif /* __x86_64__ */
/**
* Compares memory byte by byte.
*
* memcmp n=0 992 picoseconds
* memcmp n=1 1 ns/byte 738 mb/s
* memcmp n=2 661 ps/byte 1,476 mb/s
* memcmp n=3 551 ps/byte 1,771 mb/s
* memcmp n=4 248 ps/byte 3,936 mb/s
* memcmp n=5 198 ps/byte 4,920 mb/s
* memcmp n=6 165 ps/byte 5,904 mb/s
* memcmp n=7 141 ps/byte 6,889 mb/s
* memcmp n=8 124 ps/byte 7,873 mb/s
* memcmp n=9 110 ps/byte 8,857 mb/s
* memcmp n=15 44 ps/byte 22,143 mb/s
* memcmp n=16 41 ps/byte 23,619 mb/s
* memcmp n=17 77 ps/byte 12,547 mb/s
* memcmp n=31 42 ps/byte 22,881 mb/s
* memcmp n=32 41 ps/byte 23,619 mb/s
* memcmp n=33 60 ps/byte 16,238 mb/s
* memcmp n=80 53 ps/byte 18,169 mb/s
* memcmp n=128 38 ps/byte 25,194 mb/s
* memcmp n=256 32 ps/byte 30,233 mb/s
* memcmp n=16384 27 ps/byte 35,885 mb/s
* memcmp n=32768 29 ps/byte 32,851 mb/s
* memcmp n=131072 33 ps/byte 28,983 mb/s
* memcmp n=0 2 nanoseconds
* memcmp n=1 2 ns/byte 357 mb/s
* memcmp n=2 1 ns/byte 530 mb/s
* memcmp n=3 1 ns/byte 631 mb/s
* 𝗺𝗲𝗺𝗰𝗺𝗽 n=4 1 ns/byte 849 mb/s
* memcmp n=5 816 ps/byte 1,195 mb/s
* memcmp n=6 888 ps/byte 1,098 mb/s
* memcmp n=7 829 ps/byte 1,176 mb/s
* 𝗺𝗲𝗺𝗰𝗺𝗽 n=8 773 ps/byte 1,261 mb/s
* memcmp n=9 629 ps/byte 1,551 mb/s
* memcmp n=15 540 ps/byte 1,805 mb/s
* 𝗺𝗲𝗺𝗰𝗺𝗽 n=16 211 ps/byte 4,623 mb/s
* memcmp n=17 268 ps/byte 3,633 mb/s
* memcmp n=31 277 ps/byte 3,524 mb/s
* memcmp n=32 153 ps/byte 6,351 mb/s
* memcmp n=33 179 ps/byte 5,431 mb/s
* memcmp n=79 148 ps/byte 6,576 mb/s
* 𝗺𝗲𝗺𝗰𝗺𝗽 n=80 81 ps/byte 11 GB/s
* memcmp n=128 76 ps/byte 12 GB/s
* memcmp n=256 60 ps/byte 15 GB/s
* memcmp n=16384 51 ps/byte 18 GB/s
* memcmp n=32768 51 ps/byte 18 GB/s
* memcmp n=131072 52 ps/byte 18 GB/s
*
* @return an integer that's (1) equal to zero if `a` is equal to `b`,
* (2) less than zero if `a` is less than `b`, or (3) greater than
@ -137,62 +58,21 @@ _Microarchitecture("avx") static int memcmp_avx(const unsigned char *p,
*/
int memcmp(const void *a, const void *b, size_t n) {
int c;
unsigned u;
uint32_t k, i, j;
uint64_t w, x, y;
const unsigned char *p, *q;
if ((p = a) == (q = b) || !n) return 0;
if ((c = *p - *q)) return c;
#if defined(__x86_64__) && !defined(__chibicc__)
if (!IsTiny()) {
if (n <= 16) {
if (n >= 8) {
if (!(w = (x = ((uint64_t)p[0] << 000 | (uint64_t)p[1] << 010 |
(uint64_t)p[2] << 020 | (uint64_t)p[3] << 030 |
(uint64_t)p[4] << 040 | (uint64_t)p[5] << 050 |
(uint64_t)p[6] << 060 | (uint64_t)p[7] << 070)) ^
(y = ((uint64_t)q[0] << 000 | (uint64_t)q[1] << 010 |
(uint64_t)q[2] << 020 | (uint64_t)q[3] << 030 |
(uint64_t)q[4] << 040 | (uint64_t)q[5] << 050 |
(uint64_t)q[6] << 060 | (uint64_t)q[7] << 070)))) {
p += n - 8;
q += n - 8;
if (!(w = (x = ((uint64_t)p[0] << 000 | (uint64_t)p[1] << 010 |
(uint64_t)p[2] << 020 | (uint64_t)p[3] << 030 |
(uint64_t)p[4] << 040 | (uint64_t)p[5] << 050 |
(uint64_t)p[6] << 060 | (uint64_t)p[7] << 070)) ^
(y = ((uint64_t)q[0] << 000 | (uint64_t)q[1] << 010 |
(uint64_t)q[2] << 020 | (uint64_t)q[3] << 030 |
(uint64_t)q[4] << 040 | (uint64_t)q[5] << 050 |
(uint64_t)q[6] << 060 | (uint64_t)q[7] << 070)))) {
return 0;
}
}
u = __builtin_ctzll(w);
u = u & -8;
return ((x >> u) & 255) - ((y >> u) & 255);
} else if (n >= 4) {
if (!(k = (i = ((uint32_t)p[0] << 000 | (uint32_t)p[1] << 010 |
(uint32_t)p[2] << 020 | (uint32_t)p[3] << 030)) ^
(j = ((uint32_t)q[0] << 000 | (uint32_t)q[1] << 010 |
(uint32_t)q[2] << 020 | (uint32_t)q[3] << 030)))) {
p += n - 4;
q += n - 4;
if (!(k = (i = ((uint32_t)p[0] << 000 | (uint32_t)p[1] << 010 |
(uint32_t)p[2] << 020 | (uint32_t)p[3] << 030)) ^
(j = ((uint32_t)q[0] << 000 | (uint32_t)q[1] << 010 |
(uint32_t)q[2] << 020 | (uint32_t)q[3] << 030)))) {
return 0;
}
}
u = __builtin_ctzl(k);
u = u & -8;
return ((i >> u) & 255) - ((j >> u) & 255);
}
} else if (LIKELY(X86_HAVE(AVX))) {
return memcmp_avx(p, q, n);
unsigned u;
while (n >= 16 && (((uintptr_t)p & 0xfff) <= 0x1000 - 16 &&
((uintptr_t)q & 0xfff) <= 0x1000 - 16)) {
if (!(u = __builtin_ia32_pmovmskb128(*(xmm_t *)p == *(xmm_t *)q) ^
0xffff)) {
n -= 16;
p += 16;
q += 16;
} else {
return memcmp_sse(p, q, n);
u = __builtin_ctzl(u);
return p[u] - q[u];
}
}
#endif /* __x86_64__ */