Make numerous improvements

- Python static hello world now 1.8mb
- Python static fully loaded now 10mb
- Python HTTPS client now uses MbedTLS
- Python REPL now completes import stmts
- Increase stack size for Python for now
- Begin synthesizing posixpath and ntpath
- Restore Python \N{UNICODE NAME} support
- Restore Python NFKD symbol normalization
- Add optimized code path for Intel SHA-NI
- Get more Python unit tests passing faster
- Get Python help() pagination working on NT
- Python hashlib now supports MbedTLS PBKDF2
- Make memcpy/memmove/memcmp/bcmp/etc. faster
- Add Mersenne Twister and Vigna to LIBC_RAND
- Provide privileged __printf() for error code
- Fix zipos opendir() so that it reports ENOTDIR
- Add basic chmod() implementation for Windows NT
- Add Cosmo's best functions to Python cosmo module
- Pin function trace indent depth to that of caller
- Show memory diagram on invalid access in MODE=dbg
- Differentiate stack overflow on crash in MODE=dbg
- Add stb_truetype and tools for analyzing font files
- Upgrade to UNICODE 13 and reduce its binary footprint
- COMPILE.COM now logs resource usage of build commands
- Start implementing basic poll() support on bare metal
- Set getauxval(AT_EXECFN) to GetModuleFileName() on NT
- Add descriptions to strerror() in non-TINY build modes
- Add COUNTBRANCH() macro to help with micro-optimizations
- Make error / backtrace / asan / memory code more unbreakable
- Add fast perfect C implementation of μ-Law and a-Law audio codecs
- Make strtol() functions consistent with other libc implementations
- Improve Linenoise implementation (see also github.com/jart/bestline)
- COMPILE.COM now suppresses stdout/stderr of successful build commands
This commit is contained in:
Justine Tunney 2021-09-27 22:58:51 -07:00
parent fa7b4f5bd1
commit 39bf41f4eb
806 changed files with 77494 additions and 63859 deletions

View file

@ -16,17 +16,122 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/bits/likely.h"
#include "libc/dce.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
typedef uint64_t xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
/**
 * Tests two buffers for inequality with unaligned 16-byte vector loads.
 *
 * The closing comparison reads the 16 bytes at the current position and
 * the 16 bytes that end at the last byte, so it only stays inside the
 * buffers when at least 16 bytes are supplied. The caller visible in
 * this file only reaches here with n > 16.
 *
 * @param p is the first buffer
 * @param q is the second buffer
 * @param n is the byte count
 * @return 0 if the first n bytes are identical, otherwise 1
 */
static noinline antiquity int memcmp_sse(const char *p, const char *q,
                                         size_t n) {
  xmm_t diff, head, tail;
  /* consume one vector per round until at most 32 bytes remain */
  for (; n > 32; p += 16, q += 16, n -= 16) {
    diff = *(const xmm_t *)p ^ *(const xmm_t *)q;
    if (diff[0] | diff[1]) return 1;
  }
  /* two (possibly overlapping) vectors cover whatever is left */
  head = *(const xmm_t *)p ^ *(const xmm_t *)q;
  tail = *(const xmm_t *)(p + n - 16) ^ *(const xmm_t *)(q + n - 16);
  diff = head | tail;
  return (diff[0] | diff[1]) != 0;
}
/**
 * Tests two buffers for inequality, compiled for the "avx" target.
 *
 * Works in three stages: 64 bytes per round while at least 80 remain,
 * then 16 bytes per round while more than 32 remain, then one final
 * pair of (possibly overlapping) 16-byte loads. As with the SSE
 * variant, the final loads need at least 16 bytes of input.
 *
 * @param p is the first buffer
 * @param q is the second buffer
 * @param n is the byte count
 * @return 0 if the first n bytes are identical, otherwise 1
 */
microarchitecture("avx") static int memcmp_avx(const char *p, const char *q,
                                               size_t n) {
  const xmm_t *pv, *qv;
  xmm_t acc, tail;
  while (n >= 16 + 64) {
    pv = (const xmm_t *)p;
    qv = (const xmm_t *)q;
    /* fold four vector differences into one before testing */
    acc = (pv[0] ^ qv[0]) | (pv[1] ^ qv[1]) | (pv[2] ^ qv[2]) |
          (pv[3] ^ qv[3]);
    if (acc[0] | acc[1]) return 1;
    p += 64;
    q += 64;
    n -= 64;
  }
  while (n > 16 + 16) {
    acc = *(const xmm_t *)p ^ *(const xmm_t *)q;
    if (acc[0] | acc[1]) return 1;
    p += 16;
    q += 16;
    n -= 16;
  }
  acc = *(const xmm_t *)p ^ *(const xmm_t *)q;
  tail = *(const xmm_t *)(p + n - 16) ^ *(const xmm_t *)(q + n - 16);
  acc |= tail;
  return (acc[0] | acc[1]) != 0;
}
/**
* Compares memory.
* Tests inequality of first 𝑛 bytes of 𝑝 and 𝑞.
*
* This API was thought to be nearly extinct until recent versions
* of Clang (c. 2019) started generating synthetic calls to it.
* bcmp n=0 992 picoseconds
* bcmp n=1 992 ps/byte 984 mb/s
* bcmp n=2 661 ps/byte 1,476 mb/s
* bcmp n=3 441 ps/byte 2,214 mb/s
* bcmp n=4 330 ps/byte 2,952 mb/s
* bcmp n=5 264 ps/byte 3,690 mb/s
* bcmp n=6 165 ps/byte 5,905 mb/s
* bcmp n=7 189 ps/byte 5,166 mb/s
* bcmp n=8 124 ps/byte 7,873 mb/s
* bcmp n=9 183 ps/byte 5,314 mb/s
* bcmp n=15 110 ps/byte 8,857 mb/s
* bcmp n=16 62 ps/byte 15,746 mb/s
* bcmp n=17 175 ps/byte 5,577 mb/s
* bcmp n=31 96 ps/byte 10,169 mb/s
* bcmp n=32 93 ps/byte 10,497 mb/s
* bcmp n=33 80 ps/byte 12,179 mb/s
* bcmp n=80 37 ps/byte 26,244 mb/s
* bcmp n=128 36 ps/byte 26,994 mb/s
* bcmp n=256 27 ps/byte 35,992 mb/s
* bcmp n=16384 19 ps/byte 49,411 mb/s
* bcmp n=32768 27 ps/byte 34,914 mb/s
* bcmp n=131072 30 ps/byte 32,303 mb/s
*
* @return 0 if a and b have equal contents, otherwise non-zero
* @return 0 if a and b have equal contents, otherwise nonzero
* @see timingsafe_bcmp()
* @asyncsignalsafe
*/
int bcmp(const void *a, const void *b, size_t n) {
  int c;
  uint32_t i, j;
  uint64_t x, y;
  const char *p, *q;
  /* identical pointers always compare equal */
  if ((p = a) == (q = b)) return 0;
  if (!IsTiny()) {
    if (n <= 16) {
      if (n >= 8) {
        /* 8..16 bytes: first eight and last eight, which may overlap */
        __builtin_memcpy(&x, p, 8);
        __builtin_memcpy(&y, q, 8);
        if (x ^ y) return 1;
        __builtin_memcpy(&x, p + n - 8, 8);
        __builtin_memcpy(&y, q + n - 8, 8);
        return !!(x ^ y);
      } else if (n >= 4) {
        /* 4..7 bytes: first four and last four, which may overlap */
        __builtin_memcpy(&i, p, 4);
        __builtin_memcpy(&j, q, 4);
        if (i ^ j) return 1;
        __builtin_memcpy(&i, p + n - 4, 4);
        __builtin_memcpy(&j, q + n - 4, 4);
        return !!(i ^ j);
      }
      /* 0..3 bytes fall through to the byte loop below */
    } else if (LIKELY(X86_HAVE(AVX))) {
      return memcmp_avx(p, q, n);
    } else {
      return memcmp_sse(p, q, n);
    }
  }
  /* byte-at-a-time fallback, scanning from the end; only zero versus
     nonzero is meaningful in the result */
  while (n--) {
    if ((c = p[n] ^ q[n])) {
      return c;
    }
  }
  return 0;
}

View file

@ -153,6 +153,20 @@ int BLAKE2B256_Final(struct Blake2b *b2b,
return 0;
}
/**
* Computes blake2b 256bit message digest.
*
* blake2b256 n=0 191 nanoseconds
* blake2b256 n=8 23 ns/byte 40,719 kb/s
* blake2b256 n=31 6 ns/byte 153 mb/s
* blake2b256 n=32 6 ns/byte 158 mb/s
* blake2b256 n=63 3 ns/byte 312 mb/s
* blake2b256 n=64 3 ns/byte 317 mb/s
* blake2b256 n=128 1 ns/byte 640 mb/s
* blake2b256 n=256 1 ns/byte 662 mb/s
* blake2b256 n=22851 1 ns/byte 683 mb/s
*
*/
int BLAKE2B256(const void *data, size_t len,
uint8_t out[BLAKE2B256_DIGEST_LENGTH]) {
struct Blake2b ctx;

View file

@ -1,71 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2021 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/str/str.h"
typedef long long xmm_t __attribute__((__vector_size__(16), __aligned__(1)));

/**
 * Sets memory to zero.
 *
 * Sizes up to 16 bytes are cleared with one or two fixed-width stores
 * that may overlap; larger sizes are cleared with 16-byte vector stores
 * working from the end of the region toward its start.
 *
 * @param p is the start of the region
 * @param n is the number of bytes to clear
 */
void bzero(void *p, size_t n) {
  char *dst = p;
  uint64_t zero = 0;
  size_t left;
  if (n > 16) {
    /* two vectors per round from the tail, then the leftover and head */
    for (left = n; left > 32; left -= 32) {
      *(xmm_t *)(dst + left - 16) = (xmm_t){0};
      *(xmm_t *)(dst + left - 32) = (xmm_t){0};
    }
    if (left > 16) *(xmm_t *)(dst + left - 16) = (xmm_t){0};
    *(xmm_t *)dst = (xmm_t){0};
  } else if (n == 16) {
    *(xmm_t *)dst = (xmm_t){0};
  } else if (n >= 8) {
    __builtin_memcpy(dst, &zero, 8);
    __builtin_memcpy(dst + n - 8, &zero, 8);
  } else if (n >= 4) {
    __builtin_memcpy(dst, &zero, 4);
    __builtin_memcpy(dst + n - 4, &zero, 4);
  } else if (n >= 2) {
    __builtin_memcpy(dst, &zero, 2);
    __builtin_memcpy(dst + n - 2, &zero, 2);
  } else if (n == 1) {
    __builtin_memcpy(dst, &zero, 1);
  }
}

View file

@ -23,7 +23,7 @@
// @param edi is init crc32 value
// @param rsi is nullable pointer to data
// @param edx is int size per zlib interface
crc32: movslq %edx,%rdx
crc32: mov %edx,%edx
jmp crc32_z
.endfn crc32,globl
.source __FILE__

View file

@ -16,11 +16,21 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/assert.h"
#include "libc/bits/safemacros.internal.h"
#include "libc/dce.h"
#include "libc/intrin/asan.internal.h"
#include "libc/macros.internal.h"
#include "libc/nexgen32e/crc32.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
static uint32_t kCrc32Tab[256];
/**
 * Assembles a 64-bit word from eight consecutive bytes, least
 * significant byte first.
 *
 * Each byte is masked with 255 so the signed element type cannot
 * sign-extend into the upper bits.
 */
static inline noasan uint64_t WildRead64(const signed char *p) {
  uint64_t word = 0;
  int i;
  for (i = 7; i >= 0; --i) {
    word = word << 8 | (uint64_t)(255 & p[i]);
  }
  return word;
}
/**
* Computes Phil Katz CRC-32 used by zip/zlib/gzip/etc.
@ -34,28 +44,29 @@ static uint32_t kCrc32Tab[256];
* @param h is initial value
*/
uint32_t crc32_z(uint32_t h, const void *data, size_t size) {
  size_t n;
  static bool once;
  const unsigned char *p, *e;
  static uint32_t kCrc32Tab[256];
  /* build the lookup table on the first call only; the flag has to be
     set here, otherwise the table is regenerated on every call */
  if (!once) {
    crc32init(kCrc32Tab, 0xedb88320);
    once = true;
  }
  /* a size of -1 means: treat data as a NUL-terminated string */
  if (size == -1) {
    size = data ? strlen(data) : 0;
  }
  p = data;
  e = p + size;
  h ^= 0xffffffff;
  if (X86_HAVE(PCLMUL)) {
    /* table-driven bytes until p is 16-byte aligned */
    while (((intptr_t)p & 15) && p < e)
      h = h >> 8 ^ kCrc32Tab[(h & 0xff) ^ *p++];
    /* carry-less multiply path takes whole 16-byte blocks, and only
       when at least 64 bytes of them are available */
    if ((n = ROUNDDOWN(e - p, 16)) >= 64) {
      if (IsAsan()) __asan_verify(p, n);
      h = crc32_pclmul(h, p, n); /* 51x faster */
      p += n;
    }
  }
  /* whatever remains goes through the table one byte at a time */
  while (p < e) h = h >> 8 ^ kCrc32Tab[(h & 0xff) ^ *p++];
  return h ^ 0xffffffff;
}

View file

@ -152,8 +152,21 @@ static void ProcessAll(const uint8_t *data, size_t size, const uint64_t key[4],
if ((size & 31) != 0) HighwayHashUpdateRemainder(data + i, size & 31, state);
}
uint64_t HighwayHash64(const uint8_t *data, size_t size,
const uint64_t key[4]) {
/**
* Computes Highway Hash.
*
* highwayhash64 n=0 121 nanoseconds
* highwayhash64 n=8 16 ns/byte 59,865 kb/s
* highwayhash64 n=31 4 ns/byte 222 mb/s
* highwayhash64 n=32 3 ns/byte 248 mb/s
* highwayhash64 n=63 2 ns/byte 387 mb/s
* highwayhash64 n=64 2 ns/byte 422 mb/s
* highwayhash64 n=128 1 ns/byte 644 mb/s
* highwayhash64 n=256 1 ns/byte 875 mb/s
* highwayhash64 n=22851 721 ps/byte 1,354 mb/s
*
*/
uint64_t HighwayHash64(const void *data, size_t size, const uint64_t key[4]) {
HighwayHashState state;
ProcessAll(data, size, key, &state);
return HighwayHashFinalize64(&state);

View file

@ -3,7 +3,7 @@
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
uint64_t HighwayHash64(const uint8_t *, size_t, const uint64_t[4]);
uint64_t HighwayHash64(const void *, size_t, const uint64_t[4]);
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */

View file

@ -21,20 +21,51 @@
/**
* Compares memory case-insensitively.
*
* memcasecmp n=0 992 picoseconds
* memcasecmp n=1 1 ns/byte 590 mb/s
* memcasecmp n=2 1 ns/byte 843 mb/s
* memcasecmp n=3 1 ns/byte 885 mb/s
* memcasecmp n=4 1 ns/byte 843 mb/s
* memcasecmp n=5 1 ns/byte 820 mb/s
* memcasecmp n=6 1 ns/byte 770 mb/s
* memcasecmp n=7 1 ns/byte 765 mb/s
* memcasecmp n=8 206 ps/byte 4,724 mb/s
* memcasecmp n=9 220 ps/byte 4,428 mb/s
* memcasecmp n=15 617 ps/byte 1,581 mb/s
* memcasecmp n=16 124 ps/byte 7,873 mb/s
* memcasecmp n=17 155 ps/byte 6,274 mb/s
* memcasecmp n=31 341 ps/byte 2,860 mb/s
* memcasecmp n=32 82 ps/byte 11,810 mb/s
* memcasecmp n=33 100 ps/byte 9,743 mb/s
* memcasecmp n=80 53 ps/byte 18,169 mb/s
* memcasecmp n=128 49 ps/byte 19,890 mb/s
* memcasecmp n=256 45 ps/byte 21,595 mb/s
* memcasecmp n=16384 42 ps/byte 22,721 mb/s
* memcasecmp n=32768 40 ps/byte 24,266 mb/s
* memcasecmp n=131072 40 ps/byte 24,337 mb/s
*
* @return is <0, 0, or >0 based on uint8_t comparison
*/
int memcasecmp(const void *p, const void *q, size_t n) {
int c;
size_t i;
unsigned u;
uint64_t w;
const unsigned char *a, *b;
if ((a = p) != (b = q)) {
for (i = 0; i < n; ++i) {
while (i + 8 <= n) {
w = READ64LE(a);
w ^= READ64LE(b);
if (w) {
i += (unsigned)__builtin_ctzll(w) >> 3;
if ((w = (((uint64_t)a[0] << 000 | (uint64_t)a[1] << 010 |
(uint64_t)a[2] << 020 | (uint64_t)a[3] << 030 |
(uint64_t)a[4] << 040 | (uint64_t)a[5] << 050 |
(uint64_t)a[6] << 060 | (uint64_t)a[7] << 070) ^
((uint64_t)b[0] << 000 | (uint64_t)b[1] << 010 |
(uint64_t)b[2] << 020 | (uint64_t)b[3] << 030 |
(uint64_t)b[4] << 040 | (uint64_t)b[5] << 050 |
(uint64_t)b[6] << 060 | (uint64_t)b[7] << 070)))) {
u = __builtin_ctzll(w);
i += u >> 3;
break;
} else {
i += 8;

View file

@ -1,146 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/bits/bits.h"
#include "libc/dce.h"
#include "libc/intrin/pcmpeqb.h"
#include "libc/intrin/pmovmskb.h"
#include "libc/nexgen32e/bsf.h"
#include "libc/str/str.h"
/* XORs the first n (at most eight) bytes of p and q into one word, with
   byte k occupying bits 8k..8k+7. */
static inline uint64_t memcmp_diffword(const uint8_t *p, const uint8_t *q,
                                       size_t n) {
  uint64_t w = 0;
  size_t k;
  for (k = 0; k < n; ++k) {
    w |= (uint64_t)(p[k] ^ q[k]) << (k << 3);
  }
  return w;
}

/**
 * Compares two byte ranges.
 *
 * @param a is the first buffer
 * @param b is the second buffer
 * @param n is how many bytes to compare
 * @return 0 if the ranges are equal, otherwise the difference between
 *     the first pair of mismatching bytes, read as unsigned char
 * @asyncsignalsafe
 */
int memcmp(const void *a, const void *b, size_t n) {
  int c;
  uint64_t w;
  unsigned m;
  uint8_t A[16], B[16];
  const uint8_t *p = a, *q = b;
  if (p == q) return 0;
  if (IsTiny()) {
    /* compact variant: eight bytes per round, then single bytes */
    while (n >= 8) {
      if ((w = READ64LE(p) ^ READ64LE(q))) {
        m = bsfl(w) >> 3;
        return p[m] - q[m];
      }
      p += 8;
      q += 8;
      n -= 8;
    }
    while (n) {
      if ((c = *p - *q)) {
        return c;
      }
      ++p;
      ++q;
      --n;
    }
    return 0;
  }
  /* sixteen bytes per round via a byte-equality mask */
  while (n >= 16) {
    memcpy(A, p, 16);
    memcpy(B, q, 16);
    pcmpeqb(A, A, B);
    if ((m = pmovmskb(A) - 0xffff)) {
      m = bsf(m);
      return p[m] - q[m];
    }
    p += 16;
    q += 16;
    n -= 16;
  }
  /* nine to fifteen bytes left: settle the first eight now */
  if (n > 8) {
    if ((w = memcmp_diffword(p, q, 8))) {
      m = bsfl(w) >> 3;
      return p[m] - q[m];
    }
    p += 8;
    q += 8;
    n -= 8;
  }
  /* at most eight bytes remain */
  if (!(w = memcmp_diffword(p, q, n))) return 0;
  m = bsfl(w) >> 3;
  return p[m] - q[m];
}

View file

@ -1,160 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2021 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/str/str.h"
typedef long long xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
/* Copies at most 32 bytes. Every load happens before any store, so the
   two ranges are allowed to overlap. */
static inline void memmove_pure_small(char *d, const char *s, size_t n) {
  xmm_t v, w;
  uint64_t a, b;
  uint32_t e, f;
  uint16_t g, h;
  if (n > 16) {
    __builtin_memcpy(&v, s, 16);
    __builtin_memcpy(&w, s + n - 16, 16);
    __builtin_memcpy(d, &v, 16);
    __builtin_memcpy(d + n - 16, &w, 16);
  } else if (n == 16) {
    __builtin_memcpy(&v, s, 16);
    __builtin_memcpy(d, &v, 16);
  } else if (n >= 8) {
    __builtin_memcpy(&a, s, 8);
    __builtin_memcpy(&b, s + n - 8, 8);
    __builtin_memcpy(d, &a, 8);
    __builtin_memcpy(d + n - 8, &b, 8);
  } else if (n >= 4) {
    __builtin_memcpy(&e, s, 4);
    __builtin_memcpy(&f, s + n - 4, 4);
    __builtin_memcpy(d, &e, 4);
    __builtin_memcpy(d + n - 4, &f, 4);
  } else if (n >= 2) {
    __builtin_memcpy(&g, s, 2);
    __builtin_memcpy(&h, s + n - 2, 2);
    __builtin_memcpy(d, &g, 2);
    __builtin_memcpy(d + n - 2, &h, 2);
  } else if (n == 1) {
    *d = *s;
  }
}

/**
 * Copies memory between ranges that may overlap.
 *
 * Sizes up to 32 bytes are handled by loading the head and tail of the
 * source and then storing both. Larger sizes move 32 bytes per round,
 * walking backward when the destination lies above the source and
 * forward otherwise, and finish with the same small-size routine.
 *
 * @param dst is destination
 * @param src is memory to copy
 * @param n is number of bytes to copy
 * @return dst
 * @asyncsignalsafe
 */
void *memmove_pure(void *dst, const void *src, size_t n) {
  char *d = dst;
  const char *s = src;
  xmm_t lo, hi;
  size_t i;
  if (n <= 32) {
    memmove_pure_small(d, s, n);
    return dst;
  }
  if (d > s) {
    /* destination above source: copy from the end so that bytes not
       yet read are never overwritten */
    while (n >= 32) {
      n -= 32;
      __builtin_memcpy(&lo, s + n, 16);
      __builtin_memcpy(&hi, s + n + 16, 16);
      __builtin_memcpy(d + n, &lo, 16);
      __builtin_memcpy(d + n + 16, &hi, 16);
    }
  } else {
    for (i = 0; i + 32 <= n; i += 32) {
      __builtin_memcpy(&lo, s + i, 16);
      __builtin_memcpy(&hi, s + i + 16, 16);
      __builtin_memcpy(d + i, &lo, 16);
      __builtin_memcpy(d + i + 16, &hi, 16);
    }
    d += i;
    s += i;
    n -= i;
  }
  /* fewer than 32 bytes are left at d/s */
  memmove_pure_small(d, s, n);
  return dst;
}

View file

@ -1,100 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2021 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/str/str.h"
/* Fills at most 16 bytes at b from the replicated pattern word x, using
   one or two fixed-width stores that may overlap. */
static inline void memset_pure_small(char *b, const uint64_t *x, size_t n) {
  if (n >= 8) {
    __builtin_memcpy(b, x, 8);
    __builtin_memcpy(b + n - 8, x, 8);
  } else if (n >= 4) {
    __builtin_memcpy(b, x, 4);
    __builtin_memcpy(b + n - 4, x, 4);
  } else if (n >= 2) {
    __builtin_memcpy(b, x, 2);
    __builtin_memcpy(b + n - 2, x, 2);
  } else if (n == 1) {
    __builtin_memcpy(b, x, 1);
  }
}

/**
 * Sets memory.
 *
 * The fill byte is replicated across a 64-bit word. Regions longer
 * than 16 bytes are filled 16 bytes per round from the end toward the
 * start; what is left at the front (at most 16 bytes) is finished with
 * overlapping fixed-width stores.
 *
 * @param p is memory address
 * @param c is masked with 255 and used as repeated byte
 * @param n is byte length
 * @return p
 * @asyncsignalsafe
 */
void *memset_pure(void *p, int c, size_t n) {
  char *b = p;
  uint64_t x = 0x0101010101010101ul * (c & 0xff);
  if (n > 16) {
    while (n >= 16) {
      n -= 16;
      __builtin_memcpy(b + n, &x, 8);
      /* compiler barrier between the two halves, kept from the original;
         NOTE(review): presumably stops the pair being fused or turned
         into a memset call -- confirm */
      asm volatile("" ::: "memory");
      __builtin_memcpy(b + n + 8, &x, 8);
    }
  }
  memset_pure_small(b, &x, n);
  return p;
}

View file

@ -16,9 +16,13 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/nexgen32e/kompressor.h"
#include "libc/str/str.h"
void *mempcpy_pure(void *dst, const void *src, size_t n) {
memmove_pure(dst, src, n);
return (char *)dst + n;
/**
 * Expands a run-length table into memory.
 *
 * Entries are consumed in order; each one writes `repititions` copies
 * of `byte`. An entry whose `repititions` field is zero ends the table.
 *
 * @param d is the output buffer
 * @param r is the first table entry
 */
void rldecode2(void *d, const struct RlDecode *r) {
  char *out = d;
  while (r->repititions) {
    memset(out, r->byte, r->repititions);
    out += r->repititions;
    ++r;
  }
}

View file

@ -36,7 +36,7 @@
char *stpncpy(char *dest, const char *src, size_t stride) {
  char *p;
  /* memccpy() returns the address just past the copied NUL byte, or
     null when no NUL occurs within the first stride bytes of src */
  if ((p = memccpy(dest, src, '\0', stride))) {
    /* zero-fill the remainder of the field exactly once */
    bzero(p, dest + stride - p);
  }
  /* NOTE(review): POSIX specifies the address of the first NUL written
     when there is one; this always returns dest + stride -- confirm
     callers before changing */
  return dest + stride;
}

View file

@ -81,15 +81,20 @@ wint_t towupper(wint_t);
cosmopolitan § strings
*/
void bzero(void *, size_t) libcesque;
void bzero(void *, size_t) memcpyesque;
void *memset(void *, int, size_t) memcpyesque;
void *memmove(void *, const void *, size_t) memcpyesque;
void *memcpy(void *restrict, const void *restrict, size_t) memcpyesque;
void *mempcpy(void *restrict, const void *restrict, size_t) memcpyesque;
void *memccpy(void *restrict, const void *restrict, int, size_t) memcpyesque;
void *memmove(void *, const void *, size_t) memcpyesque;
void *memeqmask(void *, const void *, const void *, size_t) memcpyesque;
void explicit_bzero(void *, size_t);
int bcmp(const void *, const void *, size_t) strlenesque;
int memcmp(const void *, const void *, size_t) strlenesque;
int timingsafe_bcmp(const void *, const void *, size_t);
int timingsafe_memcmp(const void *, const void *, size_t);
size_t strlen(const char *) strlenesque;
size_t strnlen(const char *, size_t) strlenesque;
size_t strnlen_s(const char *, size_t);
@ -115,7 +120,6 @@ char *strstr(const char *, const char *) strlenesque;
char16_t *strstr16(const char16_t *, const char16_t *) strlenesque;
wchar_t *wcsstr(const wchar_t *, const wchar_t *) strlenesque;
void *rawwmemchr(const void *, wchar_t) strlenesque returnsnonnull;
int memcmp(const void *, const void *, size_t) strlenesque;
int strcmp(const char *, const char *) strlenesque;
int strncmp(const char *, const char *, size_t) strlenesque;
int strcmp16(const char16_t *, const char16_t *) strlenesque;
@ -182,10 +186,9 @@ const char *IndexDoubleNulString(const char *, unsigned) strlenesque;
int strverscmp(const char *, const char *);
wchar_t *wmemset(wchar_t *, wchar_t, size_t) memcpyesque;
char16_t *memset16(char16_t *, char16_t, size_t) memcpyesque;
compatfn wchar_t *wmemcpy(wchar_t *, const wchar_t *, size_t) memcpyesque;
compatfn wchar_t *wmempcpy(wchar_t *, const wchar_t *, size_t) memcpyesque;
compatfn wchar_t *wmemmove(wchar_t *, const wchar_t *, size_t) memcpyesque;
int timingsafe_memcmp(const void *, const void *, size_t);
wchar_t *wmemcpy(wchar_t *, const wchar_t *, size_t) memcpyesque;
wchar_t *wmempcpy(wchar_t *, const wchar_t *, size_t) memcpyesque;
wchar_t *wmemmove(wchar_t *, const wchar_t *, size_t) memcpyesque;
void *tinymemccpy(void *, const void *, int, size_t) memcpyesque;
void *memmem(const void *, size_t, const void *, size_t) libcesque nosideeffect;
char *strerror(int) returnsnonnull nothrow nocallback;
@ -203,11 +206,6 @@ bool IsText(const void *, size_t);
bool IsUtf8(const void *, size_t);
bool _isabspath(const char *) strlenesque;
bool escapedos(char16_t *, unsigned, const char16_t *, unsigned);
void *memset_pure(void *, int, size_t) memcpyesque;
void *memmove_pure(void *, const void *, size_t) memcpyesque;
void *mempcpy_pure(void *, const void *, size_t) memcpyesque;
size_t strlen_pure(const char *) strlenesque;
size_t strcspn_pure(const char *, const char *) strlenesque;
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § strings » multibyte
@ -262,133 +260,11 @@ int iswctype(wint_t, wctype_t) pureconst;
char *strsignal(int) returnsnonnull libcesque;
#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § strings » optimizations
*/
/* True when SIZE is a compile-time constant that is a power of two
   (exactly one bit set) no larger than __BIGGEST_ALIGNMENT__. */
#define __memcpy_isgoodsize(SIZE) \
(__builtin_constant_p(SIZE) && ((SIZE) <= __BIGGEST_ALIGNMENT__ && \
__builtin_popcountl((unsigned)(SIZE)) == 1))
/* True when SIZE is a compile-time constant that is either a power of
   two no larger than __BIGGEST_ALIGNMENT__, or a whole multiple of
   __BIGGEST_ALIGNMENT__ up to three times that value. */
#define __memset_isgoodsize(SIZE) \
(__builtin_constant_p(SIZE) && \
(((SIZE) <= __BIGGEST_ALIGNMENT__ && \
__builtin_popcountl((unsigned)(SIZE)) == 1) || \
((SIZE) % __BIGGEST_ALIGNMENT__ == 0 && \
(SIZE) / __BIGGEST_ALIGNMENT__ <= 3)))
/* Uses the compiler builtin when SIZE passes __memcpy_isgoodsize(),
   otherwise goes through the __memcpy() helper macro with the "MemCpy"
   entry point. */
#define memcpy(DEST, SRC, SIZE) \
(__memcpy_isgoodsize(SIZE) ? __builtin_memcpy(DEST, SRC, SIZE) \
: __memcpy("MemCpy", DEST, SRC, SIZE))
/* Same selection for memset: builtin for sizes accepted by
   __memset_isgoodsize(), the __memset() helper macro otherwise. */
#define memset(DEST, BYTE, SIZE) \
(__memset_isgoodsize(SIZE) ? __builtin_memset(DEST, BYTE, SIZE) \
: __memset(DEST, BYTE, SIZE))
#if defined(__STDC_HOSTED__) && defined(__SSE2__)
#define strlen(STR) \
(__builtin_constant_p(STR) ? __builtin_strlen(STR) : ({ \
size_t LeN; \
const char *StR = (STR); \
asm("call\tstrlen" \
: "=a"(LeN) \
: "D"(StR), "m"(*(char(*)[0x7fffffff])StR) \
: "rcx", "rdx", "xmm3", "xmm4", "cc"); \
LeN; \
}))
#define memmove(DEST, SRC, SIZE) __memcpy("MemMove", (DEST), (SRC), (SIZE))
#define mempcpy(DEST, SRC, SIZE) \
({ \
size_t SIze = (SIZE); \
(void *)((char *)memcpy((DEST), (SRC), SIze) + SIze); \
})
#define __memcpy(FN, DEST, SRC, SIZE) \
({ \
void *DeSt = (DEST); \
const void *SrC = (SRC); \
size_t SiZe = (SIZE); \
asm("call\t" FN \
: "=m"(*(char(*)[SiZe])(DeSt)) \
: "D"(DeSt), "S"(SrC), "d"(SiZe), "m"(*(const char(*)[SiZe])(SrC)) \
: "xmm3", "xmm4", "rcx", "cc"); \
DeSt; \
})
#define __memset(DEST, BYTE, SIZE) \
({ \
void *DeSt = (DEST); \
size_t SiZe = (SIZE); \
asm("call\tMemSet" \
: "=m"(*(char(*)[SiZe])(DeSt)) \
: "D"(DeSt), "S"(BYTE), "d"(SiZe) \
: "xmm3", "xmm4", "rcx", "cc"); \
DeSt; \
})
#else /* hosted+sse2 */
#define mempcpy(DEST, SRC, SIZE) \
({ \
void *Rdi, *Dest = (DEST); \
const void *Rsi, *Src = (SRC); \
size_t SiZe = (SIZE); \
size_t Rcx; \
asm("rep movsb" \
: "=D"(Rdi), "=S"(Rsi), "=c"(Rcx), "=m"(*(char(*)[SiZe])(Dest)) \
: "0"(Dest), "1"(Src), "2"(SiZe), "m"(*(const char(*)[SiZe])(Src)) \
: "cc"); \
Rdi; \
})
#define __memcpy(FN, DEST, SRC, SIZE) \
({ \
void *Rdi, *Dest = (DEST); \
const void *Rsi, *Src = (SRC); \
size_t SiZe = (SIZE); \
size_t Rcx; \
asm("rep movsb" \
: "=D"(Rdi), "=S"(Rsi), "=c"(Rcx), "=m"(*(char(*)[SiZe])(Dest)) \
: "0"(Dest), "1"(Src), "2"(SiZe), "m"(*(const char(*)[SiZe])(Src)) \
: "cc"); \
Dest; \
})
#define __memset(DEST, BYTE, SIZE) \
({ \
void *Rdi, *Dest = (DEST); \
size_t SiZe = (SIZE); \
size_t Rcx; \
asm("rep stosb" \
: "=D"(Rdi), "=c"(Rcx), "=m"(*(char(*)[SiZe])(Dest)) \
: "0"(Dest), "1"(SiZe), "a"(BYTE) \
: "cc"); \
Dest; \
})
#endif /* hosted/sse2/unbloat */
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § strings » address sanitizer
*/
#if defined(__FSANITIZE_ADDRESS__)
#undef memcpy
#undef memmove
#undef mempcpy
#undef memset
#undef strlen
#define memcpy memmove_pure
#define memmove memmove_pure
#define mempcpy mempcpy_pure
#define memset memset_pure
#define strcspn strcspn_pure
#define strlen strlen_pure
#endif /* __FSANITIZE_ADDRESS__ */
/* gcc rewrites to memset otherwise :'( */
/* __bzero is declared with the assembler name "bzero", so a call through
   it emits a reference to the bzero symbol. The macro below sends
   compile-time-constant sizes to memset(DEST, 0, SIZE) and every other
   size to __bzero(); the result is cast to void in both cases. */
void __bzero(void *, size_t) asm("bzero") memcpyesque;
#define bzero(DEST, SIZE) \
((void)((__builtin_constant_p(SIZE)) ? memset(DEST, 0, SIZE) \
: __bzero(DEST, SIZE)))
#endif /* __GNUC__ && !__STRICT_ANSI__ */
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */

View file

@ -31,7 +31,8 @@ LIBC_STR_A_DIRECTDEPS = \
LIBC_INTRIN \
LIBC_STUBS \
LIBC_SYSV \
LIBC_NEXGEN32E
LIBC_NEXGEN32E \
THIRD_PARTY_COMPILER_RT
LIBC_STR_A_DEPS := \
$(call uniq,$(foreach x,$(LIBC_STR_A_DIRECTDEPS),$($(x))))
@ -48,15 +49,10 @@ o/$(MODE)/libc/str/memmem.o: \
OVERRIDE_CPPFLAGS += \
-DSTACK_FRAME_UNLIMITED
o//libc/str/bzero.o: \
OVERRIDE_CFLAGS += \
-O2
o/$(MODE)/libc/str/dosdatetimetounix.o: \
OVERRIDE_CFLAGS += \
-O3
o//libc/str/memcmp.o \
o/$(MODE)/libc/str/getzipcdir.o \
o/$(MODE)/libc/str/getzipcdircomment.o \
o/$(MODE)/libc/str/getzipcdircommentsize.o \
@ -88,11 +84,6 @@ o/$(MODE)/libc/str/windowstimetotimespec.o: \
OVERRIDE_CFLAGS += \
-O3
o/$(MODE)/libc/str/hey-gcc.asm \
o/$(MODE)/libc/str/hey.o: \
OVERRIDE_CFLAGS += \
-fsanitize=undefined
LIBC_STR_LIBS = $(foreach x,$(LIBC_STR_ARTIFACTS),$($(x)))
LIBC_STR_SRCS = $(foreach x,$(LIBC_STR_ARTIFACTS),$($(x)_SRCS))
LIBC_STR_HDRS = $(foreach x,$(LIBC_STR_ARTIFACTS),$($(x)_HDRS))
@ -101,7 +92,7 @@ LIBC_STR_BINS = $(foreach x,$(LIBC_STR_ARTIFACTS),$($(x)_BINS))
LIBC_STR_CHECKS = $(foreach x,$(LIBC_STR_ARTIFACTS),$($(x)_CHECKS))
LIBC_STR_OBJS = $(foreach x,$(LIBC_STR_ARTIFACTS),$($(x)_OBJS))
LIBC_STR_TESTS = $(foreach x,$(LIBC_STR_ARTIFACTS),$($(x)_TESTS))
$(LIBC_STR_OBJS): $(BUILD_FILES) libc/str/str.mk
# $(LIBC_STR_OBJS): $(BUILD_FILES) libc/str/str.mk
.PHONY: o/$(MODE)/libc/str
o/$(MODE)/libc/str: $(LIBC_STR_CHECKS)

View file

@ -36,13 +36,13 @@ static noasan inline const char *strchr_x64(const char *p, uint64_t c) {
if (a <= b) {
return p + (a >> 3);
} else {
return NULL;
return 0;
}
} else {
return p + (a >> 3);
}
} else {
return NULL;
return 0;
}
}
}

View file

@ -34,12 +34,12 @@
* @see strspn(), strtok_r()
* @asyncsignalsafe
*/
size_t strcspn_pure(const char *s, const char *reject) {
size_t strcspn(const char *s, const char *reject) {
size_t i, n;
unsigned m;
char cv[16], sv[16];
if ((n = strlen(reject)) < 16) {
memset(sv, 0, 16);
bzero(sv, 16);
memcpy(sv, reject, n);
for (i = 0;; ++i) {
cv[0] = s[i];

View file

@ -1,45 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2021 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/assert.h"
#include "libc/bits/bits.h"
#include "libc/str/str.h"
/**
 * Scans for the terminating NUL eight bytes at a time.
 *
 * The caller in this file only invokes this once `s + i` is 8-byte
 * aligned. Each word is reduced to a mask whose lowest set bit marks
 * the first zero byte, and the bit index is divided by eight to get
 * the byte offset within the word.
 *
 * @param s is the start of the string
 * @param i is the byte offset at which word-wise scanning begins
 * @return offset from `s` of the first NUL byte at or after `i`
 */
static inline noasan size_t strlen_pure_x64(const char *s, size_t i) {
  const uint64_t kLowBits = 0x0101010101010101;
  const uint64_t kHighBits = 0x8080808080808080;
  uint64_t word, zeros;
  for (;;) {
    word = READ64LE(s + i);
    /* high bit of a byte survives only where that byte was zero
       (bytes above the first zero may be flagged too, which is
       harmless because only the lowest set bit is used) */
    zeros = ~word & (word - kLowBits) & kHighBits;
    if (zeros) break;
    i += 8;
  }
  return i + ((unsigned)__builtin_ctzll(zeros) >> 3);
}
/**
 * Computes length of NUL-terminated string.
 *
 * Bytes are examined one at a time until the cursor reaches an 8-byte
 * boundary; the remainder is handed to the word-at-a-time scanner.
 *
 * @param s is a NUL-terminated string
 * @return number of bytes that precede the terminating NUL
 */
size_t strlen_pure(const char *s) {
  size_t n = 0;
  while ((uintptr_t)(s + n) & 7) {
    if (!s[n]) return n;
    ++n;
  }
  n = strlen_pure_x64(s, n);
  /* sanity checks: a nonzero length implies a nonempty string, and
     the byte at the returned offset must be the terminator */
  assert(!n || s[0]);
  assert(!s[n]);
  return n;
}

View file

@ -66,6 +66,6 @@ char *strncpy(char *dest, const char *src, size_t stride) {
for (i = 0; i < stride; ++i) {
if (!(dest[i] = src[i])) break;
}
memset(dest + i, 0, stride - i);
bzero(dest + i, stride - i);
return dest;
}

154
libc/str/timingsafe_bcmp.c Normal file
View file

@ -0,0 +1,154 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2021 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/bits/likely.h"
#include "libc/dce.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
/* 16-byte vector holding two uint64_t lanes; __aligned__(1) lowers the
   type's alignment requirement to one byte so it can be loaded through
   pointers that are not 16-byte aligned */
typedef uint64_t xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
/**
 * Compares 𝑛 bytes of 𝑝 and 𝑞 using 16-byte vector loads.
 *
 * Every 16-byte lane is XOR'd and OR'd into one accumulator, so there
 * is no early exit on the first difference. The final load is taken
 * from `p + n - 16`, which means 𝑛 must be at least 16; the only
 * caller in this file passes 𝑛 > 16.
 *
 * @return nonzero if any byte differs, otherwise zero
 */
static noinline antiquity unsigned timingsafe_bcmp_sse(const char *p,
                                                       const char *q,
                                                       size_t n) {
  xmm_t diff = {0};
  uint64_t folded;
  for (; n > 16 + 16; n -= 16, p += 16, q += 16) {
    diff |= *(const xmm_t *)p ^ *(const xmm_t *)q;
  }
  /* at most 32 bytes remain: a head load plus a (possibly overlapping)
     tail load covers all of them */
  diff |= *(const xmm_t *)p ^ *(const xmm_t *)q;
  diff |= *(const xmm_t *)(p + n - 16) ^ *(const xmm_t *)(q + n - 16);
  /* collapse 128 bits to 64, then fold the high half onto the low half
     so a set bit anywhere survives truncation to 32 bits */
  folded = diff[0] | diff[1];
  return folded | folded >> 32;
}
/**
 * Compares 𝑛 bytes of 𝑝 and 𝑞 using 16-byte vector loads, unrolled 4x.
 *
 * Differences are accumulated with XOR/OR and never cause an early
 * exit. The final load reads from `p + n - 16`, so 𝑛 must be at least
 * 16; the only caller in this file passes 𝑛 > 16.
 *
 * NOTE(review): microarchitecture("avx") is a project macro; presumably
 * it enables AVX code generation for this function only -- confirm.
 *
 * @return nonzero if any byte differs, otherwise zero
 */
microarchitecture("avx") static int timingsafe_bcmp_avx(const char *p,
const char *q,
size_t n) {
uint64_t w;
xmm_t a = {0};
if (n > 32) {
if (n >= 16 + 64) {
/* four independent accumulators, 64 bytes per iteration; the loop
   stops while at least 16 bytes are still left for the tail load */
xmm_t b = {0};
xmm_t c = {0};
xmm_t d = {0};
do {
a |= ((const xmm_t *)p)[0] ^ ((const xmm_t *)q)[0];
b |= ((const xmm_t *)p)[1] ^ ((const xmm_t *)q)[1];
c |= ((const xmm_t *)p)[2] ^ ((const xmm_t *)q)[2];
d |= ((const xmm_t *)p)[3] ^ ((const xmm_t *)q)[3];
p += 64;
q += 64;
n -= 64;
} while (n >= 16 + 64);
/* merge the four accumulators back into one */
a = a | b | c | d;
}
/* consume 16 bytes at a time until no more than 32 remain */
while (n > 32) {
a |= *(const xmm_t *)p ^ *(const xmm_t *)q;
p += 16;
q += 16;
n -= 16;
}
}
/* head load plus a (possibly overlapping) tail load cover the rest */
a |= *(const xmm_t *)p ^ *(const xmm_t *)q;
a |= *(const xmm_t *)(p + n - 16) ^ *(const xmm_t *)(q + n - 16);
/* fold 128 bits to 64, then high half onto low half, so a set bit
   anywhere survives the conversion to int */
w = a[0] | a[1];
return w | w >> 32;
}
/**
* Tests inequality of first 𝑛 bytes of 𝑝 and 𝑞.
*
* The following expression:
*
* !!timingsafe_bcmp(p, q, n)
*
* Is functionally equivalent to:
*
* !!memcmp(p, q, n)
*
* This function is faster than memcmp() and bcmp() when byte sequences
* are assumed to always be the same; that makes it best for assertions
* or hash table lookups, assuming 𝑛 is variable (since no gcc builtin)
*
* timingsafe_bcmp n=0 992 picoseconds
* timingsafe_bcmp n=1 1 ns/byte 738 mb/s
* timingsafe_bcmp n=2 826 ps/byte 1,181 mb/s
* timingsafe_bcmp n=3 661 ps/byte 1,476 mb/s
* timingsafe_bcmp n=4 330 ps/byte 2,952 mb/s
* timingsafe_bcmp n=5 264 ps/byte 3,690 mb/s
* timingsafe_bcmp n=6 220 ps/byte 4,428 mb/s
* timingsafe_bcmp n=7 189 ps/byte 5,166 mb/s
* timingsafe_bcmp n=8 124 ps/byte 7,873 mb/s
* timingsafe_bcmp n=9 147 ps/byte 6,643 mb/s
* timingsafe_bcmp n=15 88 ps/byte 11,072 mb/s
* timingsafe_bcmp n=16 62 ps/byte 15,746 mb/s
* timingsafe_bcmp n=17 136 ps/byte 7,170 mb/s
* timingsafe_bcmp n=31 74 ps/byte 13,075 mb/s
* timingsafe_bcmp n=32 72 ps/byte 13,497 mb/s
* timingsafe_bcmp n=33 80 ps/byte 12,179 mb/s
* timingsafe_bcmp n=80 57 ps/byte 16,871 mb/s
* timingsafe_bcmp n=128 49 ps/byte 19,890 mb/s
* timingsafe_bcmp n=256 31 ps/byte 31,493 mb/s
* timingsafe_bcmp n=16384 14 ps/byte 67,941 mb/s
* timingsafe_bcmp n=32768 29 ps/byte 33,121 mb/s
* timingsafe_bcmp n=131072 29 ps/byte 32,949 mb/s
*
* Running time is independent of the byte sequences compared, making
* this safe to use for comparing secret values such as cryptographic
* MACs. In contrast, memcmp() may short-circuit after finding the first
* differing byte.
*
* @return nonzero if unequal, otherwise zero
* @see timingsafe_memcmp()
* @asyncsignalsafe
*/
int timingsafe_bcmp(const void *a, const void *b, size_t n) {
  const char *p = a, *q = b;
  uint32_t acc, lo_p, lo_q, hi_p, hi_q;
  uint64_t fold, head_p, head_q, tail_p, tail_q;
  if (!IsTiny() && n >= 4) {
    if (n > 16) {
      /* longer inputs are handed to the vectorized helpers */
      if (X86_HAVE(AVX)) {
        return timingsafe_bcmp_avx(p, q, n);
      }
      return timingsafe_bcmp_sse(p, q, n);
    }
    if (n >= 8) {
      /* 8..16 bytes: first and last 64-bit words (which may overlap)
         cover the whole range */
      __builtin_memcpy(&head_p, p, 8);
      __builtin_memcpy(&head_q, q, 8);
      __builtin_memcpy(&tail_p, p + n - 8, 8);
      __builtin_memcpy(&tail_q, q + n - 8, 8);
      fold = (head_p ^ head_q) | (tail_p ^ tail_q);
      /* fold high half onto low half so truncation to int keeps any
         difference */
      return fold | fold >> 32;
    }
    /* 4..7 bytes: same idea with 32-bit words */
    __builtin_memcpy(&lo_p, p, 4);
    __builtin_memcpy(&lo_q, q, 4);
    __builtin_memcpy(&hi_p, p + n - 4, 4);
    __builtin_memcpy(&hi_q, q + n - 4, 4);
    return (lo_p ^ lo_q) | (hi_p ^ hi_q);
  }
  /* tiny builds, and n < 4: OR together the XOR of every byte pair */
  acc = 0;
  while (n--) {
    acc |= p[n] ^ q[n];
  }
  return acc;
}

View file

@ -24,20 +24,62 @@ Copyright 2014 Google Inc.\"");
asm(".include \"libc/disclaimer.inc\"");
/**
* Lexicographically compares the first len bytes in b1 and b2.
* Lexicographically compares the first 𝑛 bytes in 𝑝 and 𝑞.
*
* The following expression:
*
* timingsafe_memcmp(p, q, n)
*
* Is functionally equivalent to:
*
* MAX(-1, MIN(1, memcmp(p, q, n)))
*
* Running time is independent of the byte sequences compared, making
* this safe to use for comparing secret values such as cryptographic
* MACs. In contrast, memcmp() may short-circuit after finding the first
* differing byte.
*
* timingsafe_memcmp n=0 661 picoseconds
* timingsafe_memcmp n=1 1 ns/byte 590 mb/s
* timingsafe_memcmp n=2 1 ns/byte 738 mb/s
* timingsafe_memcmp n=3 1 ns/byte 805 mb/s
* timingsafe_memcmp n=4 1 ns/byte 843 mb/s
* timingsafe_memcmp n=5 1 ns/byte 922 mb/s
* timingsafe_memcmp n=6 1 ns/byte 932 mb/s
* timingsafe_memcmp n=7 1 ns/byte 939 mb/s
* timingsafe_memcmp n=8 992 ps/byte 984 mb/s
* timingsafe_memcmp n=9 992 ps/byte 984 mb/s
* timingsafe_memcmp n=15 926 ps/byte 1,054 mb/s
* timingsafe_memcmp n=16 950 ps/byte 1,026 mb/s
* timingsafe_memcmp n=17 933 ps/byte 1,045 mb/s
* timingsafe_memcmp n=31 896 ps/byte 1,089 mb/s
* timingsafe_memcmp n=32 888 ps/byte 1,098 mb/s
* timingsafe_memcmp n=33 972 ps/byte 1,004 mb/s
* timingsafe_memcmp n=80 913 ps/byte 1,068 mb/s
* timingsafe_memcmp n=128 891 ps/byte 1,095 mb/s
* timingsafe_memcmp n=256 873 ps/byte 1,118 mb/s
* timingsafe_memcmp n=16384 858 ps/byte 1,138 mb/s
* timingsafe_memcmp n=32768 856 ps/byte 1,140 mb/s
* timingsafe_memcmp n=131072 857 ps/byte 1,138 mb/s
* bcmp ne n=256 3 ps/byte 246 gb/s
* bcmp eq n=256 32 ps/byte 30,233 mb/s
* memcmp ne n=256 3 ps/byte 246 gb/s
* memcmp eq n=256 31 ps/byte 31,493 mb/s
* timingsafe_bcmp ne n=256 27 ps/byte 35,992 mb/s
* timingsafe_bcmp eq n=256 27 ps/byte 35,992 mb/s
* timingsafe_memcmp ne n=256 877 ps/byte 1,113 mb/s
* timingsafe_memcmp eq n=256 883 ps/byte 1,105 mb/s
*
* @note each byte is interpreted as unsigned char
* @return -1, 0, or 1 based on comparison
* @see timingsafe_bcmp() it's 100x faster
* @asyncsignalsafe
*/
int timingsafe_memcmp(const void *b1, const void *b2, size_t len) {
const unsigned char *p1 = b1, *p2 = b2;
int timingsafe_memcmp(const void *p, const void *q, size_t n) {
const unsigned char *p1 = p, *p2 = q;
size_t i;
int res = 0, done = 0;
for (i = 0; i < len; i++) {
for (i = 0; i < n; i++) {
/* lt is -1 if p1[i] < p2[i]; else 0. */
int lt = (p1[i] - p2[i]) >> CHAR_BIT;
/* gt is -1 if p1[i] > p2[i]; else 0. */

View file

@ -1,66 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.source __FILE__
// Encodes Thompson-Pike varint.
//
// Values up to 127 are returned unchanged. For larger values the
// lead byte ends up in the least significant byte of rax, with the
// continuation bytes in the bytes above it.
//
// @param edi is int to encode
// @return rax is word-encoded byte buffer
// @note invented on a napkin in a new jersey diner
tpenc: .leafprologue
.profilable
// writing edi clears the upper half of rdi, which is merged
// into rax at label 3
mov %edi,%edi
xor %eax,%eax
// values up to 127 skip straight to the final merge
cmp $127,%edi
jbe 3f
// ecx = index of highest set bit, used to pick a kTpenc entry;
// after the load, cl = continuation byte count and ch = lead mark
bsr %edi,%ecx
mov kTpenc-7*(1+1)(,%rcx,2),%ecx
// each pass emits one continuation byte: the low six bits of
// edi tagged with 0b10000000, then rax shifts up to make room
1: mov %edi,%edx
shr $6,%edi
and $0b00111111,%dl
or $0b10000000,%al
or %dl,%al
shl $8,%rax
dec %cl
jnz 1b
// place the lead byte mark, then merge the remaining high bits
2: or %ch,%al
3: or %rdi,%rax
.leafepilogue
.endfn tpenc,globl
// Lookup table consumed by tpenc. Each entry is two bytes: the
// number of continuation bytes, then the lead byte mark. tpenc
// addresses it as kTpenc-7*2 plus twice the index of the highest
// set bit, so the first entry corresponds to bit index 7.
.rodata
.align 4
kTpenc: .rept 4 # MSB10 (0x7FF)
.byte 1,0b11000000 # len,mark
.endr
.rept 5 # MSB15 (0xFFFF)
.byte 2,0b11100000 # len,mark
.endr
.rept 5 # MSB20 (0x1FFFFF)
.byte 3,0b11110000 # len,mark
.endr
.rept 5 # MSB25 (0x3FFFFFF)
.byte 4,0b11111000 # len,mark
.endr
.rept 6 # MSB31 (0xffffffff)
.byte 5,0b11111100 # len,mark
.endr
// NOTE(review): presumably padding so that the 32-bit load tpenc
// performs on the final entry stays inside this object -- confirm
.zero 2
.endobj kTpenc

View file

@ -74,7 +74,7 @@ static uint32_t undeflatetree(struct DeflateState *ds, uint32_t *tree,
size_t i, len;
uint32_t code, slot;
uint16_t codes[16], first[16], counts[16];
memset(counts, 0, sizeof(counts));
bzero(counts, sizeof(counts));
for (i = 0; i < symcount; i++) {
counts[lens[i]]++;
}