Make numerous improvements

- Python static hello world now 1.8mb
- Python static fully loaded now 10mb
- Python HTTPS client now uses MbedTLS
- Python REPL now completes import stmts
- Increase stack size for Python for now
- Begin synthesizing posixpath and ntpath
- Restore Python \N{UNICODE NAME} support
- Restore Python NFKD symbol normalization
- Add optimized code path for Intel SHA-NI
- Get more Python unit tests passing faster
- Get Python help() pagination working on NT
- Python hashlib now supports MbedTLS PBKDF2
- Make memcpy/memmove/memcmp/bcmp/etc. faster
- Add Mersenne Twister and Vigna to LIBC_RAND
- Provide privileged __printf() for error code
- Fix zipos opendir() so that it reports ENOTDIR
- Add basic chmod() implementation for Windows NT
- Add Cosmo's best functions to Python cosmo module
- Pin function trace indent depth to that of caller
- Show memory diagram on invalid access in MODE=dbg
- Differentiate stack overflow on crash in MODE=dbg
- Add stb_truetype and tools for analyzing font files
- Upgrade to UNICODE 13 and reduce its binary footprint
- COMPILE.COM now logs resource usage of build commands
- Start implementing basic poll() support on bare metal
- Set getauxval(AT_EXECFN) to GetModuleFileName() on NT
- Add descriptions to strerror() in non-TINY build modes
- Add COUNTBRANCH() macro to help with micro-optimizations
- Make error / backtrace / asan / memory code more unbreakable
- Add fast perfect C implementation of μ-Law and a-Law audio codecs
- Make strtol() functions consistent with other libc implementations
- Improve Linenoise implementation (see also github.com/jart/bestline)
- COMPILE.COM now suppresses stdout/stderr of successful build commands
This commit is contained in:
Justine Tunney 2021-09-27 22:58:51 -07:00
parent fa7b4f5bd1
commit 39bf41f4eb
806 changed files with 77494 additions and 63859 deletions

View file

@ -16,17 +16,122 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/bits/likely.h"
#include "libc/dce.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
// 16-byte vector of two 64-bit lanes used for unaligned loads from
// arbitrary byte buffers; __aligned__(1) permits any address, and
// __may_alias__ makes such loads exempt from strict-aliasing rules
typedef uint64_t xmm_t
    __attribute__((__vector_size__(16), __aligned__(1), __may_alias__));
// Reports whether two buffers differ, using 16-byte vector loads.
//
// The final step reads the 16 bytes ending at p+n and q+n, so the
// caller must pass n >= 16 (bcmp only dispatches here when n > 16).
//
// @return 1 if any byte differs, otherwise 0
static noinline antiquity int memcmp_sse(const char *p, const char *q,
                                         size_t n) {
  xmm_t head, tail;
  // walk forward 16 bytes at a time while more than 32 bytes remain
  for (; n > 32; p += 16, q += 16, n -= 16) {
    head = *(const xmm_t *)p ^ *(const xmm_t *)q;
    if (head[0] | head[1]) return 1;
  }
  // the rest is covered by a leading and a trailing (possibly
  // overlapping) 16-byte window
  head = *(const xmm_t *)p ^ *(const xmm_t *)q;
  tail = *(const xmm_t *)(p + n - 16) ^ *(const xmm_t *)(q + n - 16);
  head |= tail;
  return (head[0] | head[1]) != 0;
}
// Reports whether two buffers differ, comparing 64 bytes per step on
// large inputs and 16 bytes per step on what is left.
//
// The final step reads the 16 bytes ending at p+n and q+n, so the
// caller must pass n >= 16 (bcmp only dispatches here when n > 16).
//
// @return 1 if any byte differs, otherwise 0
microarchitecture("avx") static int memcmp_avx(const char *p, const char *q,
                                               size_t n) {
  const xmm_t *u, *v;
  xmm_t acc, tail;
  // bulk phase: four vectors per step, always leaving >= 16 bytes
  while (n >= 64 + 16) {
    u = (const xmm_t *)p;
    v = (const xmm_t *)q;
    acc = (u[0] ^ v[0]) | (u[1] ^ v[1]) | (u[2] ^ v[2]) | (u[3] ^ v[3]);
    if (acc[0] | acc[1]) return 1;
    p += 64;
    q += 64;
    n -= 64;
  }
  // single-vector phase: stop once at most 32 bytes remain
  while (n > 16 + 16) {
    acc = *(const xmm_t *)p ^ *(const xmm_t *)q;
    if (acc[0] | acc[1]) return 1;
    p += 16;
    q += 16;
    n -= 16;
  }
  // the rest is covered by a leading and a trailing (possibly
  // overlapping) 16-byte window
  acc = *(const xmm_t *)p ^ *(const xmm_t *)q;
  tail = *(const xmm_t *)(p + n - 16) ^ *(const xmm_t *)(q + n - 16);
  acc |= tail;
  return (acc[0] | acc[1]) != 0;
}
/**
* Compares memory.
* Tests inequality of first 𝑛 bytes of 𝑝 and 𝑞.
*
* This API was thought to be nearly extinct until recent versions
* of Clang (c. 2019) started generating synthetic calls to it.
* bcmp n=0 992 picoseconds
* bcmp n=1 992 ps/byte 984 mb/s
* bcmp n=2 661 ps/byte 1,476 mb/s
* bcmp n=3 441 ps/byte 2,214 mb/s
* bcmp n=4 330 ps/byte 2,952 mb/s
* bcmp n=5 264 ps/byte 3,690 mb/s
* bcmp n=6 165 ps/byte 5,905 mb/s
* bcmp n=7 189 ps/byte 5,166 mb/s
* bcmp n=8 124 ps/byte 7,873 mb/s
* bcmp n=9 183 ps/byte 5,314 mb/s
* bcmp n=15 110 ps/byte 8,857 mb/s
* bcmp n=16 62 ps/byte 15,746 mb/s
* bcmp n=17 175 ps/byte 5,577 mb/s
* bcmp n=31 96 ps/byte 10,169 mb/s
* bcmp n=32 93 ps/byte 10,497 mb/s
* bcmp n=33 80 ps/byte 12,179 mb/s
* bcmp n=80 37 ps/byte 26,244 mb/s
* bcmp n=128 36 ps/byte 26,994 mb/s
* bcmp n=256 27 ps/byte 35,992 mb/s
* bcmp n=16384 19 ps/byte 49,411 mb/s
* bcmp n=32768 27 ps/byte 34,914 mb/s
* bcmp n=131072 30 ps/byte 32,303 mb/s
*
* @return 0 if a and b have equal contents, otherwise nonzero
* @see timingsafe_bcmp()
* @asyncsignalsafe
*/
int bcmp(const void *a, const void *b, size_t n) {
  int c;
  uint32_t i, j;
  uint64_t x, y;
  const char *p, *q;
  // a buffer always equals itself
  if ((p = a) == (q = b)) return 0;
  if (!IsTiny()) {
    if (n <= 16) {
      if (n >= 8) {
        // 8..16 bytes: first word plus last word (may overlap)
        __builtin_memcpy(&x, p, 8);
        __builtin_memcpy(&y, q, 8);
        if (x ^ y) return 1;
        __builtin_memcpy(&x, p + n - 8, 8);
        __builtin_memcpy(&y, q + n - 8, 8);
        return !!(x ^ y);
      } else if (n >= 4) {
        // 4..7 bytes: first dword plus last dword (may overlap)
        __builtin_memcpy(&i, p, 4);
        __builtin_memcpy(&j, q, 4);
        if (i ^ j) return 1;
        __builtin_memcpy(&i, p + n - 4, 4);
        __builtin_memcpy(&j, q + n - 4, 4);
        return !!(i ^ j);
      }
      // n < 4 falls through to the byte loop below
    } else if (LIKELY(X86_HAVE(AVX))) {
      return memcmp_avx(p, q, n);
    } else {
      return memcmp_sse(p, q, n);
    }
  }
  // byte-at-a-time fallback, scanning from the end; only a zero vs.
  // nonzero result is promised, so the xor of the bytes is returned
  while (n--) {
    if ((c = p[n] ^ q[n])) {
      return c;
    }
  }
  return 0;
}