mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-07-04 02:08:30 +00:00
Improve performance of printf functions
This commit is contained in:
parent
b107d2709f
commit
dc6d11a031
39 changed files with 577 additions and 650 deletions
|
@ -1,79 +0,0 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/nexgen32e/crc32.h"
|
||||
|
||||
extern const uint32_t kCrc32cTab[256];
|
||||
|
||||
/**
|
||||
* Computes Castagnoli CRC-32 on old computers.
|
||||
*/
|
||||
uint32_t crc32c_pure(uint32_t init, const void *data, size_t size) {
|
||||
const unsigned char *p = data;
|
||||
uint32_t h = init ^ 0xffffffff;
|
||||
unsigned i;
|
||||
for (i = 0; i < size; i++) {
|
||||
h = h >> 8 ^ kCrc32cTab[(h & 0xff) ^ p[i]];
|
||||
}
|
||||
return h ^ 0xffffffff;
|
||||
}
|
||||
|
||||
/*
|
||||
bench_crc32c_pure for #c per n where c ≈ 0.293ns
|
||||
N x1 x8 x64 mBps
|
||||
------------------------------------------------------------
|
||||
1 4305.000 91.375 44.203 74
|
||||
1 75.000 55.875 44.703 73
|
||||
2 46.500 35.188 24.617 132
|
||||
3 40.333 26.625 19.193 169
|
||||
4 32.250 19.969 16.215 200
|
||||
7 18.429 15.089 12.033 270
|
||||
8 20.625 13.547 11.607 280
|
||||
15 15.667 10.775 9.589 339
|
||||
16 17.562 10.695 9.419 345
|
||||
31 12.226 8.891 8.317 391
|
||||
32 13.219 8.480 8.078 402
|
||||
63 9.571 8.065 7.731 420
|
||||
64 9.672 7.955 7.633 426
|
||||
127 8.433 7.548 7.329 443
|
||||
128 8.492 7.528 7.352 442
|
||||
255 7.557 7.366 7.239 449
|
||||
256 7.699 7.342 7.305 445
|
||||
511 7.376 7.243 7.223 450
|
||||
512 7.408 7.233 7.225 450
|
||||
1023 7.188 7.192 7.098 458
|
||||
1024 7.171 7.194 7.097 458
|
||||
2047 7.130 7.172 7.085 459
|
||||
2048 7.117 7.170 7.169 453
|
||||
4095 7.063 7.076 7.085 459
|
||||
4096 7.078 7.161 7.081 459
|
||||
8191 7.041 7.095 7.055 461
|
||||
8192 7.051 7.098 7.087 459
|
||||
16383 7.039 7.114 7.067 460
|
||||
16384 6.876 6.931 7.133 456
|
||||
32767 7.055 7.108 7.290 446
|
||||
32768 6.868 6.887 6.974 466
|
||||
65535 6.984 6.885 6.967 467
|
||||
65536 6.877 6.924 10.994 296
|
||||
131071 7.166 7.141 7.011 464
|
||||
131072 6.853 6.971 7.694 422
|
||||
262143 6.853 7.213 7.406 439
|
||||
262144 6.852 6.968 7.290 446
|
||||
524287 7.398 7.389 7.166 454
|
||||
524288 6.851 7.094 7.159 454
|
||||
*/
|
|
@ -1,95 +0,0 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/str/internal.h"
|
||||
|
||||
/**
|
||||
* Hashes data with hardware acceleration at 10GBps.
|
||||
* @note needs Nehalem+ c. 2008 or Bulldozer+ c. 2011
|
||||
*/
|
||||
optimizespeed uint32_t crc32c_sse42(uint32_t init, const void *data, size_t n) {
|
||||
const unsigned char *p = (const unsigned char *)data;
|
||||
const unsigned char *pe = (const unsigned char *)data + n;
|
||||
uint32_t h = init ^ 0xffffffff;
|
||||
if (n >= 16 + 8) {
|
||||
while ((uintptr_t)p & 7) asm("crc32b\t%1,%0" : "+r"(h) : "rm"(*p++));
|
||||
uint64_t hl = h;
|
||||
while (p < pe - 16ul) {
|
||||
asm("crc32q\t%1,%0" : "+r"(hl) : "rm"(*(const uint64_t *)p));
|
||||
p += 8;
|
||||
asm("crc32q\t%1,%0" : "+r"(hl) : "rm"(*(const uint64_t *)p));
|
||||
p += 8;
|
||||
}
|
||||
h = (uint32_t)hl;
|
||||
}
|
||||
while (p < pe) asm("crc32b\t%1,%0" : "+r"(h) : "rm"(*p++));
|
||||
return h ^ 0xffffffff;
|
||||
}
|
||||
|
||||
/*
|
||||
bench_crc32c_sse42 for #c per n where c ≈ 0.293ns
|
||||
N x1 x8 x64 mBps
|
||||
------------------------------------------------------------
|
||||
1 877.000 43.375 40.359 81
|
||||
1 45.000 39.625 40.484 80
|
||||
2 34.500 27.562 20.461 159
|
||||
3 23.000 16.708 14.245 228
|
||||
4 18.250 13.094 11.449 284
|
||||
7 10.429 8.339 8.185 397
|
||||
8 42.125 8.734 6.850 475
|
||||
15 9.400 5.375 4.884 665
|
||||
16 7.312 5.070 4.882 666
|
||||
31 5.258 2.923 2.680 1213
|
||||
32 3.969 2.676 2.562 1269
|
||||
63 3.095 1.581 1.428 2276
|
||||
64 2.234 1.623 1.478 2199
|
||||
127 1.205 0.901 0.900 3610
|
||||
128 1.164 0.960 0.915 3552
|
||||
255 0.922 0.651 0.618 5260
|
||||
256 0.715 0.650 0.609 5341
|
||||
511 0.558 0.482 0.477 6819
|
||||
512 0.529 0.475 0.469 6932
|
||||
1023 0.425 0.400 0.396 8204
|
||||
1024 0.417 0.392 0.388 8383
|
||||
2047 0.367 0.355 0.353 9199
|
||||
2048 0.374 0.366 0.364 8929
|
||||
4095 0.351 0.338 0.337 9644
|
||||
4096 0.353 0.338 0.338 9624
|
||||
8191 0.335 0.338 0.337 9641
|
||||
8192 0.335 0.329 0.329 9870
|
||||
16383 0.336 0.325 0.325 10011
|
||||
16384 0.336 0.326 0.375 8666
|
||||
32767 0.329 0.323 0.323 10070
|
||||
32768 0.327 0.324 0.323 10062
|
||||
65535 0.322 0.322 0.322 10103
|
||||
65536 0.321 0.322 0.322 10102
|
||||
131071 0.322 0.321 0.321 10125
|
||||
131072 0.321 0.321 0.321 10124
|
||||
262143 0.322 0.321 0.335 9699
|
||||
262144 0.321 0.321 0.321 10134
|
||||
524287 0.321 0.321 0.499 6516
|
||||
524288 0.321 0.321 0.339 9575
|
||||
1048575 0.322 0.321 0.322 10095
|
||||
1048576 0.320 1.001 0.323 10048
|
||||
2097151 0.325 0.321 0.322 10086
|
||||
2097152 0.330 0.320 0.323 10076
|
||||
4194303 0.331 0.322 0.321 10128
|
||||
4194304 0.332 0.321 0.325 10004
|
||||
8388607 0.334 0.332 0.331 9829
|
||||
8388608 0.334 0.329 0.327 9934
|
||||
*/
|
|
@ -1,43 +0,0 @@
|
|||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/dce.h"
|
||||
#include "libc/macros.internal.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "libc/notice.inc"
|
||||
|
||||
// Computes 32-bit Castagnoli Cyclic Redundancy Check.
|
||||
//
|
||||
// @param edi is the initial hash value (0 is fine)
|
||||
// @param rsi points to the data
|
||||
// @param rdx is the byte size of data
|
||||
// @return eax is the new hash value
|
||||
// @note Used by ISCSI, TensorFlow, etc.
|
||||
.initbss 300,_init_crc32c
|
||||
crc32c: .quad 0
|
||||
.endobj crc32c,globl
|
||||
.previous
|
||||
|
||||
.init.start 300,_init_crc32c
|
||||
ezlea crc32c_pure,ax
|
||||
ezlea crc32c_sse42,cx
|
||||
testb X86_HAVE(SSE4_2)+kCpuids(%rip)
|
||||
cmovnz %rcx,%rax
|
||||
stosq
|
||||
.init.end 300,_init_crc32c
|
||||
.source __FILE__
|
|
@ -22,16 +22,29 @@
|
|||
/**
|
||||
* Computes 32-bit Castagnoli Cyclic Redundancy Check.
|
||||
*
|
||||
* @param h is the initial hash value (0 is fine)
|
||||
* @param p points to the data
|
||||
* @param n is the byte size of data
|
||||
* @param init is the initial hash value
|
||||
* @param data points to the data
|
||||
* @param size is the byte size of data
|
||||
* @return eax is the new hash value
|
||||
* @note Used by ISCSI, TensorFlow, etc.
|
||||
*/
|
||||
uint32_t crc32c(uint32_t h, const void *p, size_t n) {
|
||||
uint32_t crc32c(uint32_t init, const void *data, size_t size) {
|
||||
uint64_t h;
|
||||
const unsigned char *p, *pe;
|
||||
p = data;
|
||||
pe = p + size;
|
||||
h = init ^ 0xffffffff;
|
||||
if (X86_HAVE(SSE4_2)) {
|
||||
return crc32c_sse42(h, p, n);
|
||||
for (; p + 8 <= pe; p += 8) {
|
||||
asm("crc32q\t%1,%0" : "+r"(h) : "rm"(*(const uint64_t *)p));
|
||||
}
|
||||
while (p < pe) {
|
||||
asm("crc32b\t%1,%0" : "+r"(h) : "rm"(*p++));
|
||||
}
|
||||
} else {
|
||||
return crc32c_pure(h, p, n);
|
||||
while (p < pe) {
|
||||
h = h >> 8 ^ kCrc32cTab[(h & 0xff) ^ *p++];
|
||||
}
|
||||
}
|
||||
return h ^ 0xffffffff;
|
||||
}
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
#include "libc/intrin/pmovmskb.h"
|
||||
#include "libc/str/str.h"
|
||||
|
||||
static noasan size_t stpcpy_sse2(char *d, const char *s, size_t i) {
|
||||
static inline noasan size_t stpcpy_sse2(char *d, const char *s, size_t i) {
|
||||
uint8_t v1[16], v2[16], vz[16];
|
||||
for (;;) {
|
||||
memset(vz, 0, 16);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue