Improve performance of printf functions

This commit is contained in:
Justine Tunney 2021-04-24 13:58:34 -07:00
parent b107d2709f
commit dc6d11a031
39 changed files with 577 additions and 650 deletions

View file

@ -1,79 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/nexgen32e/crc32.h"
extern const uint32_t kCrc32cTab[256];
/**
* Computes Castagnoli CRC-32 on old computers.
*/
uint32_t crc32c_pure(uint32_t init, const void *data, size_t size) {
const unsigned char *p = data;
uint32_t h = init ^ 0xffffffff;
unsigned i;
for (i = 0; i < size; i++) {
h = h >> 8 ^ kCrc32cTab[(h & 0xff) ^ p[i]];
}
return h ^ 0xffffffff;
}
/*
bench_crc32c_pure for #c per n where c 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 4305.000 91.375 44.203 74
1 75.000 55.875 44.703 73
2 46.500 35.188 24.617 132
3 40.333 26.625 19.193 169
4 32.250 19.969 16.215 200
7 18.429 15.089 12.033 270
8 20.625 13.547 11.607 280
15 15.667 10.775 9.589 339
16 17.562 10.695 9.419 345
31 12.226 8.891 8.317 391
32 13.219 8.480 8.078 402
63 9.571 8.065 7.731 420
64 9.672 7.955 7.633 426
127 8.433 7.548 7.329 443
128 8.492 7.528 7.352 442
255 7.557 7.366 7.239 449
256 7.699 7.342 7.305 445
511 7.376 7.243 7.223 450
512 7.408 7.233 7.225 450
1023 7.188 7.192 7.098 458
1024 7.171 7.194 7.097 458
2047 7.130 7.172 7.085 459
2048 7.117 7.170 7.169 453
4095 7.063 7.076 7.085 459
4096 7.078 7.161 7.081 459
8191 7.041 7.095 7.055 461
8192 7.051 7.098 7.087 459
16383 7.039 7.114 7.067 460
16384 6.876 6.931 7.133 456
32767 7.055 7.108 7.290 446
32768 6.868 6.887 6.974 466
65535 6.984 6.885 6.967 467
65536 6.877 6.924 10.994 296
131071 7.166 7.141 7.011 464
131072 6.853 6.971 7.694 422
262143 6.853 7.213 7.406 439
262144 6.852 6.968 7.290 446
524287 7.398 7.389 7.166 454
524288 6.851 7.094 7.159 454
*/

View file

@ -1,95 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/str/internal.h"
/**
* Hashes data with hardware acceleration at 10GBps.
* @note needs Nehalem+ c. 2008 or Bulldozer+ c. 2011
*/
optimizespeed uint32_t crc32c_sse42(uint32_t init, const void *data, size_t n) {
const unsigned char *p = (const unsigned char *)data;
const unsigned char *pe = (const unsigned char *)data + n;
uint32_t h = init ^ 0xffffffff;
if (n >= 16 + 8) {
while ((uintptr_t)p & 7) asm("crc32b\t%1,%0" : "+r"(h) : "rm"(*p++));
uint64_t hl = h;
while (p < pe - 16ul) {
asm("crc32q\t%1,%0" : "+r"(hl) : "rm"(*(const uint64_t *)p));
p += 8;
asm("crc32q\t%1,%0" : "+r"(hl) : "rm"(*(const uint64_t *)p));
p += 8;
}
h = (uint32_t)hl;
}
while (p < pe) asm("crc32b\t%1,%0" : "+r"(h) : "rm"(*p++));
return h ^ 0xffffffff;
}
/*
bench_crc32c_sse42 for #c per n where c 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 877.000 43.375 40.359 81
1 45.000 39.625 40.484 80
2 34.500 27.562 20.461 159
3 23.000 16.708 14.245 228
4 18.250 13.094 11.449 284
7 10.429 8.339 8.185 397
8 42.125 8.734 6.850 475
15 9.400 5.375 4.884 665
16 7.312 5.070 4.882 666
31 5.258 2.923 2.680 1213
32 3.969 2.676 2.562 1269
63 3.095 1.581 1.428 2276
64 2.234 1.623 1.478 2199
127 1.205 0.901 0.900 3610
128 1.164 0.960 0.915 3552
255 0.922 0.651 0.618 5260
256 0.715 0.650 0.609 5341
511 0.558 0.482 0.477 6819
512 0.529 0.475 0.469 6932
1023 0.425 0.400 0.396 8204
1024 0.417 0.392 0.388 8383
2047 0.367 0.355 0.353 9199
2048 0.374 0.366 0.364 8929
4095 0.351 0.338 0.337 9644
4096 0.353 0.338 0.338 9624
8191 0.335 0.338 0.337 9641
8192 0.335 0.329 0.329 9870
16383 0.336 0.325 0.325 10011
16384 0.336 0.326 0.375 8666
32767 0.329 0.323 0.323 10070
32768 0.327 0.324 0.323 10062
65535 0.322 0.322 0.322 10103
65536 0.321 0.322 0.322 10102
131071 0.322 0.321 0.321 10125
131072 0.321 0.321 0.321 10124
262143 0.322 0.321 0.335 9699
262144 0.321 0.321 0.321 10134
524287 0.321 0.321 0.499 6516
524288 0.321 0.321 0.339 9575
1048575 0.322 0.321 0.322 10095
1048576 0.320 1.001 0.323 10048
2097151 0.325 0.321 0.322 10086
2097152 0.330 0.320 0.323 10076
4194303 0.331 0.322 0.321 10128
4194304 0.332 0.321 0.325 10004
8388607 0.334 0.332 0.331 9829
8388608 0.334 0.329 0.327 9934
*/

View file

@ -1,43 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/dce.h"
#include "libc/macros.internal.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/notice.inc"
// Computes 32-bit Castagnoli Cyclic Redundancy Check.
//
// @param edi is the initial hash value (0 is fine)
// @param rsi points to the data
// @param rdx is the byte size of data
// @return eax is the new hash value
// @note Used by ISCSI, TensorFlow, etc.
.initbss 300,_init_crc32c
crc32c: .quad 0
.endobj crc32c,globl
.previous
.init.start 300,_init_crc32c
ezlea crc32c_pure,ax
ezlea crc32c_sse42,cx
testb X86_HAVE(SSE4_2)+kCpuids(%rip)
cmovnz %rcx,%rax
stosq
.init.end 300,_init_crc32c
.source __FILE__

View file

@ -22,16 +22,29 @@
/**
* Computes 32-bit Castagnoli Cyclic Redundancy Check.
*
* @param h is the initial hash value (0 is fine)
* @param p points to the data
* @param n is the byte size of data
* @param init is the initial hash value
* @param data points to the data
* @param size is the byte size of data
* @return eax is the new hash value
* @note Used by ISCSI, TensorFlow, etc.
*/
uint32_t crc32c(uint32_t h, const void *p, size_t n) {
uint32_t crc32c(uint32_t init, const void *data, size_t size) {
uint64_t h;
const unsigned char *p, *pe;
p = data;
pe = p + size;
h = init ^ 0xffffffff;
if (X86_HAVE(SSE4_2)) {
return crc32c_sse42(h, p, n);
for (; p + 8 <= pe; p += 8) {
asm("crc32q\t%1,%0" : "+r"(h) : "rm"(*(const uint64_t *)p));
}
while (p < pe) {
asm("crc32b\t%1,%0" : "+r"(h) : "rm"(*p++));
}
} else {
return crc32c_pure(h, p, n);
while (p < pe) {
h = h >> 8 ^ kCrc32cTab[(h & 0xff) ^ *p++];
}
}
return h ^ 0xffffffff;
}

View file

@ -20,7 +20,7 @@
#include "libc/intrin/pmovmskb.h"
#include "libc/str/str.h"
static noasan size_t stpcpy_sse2(char *d, const char *s, size_t i) {
static inline noasan size_t stpcpy_sse2(char *d, const char *s, size_t i) {
uint8_t v1[16], v2[16], vz[16];
for (;;) {
memset(vz, 0, 16);