cosmopolitan/tool/build/bigmul.c
Justine Tunney 398f0c16fb Add SNI support to redbean and improve SSL perf
This change makes SSL virtual hosting possible. You can now load
multiple certificates for multiple domains and redbean will just
figure out which one to use, even if you only have 1 ip address.
You can also use a jumbo certificate that lists all your domains
in the subject alternative names.

This change also makes performance improvements to MbedTLS. Here
are some benchmarks vs. cc1920749e

                                   BEFORE    AFTER   (microsecs)
suite_ssl.com                     2512881   191738 13.11x faster
suite_pkparse.com                   36291     3295 11.01x faster
suite_x509parse.com                854669   120293  7.10x faster
suite_pkwrite.com                    6549     1265  5.18x faster
suite_ecdsa.com                     53347    18778  2.84x faster
suite_pk.com                        49051    18717  2.62x faster
suite_ecdh.com                      19535     9502  2.06x faster
suite_shax.com                      15848     7965  1.99x faster
suite_rsa.com                      353257   184828  1.91x faster
suite_x509write.com                162646    85733  1.90x faster
suite_ecp.com                       20503    11050  1.86x faster
suite_hmac_drbg.no_reseed.com       19528    11417  1.71x faster
suite_hmac_drbg.nopr.com            12460     8010  1.56x faster
suite_mpi.com                      687124   442661  1.55x faster
suite_hmac_drbg.pr.com              11890     7752  1.53x faster

There aren't any special tricks to the performance improvements.
It's mostly due to code cleanup, assembly and intel instructions
like mulx, adox, and adcx.
2021-07-23 13:56:13 -07:00

185 lines
6.7 KiB
C

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2021 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/assert.h"
#include "libc/fmt/conv.h"
#include "libc/log/log.h"
#include "libc/macros.internal.h"
#include "libc/mem/mem.h"
#include "libc/runtime/gc.internal.h"
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
/**
* @fileoverview cryptographic multiplication kernel generator
*/
/**
 * Prints to stdout the C source of a `Multiply{n}x{m}` routine.
 *
 * The emitted routine is built from inline-asm statements using `mulx`
 * for the partial products and `adox` / `adcx` to add them into
 * temporaries `R0..R{n+m-1}`, which are finally copied to `C[]`. One
 * row of partial products is emitted per word of `B`; each row begins
 * with `xorl` on the zero register `z`.
 *
 * Generator-side bookkeeping (nothing here is emitted verbatim):
 *
 * - `Rs[k]` is true once emitted code has written `R{k}`
 * - `Ra[k]` is set when `R{k}` is the target of an emitted add of a
 *   product word and cleared when `R{k}` is freshly written by `mulx`
 *   (NOTE(review): inferred from the assignments below; confirm)
 * - `cf` / `of` record whether an `adcx` / `adox` has been emitted in
 *   the current row, i.e. whether that chain needs a final add of `z`
 *
 * If the bookkeeping tables cannot be allocated, a message is written
 * to stderr and nothing is printed to stdout.
 *
 * @param n is number of 64-bit words in the left operand `A`
 * @param m is number of 64-bit words in the right operand `B`
 * @note `H0..H{n-1}` are declared in the emitted code but no emitted
 *     statement references them
 */
void PrintMultiplyKernel(int n, int m) {
  bool cf, of;
  bool *Rs, *Ra;
  int j, i, k1, k2, g;

  /* Allocate before emitting anything so that an allocation failure
     cannot leave a truncated kernel on stdout. One spare slot past
     R{n+m-1} is kept, matching the original table size. */
  Rs = gc(calloc(n + m + 1, sizeof(*Rs)));
  Ra = gc(calloc(n + m + 1, sizeof(*Ra)));
  if (!Rs || !Ra) {
    fprintf(stderr, "error: out of memory\n");
    return;
  }

  /* Doc comment, signature and fixed local declarations. */
  printf("\
/**\n\
* Computes %d-bit product of %d-bit and %d-bit numbers.\n\
*\n\
* @param C receives %d quadword result\n\
* @param A is left hand side which must have %d quadwords\n\
* @param B is right hand side which must have %d quadwords\n\
* @note words are host endian while array is little endian\n\
* @mayalias\n\
*/\n\
void Multiply%dx%d(uint64_t C[%d], const uint64_t A[%d], const uint64_t B[%d]) {\n\
static bool cf,of;\n\
uint64_t z,h,l;\n\
uint64_t ",
         (n + m) * 64, n * 64, m * 64, n + m, n, m, n, m, n + m, n, m);

  /* Declarations: H0..H{n-1}, then R0..R{n+m-1}. */
  for (j = 0; j < n; ++j) {
    if (j) printf(", ");
    printf("H%d", j);
  }
  printf(";\n");
  printf(" uint64_t ");
  for (j = 0; j < n + m; ++j) {
    if (j) printf(", ");
    printf("R%d", j);
  }
  printf(";\n");

  /* One row per word B[j]. The parenthesized (printf) keeps any
     function-like macro of that name from expanding. */
  for (j = 0; j < m; ++j) {
    /* Start of row: zero `z`. */
    (printf)("\
asm(\"xorl\\t%%k0,%%k0\" : \"=r\"(z), \"+m\"(cf), \"+m\"(of));\n");
    for (cf = of = i = 0; i < n; ++i) {
      if (!i) {
        if (!Rs[i + j] && !Rs[i + j + 1]) {
          /* Neither result word written yet: mulx stores straight
             into R{i+j+1}:R{i+j}; no add is needed. */
          assert(!cf);
          assert(!of);
          Rs[i + j + 0] = true;
          Rs[i + j + 1] = true;
          (printf)("\
asm(\"mulx\\t%%2,%%1,%%0\" : \"=r\"(R%d), \"=r\"(R%d) : \"rm\"(A[%d]), \"d\"(B[%d]));\n",
                   i + j + 1, i + j, i, j);
        } else if (!Rs[i + j + 1]) {
          /* Low word already written, high word fresh: high half goes
             directly into R{i+j+1}, low half is added via adox. */
          of = true;
          assert(!cf);
          Ra[i + j + 0] = true;
          Rs[i + j + 1] = true;
          (printf)("\
asm(\"mulx\\t%%2,%%1,%%0\" : \"=r\"(R%d), \"=r\"(l) : \"rm\"(A[%d]), \"d\"(B[%d]));\n\
asm(\"adox\\t%%2,%%0\" : \"+r\"(R%d), \"+m\"(of) : \"r\"(l));\n",
                   i + j + 1, i, j, i + j);
        } else {
          /* Both words already written: product goes to h:l, low half
             added with adox and high half with adcx. */
          cf = true;
          of = true;
          assert(Rs[i + j]);
          Ra[i + j + 0] = true;
          Ra[i + j + 1] = true;
          (printf)("\
asm(\"mulx\\t%%2,%%1,%%0\" : \"=r\"(h), \"=r\"(l) : \"rm\"(A[%d]), \"d\"(B[%d]));\n\
asm(\"adox\\t%%2,%%0\" : \"+r\"(R%d), \"+m\"(of) : \"r\"(l));\n\
asm(\"adcx\\t%%2,%%0\" : \"+r\"(R%d), \"+m\"(cf) : \"r\"(h));\n",
                   i, j, i + j, i + j + 1);
        }
      } else {
        /* Past the first column the low word must already exist. */
        assert(Rs[i + j]);
        if (!Rs[i + j + 1]) {
          if (cf) {
            /* High word fresh and an adcx was emitted earlier in this
               row: after writing R{i+j+1}, add `z` with adcx and mark
               the adcx chain as finished. */
            of = true;
            cf = false;
            Ra[i + j + 0] = true;
            Rs[i + j + 1] = true;
            Ra[i + j + 1] = false;
            (printf)("\
asm(\"mulx\\t%%2,%%1,%%0\" : \"=r\"(R%d), \"=r\"(l) : \"rm\"(A[%d]), \"d\"(B[%d]));\n\
asm(\"adox\\t%%2,%%0\" : \"+r\"(R%d), \"+m\"(of) : \"r\"(l));\n\
asm(\"adcx\\t%%2,%%0\" : \"+r\"(R%d), \"+m\"(cf) : \"r\"(z));\n",
                     i + j + 1, i, j, i + j, i + j + 1);
          } else {
            /* High word fresh, no adcx emitted in this row so far. */
            of = true;
            Ra[i + j + 0] = true;
            Rs[i + j + 1] = true;
            Ra[i + j + 1] = false;
            (printf)("\
asm(\"mulx\\t%%2,%%1,%%0\" : \"=r\"(R%d), \"=r\"(l) : \"rm\"(A[%d]), \"d\"(B[%d]));\n\
asm(\"adox\\t%%2,%%0\" : \"+r\"(R%d), \"+m\"(of) : \"r\"(l));\n",
                     i + j + 1, i, j, i + j);
          }
        } else {
          /* Both words already written: add both halves.
             NOTE(review): this branch passes h with an "rm" constraint
             while the i == 0 counterpart uses "r"; kept as-is so the
             generated text is unchanged. */
          of = true;
          cf = true;
          Ra[i + j + 0] = true;
          Ra[i + j + 1] = true;
          (printf)("\
asm(\"mulx\\t%%2,%%1,%%0\" : \"=r\"(h), \"=r\"(l) : \"rm\"(A[%d]), \"d\"(B[%d]));\n\
asm(\"adox\\t%%2,%%0\" : \"+r\"(R%d), \"+m\"(of) : \"r\"(l));\n\
asm(\"adcx\\t%%2,%%0\" : \"+r\"(R%d), \"+m\"(cf) : \"r\"(h));\n",
                   i, j, i + j, i + j + 1);
        }
      }
    }

    /* End of row (i == n here). If an adox was emitted, add `z` via
       adox into R{n+j}, continuing upward while the word just touched
       is marked in both Rs and Ra. */
    k1 = 0;
    if (of) {
      for (;; ++k1) {
        (printf)("\
asm(\"adox\\t%%2,%%0\" : \"+r\"(R%d), \"+m\"(of) : \"r\"(z));\n",
                 i + j + k1);
        if (!Rs[i + j + k1] || !Ra[i + j + k1]) {
          break;
        }
      }
    }

    /* Same for the adcx chain. */
    k2 = 0;
    if (cf) {
      for (;; ++k2) {
        (printf)("\
asm(\"adcx\\t%%2,%%0\" : \"+r\"(R%d), \"+m\"(cf) : \"r\"(z));\n",
                 i + j + k2);
        if (!Rs[i + j + k2] || !Ra[i + j + k2]) {
          break;
        }
      }
    }

    /* Mark the words walked over by the longer of the two loops. */
    for (g = 0; g < MAX(k1, k2); ++g) {
      Rs[i + j + g] = true;
    }
  }

  /* Copy the temporaries out to C[] only at the very end. */
  for (j = 0; j < n + m; ++j) {
    printf(" C[%d] = R%d;\n", j, j);
  }
  printf("}\n");
  fflush(stdout);
}
/* Largest limb count accepted per operand. With both operands at this
   limit, (n + m) * 64 as computed by the generator still fits in a
   32-bit signed int. */
enum { kMaxLimbs = 0x7fffffff / 128 };

/**
 * Parses a limb count given on the command line.
 *
 * Unlike atoi() this rejects empty strings, trailing garbage such as
 * "4x", and values outside [1, kMaxLimbs].
 *
 * @param s is NUL-terminated decimal text
 * @return parsed count, or -1 if `s` isn't an acceptable limb count
 */
static int ParseLimbCount(const char *s) {
  char *end;
  long v;
  v = strtol(s, &end, 10);
  if (end == s || *end != '\0') return -1;
  /* An out-of-range strtol() result saturates, so it lands above the
     cap (or below 1) and is rejected here as well. */
  if (v < 1 || v > kMaxLimbs) return -1;
  return (int)v;
}

/**
 * Program entry point: prints a multiplication kernel to stdout.
 *
 * Usage: PROG LHS-LIMBS RHS-LIMBS
 *
 * @return 0 on success, or 1 after printing usage to stderr if the
 *     arguments are missing or invalid
 */
int main(int argc, char *argv[]) {
  int n, m;
  if (argc != 3 || (n = ParseLimbCount(argv[1])) < 0 ||
      (m = ParseLimbCount(argv[2])) < 0) {
    /* argv[0] may be absent when argc is 0; never hand NULL to %s. */
    fprintf(stderr, "Usage: %s LHS-LIMBS RHS-LIMBS\n",
            argc > 0 && argv[0] ? argv[0] : "bigmul");
    return 1;
  }
  PrintMultiplyKernel(n, m);
  return 0;
}