cosmopolitan/libc/nexgen32e/mul4x4adx.S
Justine Tunney 398f0c16fb Add SNI support to redbean and improve SSL perf
This change makes SSL virtual hosting possible. You can now load
multiple certificates for multiple domains and redbean will just
figure out which one to use, even if you only have 1 ip address.
You can also use a jumbo certificate that lists all your domains
in the subject alternative names.

This change also makes performance improvements to MbedTLS. Here
are some benchmarks vs. cc1920749e

                                   BEFORE    AFTER   (microsecs)
suite_ssl.com                     2512881   191738 13.11x faster
suite_pkparse.com                   36291     3295 11.01x faster
suite_x509parse.com                854669   120293  7.10x faster
suite_pkwrite.com                    6549     1265  5.18x faster
suite_ecdsa.com                     53347    18778  2.84x faster
suite_pk.com                        49051    18717  2.62x faster
suite_ecdh.com                      19535     9502  2.06x faster
suite_shax.com                      15848     7965  1.99x faster
suite_rsa.com                      353257   184828  1.91x faster
suite_x509write.com                162646    85733  1.90x faster
suite_ecp.com                       20503    11050  1.86x faster
suite_hmac_drbg.no_reseed.com       19528    11417  1.71x faster
suite_hmac_drbg.nopr.com            12460     8010  1.56x faster
suite_mpi.com                      687124   442661  1.55x faster
suite_hmac_drbg.pr.com              11890     7752  1.53x faster

There aren't any special tricks to the performance improvements.
It's mostly due to code cleanup, assembly and intel instructions
like mulx, adox, and adcx.
2021-07-23 13:56:13 -07:00

116 lines
3.4 KiB
ArmAsm

/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2021 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
// Multiplies two 4-quadword unsigned integers into an 8-quadword product.
//
// Each operand is read as four 64-bit limbs at offsets 0, 8, 16 and 24,
// least significant limb first. The product is written as eight limbs
// at offsets 0 through 56 in the same order. The routine is straight
// line code: one row of four MULX partial products per limb of the
// right operand, where the low half of A[i]*B[j] is added into limb
// i+j and the high half into limb i+j+1. ADOX (carry in OF) and ADCX
// (carry in CF) serve as two independent carry chains within a row.
//
// Every load from the operands is issued before the first store to the
// result, so the result buffer may overlap either operand.
//
// In the comments below A is the array at %rsi, B is the array at %rdx
// and R is the array at %rdi.
//
// @param rdi points to the 8 quadwords that receive the product
// @param rsi points to the 4 quadwords of the left operand
// @param rdx points to the 4 quadwords of the right operand
// @note uses MULX (BMI2) and ADCX/ADOX (ADX); no CPU feature check
//       is performed here
// @note rbx, rbp and r12-r15 are saved and restored; rax, rcx, rdx,
//       rsi, r8-r11 and the flags are clobbered; rdi is not modified
// @note no return value is prepared; rax leaves holding R[5]
Mul4x4Adx:
// Frame setup. With %rbp equal to %rsp after the first push, the five
// saved registers occupy -8(%rbp) through -40(%rbp), and the 16 bytes
// reserved below them provide scratch slots at -48(%rbp) and -56(%rbp).
push %rbp
mov %rsp,%rbp
// NOTE(review): .profilable is a macro not defined in this file;
// presumably it comes from libc/macros.internal.h. Confirm its effect.
.profilable
push %r15
push %r14
push %r13
push %r12
// Keep the B pointer in r12, because rdx is about to be reused as the
// implicit multiplicand of MULX.
mov %rdx,%r12
push %rbx
sub $16,%rsp
// Row 0: A * B[0]. MULX multiplies %rdx by its first operand, writes
// the low half to the second operand and the high half to the third,
// and does not modify flags.
mov (%rdx),%rdx
mov (%rsi),%rax
// A[2] and A[3] stay in r11 and r10 for all four rows.
mov 16(%rsi),%r11
mov 24(%rsi),%r10
// Zero r13 and clear CF and OF before the first carry chain.
xor %r13d,%r13d
mulx %rax,%rbx,%rax
// Low half of A[0]*B[0] is R[0] and is already final; park it in the
// first scratch slot.
mov %rbx,-48(%rbp)
// A[1] stays in rbx until the last row consumes it.
mov 8(%rsi),%rbx
mulx %rbx,%rdx,%rcx
adox %rdx,%rax
// The previous MULX wrote its low half into rdx, so B[0] is reloaded.
mov (%r12),%rdx
mulx %r11,%rdx,%r9
adox %rdx,%rcx
// Same again: rdx was overwritten, reload B[0].
mov (%r12),%rdx
mulx %r10,%rdx,%r8
adox %rdx,%r9
// Fold the final OF carry of this row into the top limb (r13 is zero).
adox %r13,%r8
// Partial sums after row 0: rax=R[1] rcx=R[2] r9=R[3] r8=R[4].
//
// Row 1: A * B[1]. Clear CF and OF again; from here on each row adds
// low halves through ADOX and high halves through ADCX.
xor %r13d,%r13d
mov (%rsi),%r14
mov 8(%r12),%rdx
mulx %r14,%r14,%r15
adox %r14,%rax
adcx %r15,%rcx
// R[1] is final; park it in the second scratch slot, which frees rax
// to receive MULX high halves.
mov %rax,-56(%rbp)
mulx %rbx,%r14,%rax
adox %r14,%rcx
adcx %rax,%r9
mulx %r11,%r14,%rax
adox %r14,%r9
adcx %rax,%r8
// The high half of A[3]*B[1] starts the new top limb in rax.
mulx %r10,%rdx,%rax
adox %rdx,%r8
// Load B[2] for the next row early; MOV does not disturb CF or OF.
mov 16(%r12),%rdx
// Fold the final carry of each chain into the top limb (r13 is zero).
adcx %r13,%rax
adox %r13,%rax
// Partial sums after row 1: rcx=R[2] r9=R[3] r8=R[4] rax=R[5].
//
// Row 2: A * B[2]. A[0] is reloaded from memory, then r15 is zeroed,
// which also clears CF and OF.
mov (%rsi),%r13
xor %r15d,%r15d
mulx %r13,%r13,%r14
adox %r13,%rcx
adcx %r14,%r9
mulx %rbx,%r14,%r13
adox %r14,%r9
adcx %r13,%r8
mulx %r11,%r14,%r13
adox %r14,%r8
adcx %r13,%rax
// Last read through the A pointer: rsi now holds A[0] itself, ready
// for row 3.
mov (%rsi),%rsi
// The high half of A[3]*B[2] starts the new top limb in r13.
mulx %r10,%rdx,%r13
adox %rdx,%rax
adcx %r15,%r13
// Load B[3] between the two final carry folds; this is the last load
// from either operand.
mov 24(%r12),%rdx
adox %r15,%r13
// Partial sums after row 2: rcx=R[2] (final) r9=R[3] r8=R[4]
// rax=R[5] r13=R[6].
//
// Row 3: A * B[3]. The B pointer in r12 is no longer needed, so r12
// is reused for a low half. MULX leaves flags alone, so clearing CF
// and OF right after it is still in time for the first ADOX.
mulx %rsi,%r12,%rsi
xor %r14d,%r14d
adox %r12,%r9
adcx %rsi,%r8
mulx %rbx,%rsi,%rbx
adox %rsi,%r8
adcx %rbx,%rax
mulx %r11,%r11,%rsi
// Stores to R are interleaved with the remaining carry steps; MOV
// leaves CF and OF untouched. rbx is free now, so it fetches the
// parked R[1].
mov -56(%rbp),%rbx
mov %rcx,16(%rdi)
adcx %rsi,%r13
// rsi is free after the ADCX above, so it fetches the parked R[0].
mov -48(%rbp),%rsi
mov %rbx,8(%rdi)
adox %r11,%rax
mov %r9,24(%rdi)
mov %r8,32(%rdi)
mov %rax,40(%rdi)
// Final partial product A[3]*B[3]; its high half in r10 becomes R[7].
mulx %r10,%rdx,%r10
adox %rdx,%r13
adcx %r14,%r10
mov %r13,48(%rdi)
// Fold the last OF carry into the top limb (r14 is zero).
adox %r14,%r10
mov %rsi,(%rdi)
mov %r10,56(%rdi)
// Release the scratch slots and restore the saved registers in the
// reverse of the order they were pushed.
add $16,%rsp
pop %rbx
pop %r12
pop %r13
pop %r14
pop %r15
pop %rbp
ret
// NOTE(review): .endfn is a macro not defined in this file; presumably
// it closes the function symbol and, with globl, exports it. Confirm
// against libc/macros.internal.h.
.endfn Mul4x4Adx,globl