Make SSL handshakes much faster

This change boosts SSL handshake performance from 2,627 to ~10,000 per
second which is the same level of performance as NGINX at establishing
secure connections. That's impressive if we consider that redbean is a
forking frontend application server. This was accomplished by:

  1. Enabling either SSL session caching or SSL tickets. We choose to
     use tickets since they reduce network round trips too and that's
     a more important metric than wrk'ing localhost.

  2. Fixing mbedtls_mpi_sub_abs() which is the most frequently called
     function. It's called about 12,000 times during an SSL handshake
     since it's the basis of most arithmetic operations like addition
     and for some strange reason it was designed to make two needless
     copies in addition to calling malloc and free. That's now fixed.

  3. Improving TLS output buffering during the SSL handshake only, so
     that only a single is write and read system call is needed until
     blocking on the ping pong.

redbean will now do a better job wiping sensitive memory from a child
process as soon as it's not needed. The nice thing about fork is it's
much faster than reverse proxying so the goal is to use the different
address spaces along with setuid() to minimize the risk that a server
key will be compromised in the event that application code is hacked.
This commit is contained in:
Justine Tunney 2021-07-11 23:17:47 -07:00
parent 8c4cce043c
commit f3e28aa192
103 changed files with 1310 additions and 1085 deletions

24
libc/str/mempcpy-pure.c Normal file
View file

@ -0,0 +1,24 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2021 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/str/str.h"
void *mempcpy_pure(void *dst, const void *src, size_t n) {
memmove_pure(dst, src, n);
return (char *)dst + n;
}

View file

@ -23,11 +23,11 @@
static inline noasan size_t stpcpy_sse2(char *d, const char *s, size_t i) {
uint8_t v1[16], v2[16], vz[16];
for (;;) {
memset(vz, 0, 16);
memcpy(v1, s + i, 16);
__builtin_memset(vz, 0, 16);
__builtin_memcpy(v1, s + i, 16);
pcmpeqb(v2, v1, vz);
if (!pmovmskb(v2)) {
memcpy(d + i, v1, 16);
__builtin_memcpy(d + i, v1, 16);
i += 16;
} else {
break;

View file

@ -199,6 +199,12 @@ wchar_t *wchomp(wchar_t *);
bool escapedos(char16_t *, unsigned, const char16_t *, unsigned);
void *memset_pure(void *, int, size_t) memcpyesque;
void *memmove_pure(void *, const void *, size_t) memcpyesque;
void *mempcpy_pure(void *, const void *, size_t) memcpyesque;
size_t strlen_pure(const char *) strlenesque;
size_t strcspn_pure(const char *, const char *) strlenesque;
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § strings » multibyte
*/
@ -374,41 +380,20 @@ char *strsignal(int) returnsnonnull libcesque;
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § strings » address sanitizer
*/
void *memset_pure(void *, int, size_t) memcpyesque;
void *memmove_pure(void *, const void *, size_t) memcpyesque;
size_t strlen_pure(const char *) strlenesque;
size_t strcspn_pure(const char *, const char *) strlenesque;
#if defined(__FSANITIZE_ADDRESS__)
#define strcspn(STR, REJECT) strcspn_pure(STR, REJECT)
#undef strlen
#define strlen(STR) \
(__builtin_constant_p(STR) ? __builtin_strlen(STR) : strlen_pure(STR))
#undef memset
#define memset(DST, CHAR, SIZE) \
(__memcpy_isgoodsize(SIZE) ? __builtin_memset(DST, CHAR, SIZE) \
: memset_pure(DST, CHAR, SIZE))
#undef memmove
#define memmove(DST, SRC, SIZE) \
(__memcpy_isgoodsize(SIZE) ? __builtin_memmove(DST, SRC, SIZE) \
: memmove_pure(DST, SRC, SIZE))
#undef memcpy
#define memcpy(DST, SRC, SIZE) \
(__memcpy_isgoodsize(SIZE) ? __builtin_memcpy(DST, SRC, SIZE) \
: memmove_pure(DST, SRC, SIZE))
#undef memmove
#undef mempcpy
#define mempcpy(DST, SRC, SIZE) \
(__memcpy_isgoodsize(SIZE) ? __builtin_mempcpy(DST, SRC, SIZE) : ({ \
void *DsT = (DST); \
size_t SiZe = (SIZE); \
memmove_pure(DsT, SRC, SiZe); \
(void *)((char *)DsT + SiZe); \
}))
#undef memset
#undef strlen
#define memcpy memmove_pure
#define memmove memmove_pure
#define mempcpy mempcpy_pure
#define memset memset_pure
#define strcspn strcspn_pure
#define strlen strlen_pure
#endif /* __FSANITIZE_ADDRESS__ */
#endif /* __GNUC__ && !__STRICT_ANSI__ */

View file

@ -23,11 +23,11 @@
static noasan size_t strcpy_sse2(char *d, const char *s, size_t i) {
uint8_t v1[16], v2[16], vz[16];
for (;;) {
memset(vz, 0, 16);
memcpy(v1, s + i, 16);
__builtin_memset(vz, 0, 16);
__builtin_memcpy(v1, s + i, 16);
pcmpeqb(v2, v1, vz);
if (!pmovmskb(v2)) {
memcpy(d + i, v1, 16);
__builtin_memcpy(d + i, v1, 16);
i += 16;
} else {
break;

View file

@ -20,7 +20,7 @@
#include "libc/bits/bits.h"
#include "libc/str/str.h"
static noasan size_t strlen_pure_x64(const char *s, size_t i) {
static inline noasan size_t strlen_pure_x64(const char *s, size_t i) {
uint64_t w;
for (;; i += 8) {
w = READ64LE(s + i);

View file

@ -31,15 +31,15 @@ static const int16_t kDel16[8] = {127, 127, 127, 127, 127, 127, 127, 127};
static noasan axdx_t tprecode16to8_sse2(char *dst, size_t dstsize,
const char16_t *src, axdx_t r) {
int16_t v1[8], v2[8], v3[8], vz[8];
memset(vz, 0, 16);
__builtin_memset(vz, 0, 16);
while (r.ax + 8 < dstsize) {
memcpy(v1, src + r.dx, 16);
__builtin_memcpy(v1, src + r.dx, 16);
pcmpgtw(v2, v1, vz);
pcmpgtw(v3, v1, kDel16);
pandn((void *)v2, (void *)v3, (void *)v2);
if (pmovmskb((void *)v2) != 0xFFFF) break;
packsswb((void *)v1, v1, v1);
memcpy(dst + r.ax, v1, 8);
__builtin_memcpy(dst + r.ax, v1, 8);
r.ax += 8;
r.dx += 8;
}

View file

@ -28,15 +28,15 @@
static inline noasan axdx_t tprecode8to16_sse2(char16_t *dst, size_t dstsize,
const char *src, axdx_t r) {
uint8_t v1[16], v2[16], vz[16];
memset(vz, 0, 16);
__builtin_memset(vz, 0, 16);
while (r.ax + 16 < dstsize) {
memcpy(v1, src + r.dx, 16);
__builtin_memcpy(v1, src + r.dx, 16);
pcmpgtb((int8_t *)v2, (int8_t *)v1, (int8_t *)vz);
if (pmovmskb(v2) != 0xFFFF) break;
punpcklbw(v2, v1, vz);
punpckhbw(v1, v1, vz);
memcpy(dst + r.ax + 0, v2, 16);
memcpy(dst + r.ax + 8, v1, 16);
__builtin_memcpy(dst + r.ax + 0, v2, 16);
__builtin_memcpy(dst + r.ax + 8, v1, 16);
r.ax += 16;
r.dx += 16;
}