Polish greenbean example a bit more

Windows support for this example is still a work in progress. It's
encountering some unusual crashes. Thank you Chris Wellons for the cool
synchronization code too!
This commit is contained in:
Justine Tunney 2022-05-15 09:14:48 -07:00
parent e5e141d9b5
commit 91ee2b19d4
4 changed files with 145 additions and 81 deletions

View file

@ -16,6 +16,7 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE. PERFORMANCE OF THIS SOFTWARE.
*/ */
#include "libc/bits/atomic.h"
#include "libc/calls/calls.h" #include "libc/calls/calls.h"
#include "libc/calls/sigbits.h" #include "libc/calls/sigbits.h"
#include "libc/calls/struct/sigset.h" #include "libc/calls/struct/sigset.h"
@ -27,6 +28,7 @@
#include "libc/log/check.h" #include "libc/log/check.h"
#include "libc/log/log.h" #include "libc/log/log.h"
#include "libc/mem/mem.h" #include "libc/mem/mem.h"
#include "libc/runtime/runtime.h"
#include "libc/sock/goodsocket.internal.h" #include "libc/sock/goodsocket.internal.h"
#include "libc/sock/sock.h" #include "libc/sock/sock.h"
#include "libc/str/str.h" #include "libc/str/str.h"
@ -71,25 +73,25 @@
* Like redbean, greenbean has superior performance too, with an * Like redbean, greenbean has superior performance too, with an
* advantage on benchmarks biased towards high connection counts * advantage on benchmarks biased towards high connection counts
* *
* $ sudo wrk -c 300 -t 32 --latency http://127.0.0.1:8080/ * $ sudo wrk -c 300 -t 32 --latency http://10.10.10.124:8080/
* Running 10s test @ http://127.0.0.1:8080/ * Running 10s test @ http://10.10.10.124:8080/
* 32 threads and 300 connections * 32 threads and 300 connections
* Thread Stats Avg Stdev Max +/- Stdev * Thread Stats Avg Stdev Max +/- Stdev
* Latency 36.21us 133.39us 8.10ms 98.52% * Latency 1.07ms 8.27ms 138.55ms 98.58%
* Req/Sec 73.24k 28.92k 131.06k 47.49% * Req/Sec 37.98k 12.61k 117.65k 80.11%
* Latency Distribution * Latency Distribution
* 50% 22.00us * 50% 200.00us
* 75% 29.00us * 75% 227.00us
* 90% 40.00us * 90% 303.00us
* 99% 333.00us * 99% 32.46ms
* 4356560 requests in 4.62s, 1.29GB read * 10033090 requests in 8.31s, 2.96GB read
* Requests/sec: 942663.73 * Requests/sec: 1207983.58
* Transfer/sec: 284.98MB * Transfer/sec: 365.19MB
* *
*/ */
#define THREADS 32 #define THREADS 512
#define HEARTBEAT 500 #define HEARTBEAT 100
#define KEEPALIVE 5000 #define KEEPALIVE 5000
#define LOGGING 0 #define LOGGING 0
@ -98,23 +100,106 @@
"Referrer-Policy: origin\r\n" \ "Referrer-Policy: origin\r\n" \
"Cache-Control: private; max-age=0\r\n" "Cache-Control: private; max-age=0\r\n"
int workers; ////////////////////////////////////////////////////////////////////////////////
int barrier; // BEGIN: Chris Wellons's Public Domain GNU Atomics Library
// Sequentially consistent counter operations used by the spin barrier.
// BARRIER_INC evaluates to the counter's value after the increment.
#define BARRIER_GET(x) __atomic_load_n((x), __ATOMIC_SEQ_CST)
#define BARRIER_INC(x) __atomic_add_fetch((x), 1, __ATOMIC_SEQ_CST)

// Loads of the packed queue word: acquire ordering by default, or the
// relaxed flavor for reads that need no ordering.
#define ATOMIC_RLOAD(q) __atomic_load_n((q), __ATOMIC_RELAXED)
#define ATOMIC_LOAD(q)  __atomic_load_n((q), __ATOMIC_ACQUIRE)

// Release-ordered store and read-modify-write operations. The ADD and
// AND forms evaluate to the updated value.
#define ATOMIC_STORE(q, v) __atomic_store_n((q), (v), __ATOMIC_RELEASE)
#define ATOMIC_AND(q, m)   __atomic_and_fetch((q), (m), __ATOMIC_RELEASE)
#define ATOMIC_ADD(q, c)   __atomic_add_fetch((q), (c), __ATOMIC_RELEASE)

// Strong compare-and-swap (weak flag is 0): release ordering when the
// exchange succeeds, relaxed when it fails. `e` points at the expected
// value and receives the observed value on failure.
#define ATOMIC_CAS(q, e, d)                              \
  __atomic_compare_exchange_n((q), (e), (d), 0,          \
                              __ATOMIC_RELEASE, __ATOMIC_RELAXED)
// Begins a push: returns the array index where the next value should be
// stored, or -1 if the queue is full. The backing array must have
// (1 << exp) elements. Store the value at the returned index and then
// call queue_push_commit(); the value becomes visible to consumers only
// after the commit. With a single-consumer queue the element store need
// not be atomic.
//
// The queue word packs the head counter in its low 16 bits and the tail
// counter in its high 16 bits.
static int queue_push(uint32_t *q, int exp) {
  uint32_t word = ATOMIC_LOAD(q);
  int wrap = (1u << exp) - 1;
  int wr = word & wrap;
  int rd = (word >> 16) & wrap;
  int after = (wr + 1u) & wrap;
  // avoid overflow on commit: clear bit 15 of the head counter so the
  // upcoming increment cannot carry into the tail field
  if (word & 0x8000) {
    ATOMIC_AND(q, ~0x8000);
  }
  if (after == rd) {
    return -1;
  }
  return wr;
}
// Finishes a push started with queue_push() by advancing the head
// counter by one. Call it only after the element has been written into
// the array. This operation cannot fail.
static void queue_push_commit(uint32_t *q) {
  (void)ATOMIC_ADD(q, 1);
}
// Begins a pop: returns the array index of the next value to be read,
// or -1 if the queue is empty. The backing array must have (1 << exp)
// elements. Read the element at the returned index and then call
// queue_pop_commit(); the value is removed from the queue only after
// the commit. The element load need not be atomic.
static int queue_pop(uint32_t *q, int exp) {
  uint32_t word = ATOMIC_LOAD(q);
  int wrap = (1u << exp) - 1;
  int wr = word & wrap;
  int rd = (word >> 16) & wrap;
  if (wr == rd) {
    return -1;  // head caught up with tail: nothing queued
  }
  return rd;
}
// Finishes a pop started with queue_pop() by advancing the tail counter,
// which lives in the upper 16 bits of the queue word. Call it only after
// the element has been read from the array. This operation cannot fail.
static void queue_pop_commit(uint32_t *q) {
  const uint32_t one_tail_step = 0x10000;
  (void)ATOMIC_ADD(q, one_tail_step);
}
// Multiple-consumer variant of queue_pop(). Returns the array index of
// the next value to read, or -1 if the queue is empty, and records the
// observed queue word in *save for queue_mpop_commit(). The element load
// must be atomic because it runs concurrently with the producer's push,
// though a relaxed memory order suffices. The loaded value must not be
// used unless the commit succeeds.
static int queue_mpop(uint32_t *q, int exp, uint32_t *save) {
  uint32_t word = ATOMIC_LOAD(q);
  *save = word;
  int wrap = (1u << exp) - 1;
  int wr = word & wrap;
  int rd = (word >> 16) & wrap;
  if (wr == rd) {
    return -1;
  }
  return rd;
}
// Multiple-consumer variant of queue_pop_commit(). Tries to swap the
// queue word from the value saved by queue_mpop() to that value with the
// tail counter advanced by one. Returns false if the word changed in the
// meantime (e.g. another consumer popped concurrently), in which case
// the pop must be retried from the beginning.
static bool queue_mpop_commit(uint32_t *q, uint32_t save) {
  uint32_t advanced = save + 0x10000;
  return ATOMIC_CAS(q, &save, advanced);
}
// Spin-lock barrier for n threads, where n is a power of two.
// Initialize *barrier to zero.
//
// Each caller takes a ticket by incrementing the counter. The caller
// whose ticket is a multiple of n returns immediately; every other
// caller spins until bit `n` of the counter differs from bit `n` of its
// own ticket.
static void barrier_waitn(int *barrier, int n) {
  int ticket = BARRIER_INC(barrier);
  if (!(ticket & (n - 1))) {
    return;  // this arrival completed the group
  }
  int phase = ticket & n;
  while ((BARRIER_GET(barrier) & n) == phase) {
    donothing;
  }
}
// END: Chris Wellons's Public Domain GNU Atomics Library
////////////////////////////////////////////////////////////////////////////////
int barrier1;
int itsbegun;
int closingtime; int closingtime;
int barrier2;
int itsdone;
int Worker(void *id) { int Worker(void *id) {
int server, itsover, ready, yes = 1; int server, yes = 1;
// announce to the main process this has spawned kprintf(" %d", id);
kprintf(" #%.2ld", (intptr_t)id); barrier_waitn(&barrier1, THREADS);
__atomic_add_fetch(&workers, 1, __ATOMIC_SEQ_CST); itsbegun = true;
// wait for all threads to spawn before we proceed
for (;;) {
__atomic_load(&barrier, &ready, __ATOMIC_SEQ_CST);
if (ready) break;
__builtin_ia32_pause();
}
// load balance incoming connections for port 8080 across all threads // load balance incoming connections for port 8080 across all threads
// hangup on any browser clients that lag for more than a few seconds // hangup on any browser clients that lag for more than a few seconds
@ -131,7 +216,7 @@ int Worker(void *id) {
CHECK_EQ(0, listen(server, 10)); CHECK_EQ(0, listen(server, 10));
// connection loop // connection loop
for (;;) { while (!closingtime) {
struct tm tm; struct tm tm;
int64_t unixts; int64_t unixts;
struct Url url; struct Url url;
@ -143,15 +228,8 @@ int Worker(void *id) {
char inbuf[1500], outbuf[512], *p, *q; char inbuf[1500], outbuf[512], *p, *q;
int clientip, client, inmsglen, outmsglen; int clientip, client, inmsglen, outmsglen;
__atomic_load(&closingtime, &itsover, __ATOMIC_SEQ_CST);
if (itsover) break;
if (!IsLinux() &&
poll(&(struct pollfd){server, POLLIN}, 1, HEARTBEAT) < 1) {
continue;
}
// wait for client connection // wait for client connection
if (poll(&(struct pollfd){server, POLLIN}, 1, HEARTBEAT) < 1) continue;
clientaddrsize = sizeof(clientaddr); clientaddrsize = sizeof(clientaddr);
client = accept(server, &clientaddr, &clientaddrsize); client = accept(server, &clientaddr, &clientaddrsize);
@ -163,7 +241,7 @@ int Worker(void *id) {
// inherited by the accepted sockets, but using them also has the // inherited by the accepted sockets, but using them also has the
// side-effect that the listening socket fails with EAGAIN, every // side-effect that the listening socket fails with EAGAIN, every
// several seconds. we can use that to our advantage to check for // several seconds. we can use that to our advantage to check for
// the ctrl-c shutdown event; otherwise, we retry the accept call // the ctrl-c shutdowne event; otherwise, we retry the accept call
continue; continue;
} }
@ -179,7 +257,7 @@ int Worker(void *id) {
#if LOGGING #if LOGGING
// log the incoming http message // log the incoming http message
clientip = ntohl(clientaddr.sin_addr.s_addr); clientip = ntohl(clientaddr.sin_addr.s_addr);
kprintf("#%.2ld get some %d.%d.%d.%d:%d %#.*s\n", (intptr_t)id, kprintf("#%.4x get some %d.%d.%d.%d:%d %#.*s\n", (intptr_t)id,
(clientip & 0xff000000) >> 030, (clientip & 0x00ff0000) >> 020, (clientip & 0xff000000) >> 030, (clientip & 0x00ff0000) >> 020,
(clientip & 0x0000ff00) >> 010, (clientip & 0x000000ff) >> 000, (clientip & 0x0000ff00) >> 010, (clientip & 0x000000ff) >> 000,
ntohs(clientaddr.sin_port), msg.uri.b - msg.uri.a, ntohs(clientaddr.sin_port), msg.uri.b - msg.uri.a,
@ -239,8 +317,9 @@ int Worker(void *id) {
// inform the parent that this clone has finished // inform the parent that this clone has finished
close(server); close(server);
kprintf(" #%.2ld", (intptr_t)id); kprintf(" %d", id);
__atomic_sub_fetch(&workers, 1, __ATOMIC_SEQ_CST); barrier_waitn(&barrier2, THREADS);
itsdone = true;
return 0; return 0;
} }
@ -249,45 +328,20 @@ void OnCtrlC(int sig) {
} }
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
/* ShowCrashReports(); */
int64_t loadtzdbearly; int64_t loadtzdbearly;
int i, gotsome, haveleft, ready = 1;
ShowCrashReports();
kprintf("welcome to greenbean\n"); kprintf("welcome to greenbean\n");
gmtime(&loadtzdbearly); gmtime(&loadtzdbearly);
for (int i = 0; i < THREADS; ++i) {
// spawn a bunch of threads
for (i = 0; i < THREADS; ++i) {
void *stack = mmap(0, 65536, PROT_READ | PROT_WRITE, void *stack = mmap(0, 65536, PROT_READ | PROT_WRITE,
MAP_STACK | MAP_ANONYMOUS, -1, 0); MAP_STACK | MAP_ANONYMOUS, -1, 0);
clone(Worker, stack, 65536, clone(Worker, stack, 65536,
CLONE_THREAD | CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND, CLONE_THREAD | CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND,
(void *)(intptr_t)i, 0, 0, 0, 0); (void *)(intptr_t)i, 0, 0, 0, 0);
} }
while (!ATOMIC_LOAD(&itsbegun)) usleep(HEARTBEAT * 1000);
// wait for all threads to spawn
for (;;) {
__atomic_load(&workers, &gotsome, __ATOMIC_SEQ_CST);
if (workers == THREADS) break;
__builtin_ia32_pause();
}
// all threads are spawned so unleash the barrier
kprintf("\ngreenbean is ready to go\n");
sigaction(SIGINT, &(struct sigaction){.sa_handler = OnCtrlC}, 0); sigaction(SIGINT, &(struct sigaction){.sa_handler = OnCtrlC}, 0);
__atomic_store(&barrier, &ready, __ATOMIC_SEQ_CST); kprintf("\nit's begun\n");
while (!ATOMIC_LOAD(&itsdone)) usleep(HEARTBEAT * 1000);
// main process does nothing until it's closing time kprintf("\nthank you for flying greenbean\n");
for (;;) {
__atomic_load(&workers, &haveleft, __ATOMIC_SEQ_CST);
if (!haveleft) break;
__builtin_ia32_pause();
usleep(HEARTBEAT * 1000);
if (closingtime) {
kprintf("\rgreenbean is shutting down...\n");
}
}
kprintf("\n");
kprintf("thank you for flying greenbean\n");
} }

View file

@ -8,7 +8,7 @@
#define _KERNTRACE 0 /* not configurable w/ flag yet */ #define _KERNTRACE 0 /* not configurable w/ flag yet */
#define _POLLTRACE 0 /* not configurable w/ flag yet */ #define _POLLTRACE 0 /* not configurable w/ flag yet */
#define _DATATRACE 1 /* not configurable w/ flag yet */ #define _DATATRACE 1 /* not configurable w/ flag yet */
#define _NTTRACE 0 /* not configurable w/ flag yet */ #define _NTTRACE 1 /* not configurable w/ flag yet */
#define STRACE_PROLOGUE "%rSYS %5P %'18T " #define STRACE_PROLOGUE "%rSYS %5P %'18T "

View file

@ -121,6 +121,16 @@ forceinline void MakeLongDoubleLongAgain(void) {
asm volatile("fldcw\t%0" : /* no outputs */ : "m"(x87cw)); asm volatile("fldcw\t%0" : /* no outputs */ : "m"(x87cw));
} }
// https://nullprogram.com/blog/2022/02/18/
// Returns a pointer to the process's UTF-16 command line string without
// calling GetCommandLine(). It chases three pointers in inline assembly:
// the quadword at %gs:0x60, then the pointer stored 0x20 bytes into that
// object, then the pointer stored 0x78 bytes into that one.
// NOTE(review): on x86-64 Windows these offsets presumably correspond to
// TEB->ProcessEnvironmentBlock, PEB->ProcessParameters and
// RTL_USER_PROCESS_PARAMETERS.CommandLine.Buffer — confirm against the
// linked article. The string is returned in place; nothing is copied.
static inline char16_t *MyCommandLine(void) {
void *cmd;
// each mov overwrites %0 with the pointer loaded through its old value
asm("mov\t%%gs:(0x60),%0\n"
"mov\t0x20(%0),%0\n"
"mov\t0x78(%0),%0\n"
: "=r"(cmd));
return cmd;
}
static inline size_t StrLen16(const char16_t *s) { static inline size_t StrLen16(const char16_t *s) {
size_t n; size_t n;
for (n = 0;; ++n) { for (n = 0;; ++n) {
@ -271,7 +281,7 @@ __msabi textwindows int64_t WinMain(int64_t hInstance, int64_t hPrevInstance,
#if !IsTiny() #if !IsTiny()
__wincrashearly = AddVectoredExceptionHandler(1, (void *)OnEarlyWinCrash); __wincrashearly = AddVectoredExceptionHandler(1, (void *)OnEarlyWinCrash);
#endif #endif
cmdline = GetCommandLine(); cmdline = MyCommandLine();
#ifdef SYSDEBUG #ifdef SYSDEBUG
/* sloppy flag-only check for early initialization */ /* sloppy flag-only check for early initialization */
if (__strstr16(cmdline, u"--strace")) ++__strace; if (__strstr16(cmdline, u"--strace")) ++__strace;

View file

@ -1158,18 +1158,18 @@ syscon ms MS_INVALIDATE 2 2 2 4 2 0
# statvfs() flags # statvfs() flags
# #
# group name GNU/Systemd XNU's Not UNIX! FreeBSD OpenBSD NetBSD The New Technology Commentary # group name GNU/Systemd XNU's Not UNIX! FreeBSD OpenBSD NetBSD The New Technology Commentary
syscon statvfs ST_NOSUID 2 2 2 2 2 0 # unix consensus
syscon statvfs ST_RDONLY 1 1 1 1 1 0 # unix consensus syscon statvfs ST_RDONLY 1 1 1 1 1 0 # unix consensus
syscon statvfs ST_NOSUID 2 2 2 2 2 0 # unix consensus
syscon statvfs ST_NODEV 4 0 0 0 0x00000010 0
syscon statvfs ST_NOEXEC 8 0 0 0 4 0
syscon statvfs ST_SYNCHRONOUS 16 0 0 0 2 0
syscon statvfs ST_APPEND 0x0100 0 0 0 0 0 syscon statvfs ST_APPEND 0x0100 0 0 0 0 0
syscon statvfs ST_IMMUTABLE 0x0200 0 0 0 0 0 syscon statvfs ST_IMMUTABLE 0x0200 0 0 0 0 0
syscon statvfs ST_MANDLOCK 0x40 0 0 0 0 0 syscon statvfs ST_MANDLOCK 0x0040 0 0 0 0 0
syscon statvfs ST_NOATIME 0x0400 0 0 0x04000000 0 0 syscon statvfs ST_NOATIME 0x0400 0 0 0x04000000 0 0
syscon statvfs ST_NODEV 4 0 0 0 0x00000010 0
syscon statvfs ST_NODIRATIME 0x0800 0 0 0 0 0 syscon statvfs ST_NODIRATIME 0x0800 0 0 0 0 0
syscon statvfs ST_NOEXEC 8 0 0 0 4 0 syscon statvfs ST_WRITE 0x0080 0 0 0 0 0
syscon statvfs ST_RELATIME 0x1000 0 0 0 0x00020000 0 syscon statvfs ST_RELATIME 0x1000 0 0 0 0x00020000 0
syscon statvfs ST_SYNCHRONOUS 0x10 0 0 0 2 0
syscon statvfs ST_WRITE 0x80 0 0 0 0 0
# sendfile() flags # sendfile() flags
# #
@ -1442,7 +1442,7 @@ syscon termios IUTF8 0b0100000000000000 0b0100000000000000 0 0 0 0b010
# #
# group name GNU/Systemd XNU's Not UNIX! FreeBSD OpenBSD NetBSD The New Technology Commentary # group name GNU/Systemd XNU's Not UNIX! FreeBSD OpenBSD NetBSD The New Technology Commentary
syscon termios OPOST 0b0000000000000001 0b000000000000000001 0b000000000000000001 0b0000000000000001 0b0000000000000001 0b0000000000000001 # termios.c_oflag&=~OPOST disables output processing magic, e.g. MULTICS newlines syscon termios OPOST 0b0000000000000001 0b000000000000000001 0b000000000000000001 0b0000000000000001 0b0000000000000001 0b0000000000000001 # termios.c_oflag&=~OPOST disables output processing magic, e.g. MULTICS newlines
syscon termios OLCUC 0b0000000000000010 0 0 0b0000000000100000 0 0b0000000000000010 # termios.c_oflag|=OLCUC maps a-z → A-Z output syscon termios OLCUC 0b0000000000000010 0 0 0b0000000000100000 0 0b0000000000000010 # termios.c_oflag|=OLCUC maps a-z → A-Z output (SHOUTING)
syscon termios ONLCR 0b0000000000000100 0b000000000000000010 0b000000000000000010 0b0000000000000010 0b0000000000000010 0b0000000000000100 # termios.c_oflag|=ONLCR map \n → \r\n output (MULTICS newline) and requires OPOST syscon termios ONLCR 0b0000000000000100 0b000000000000000010 0b000000000000000010 0b0000000000000010 0b0000000000000010 0b0000000000000100 # termios.c_oflag|=ONLCR map \n → \r\n output (MULTICS newline) and requires OPOST
syscon termios OCRNL 0b0000000000001000 0b000000000000010000 0b000000000000010000 0b0000000000010000 0b0000000000010000 0b0000000000001000 # termios.c_oflag|=OCRNL maps \r → \n output syscon termios OCRNL 0b0000000000001000 0b000000000000010000 0b000000000000010000 0b0000000000010000 0b0000000000010000 0b0000000000001000 # termios.c_oflag|=OCRNL maps \r → \n output
syscon termios ONOCR 0b0000000000010000 0b000000000000100000 0b000000000000100000 0b0000000001000000 0b0000000001000000 0b0000000000010000 # termios.c_oflag|=ONOCR maps \r → ∅ output iff column 0 syscon termios ONOCR 0b0000000000010000 0b000000000000100000 0b000000000000100000 0b0000000001000000 0b0000000001000000 0b0000000000010000 # termios.c_oflag|=ONOCR maps \r → ∅ output iff column 0
@ -1478,14 +1478,14 @@ syscon termios FF1 0b1000000000000000 0b000100000000000000 0b0001000000000
# Teletypewriter Special Control Character Assignments # Teletypewriter Special Control Character Assignments
# #
# group name GNU/Systemd XNU's Not UNIX! FreeBSD OpenBSD NetBSD The New Technology Commentary # group name GNU/Systemd XNU's Not UNIX! FreeBSD OpenBSD NetBSD The New Technology Commentary
syscon termios VMIN 6+1 16 16 16 16 6 # termios.c_cc[VMIN]=𝑥 in non-canonical mode can be set to 0 for non-blocking reads, 1 for single character raw mode reads, or higher to buffer
syscon termios VTIME 5+1 17 17 17 17 5 # termios.c_cc[VTIME]=𝑥 sets non-canonical read timeout to 𝑥×𝟷𝟶𝟶ms which is needed when entering escape sequences manually with the escape key
syscon termios NCCS 20 20 20 20 20 20 # ARRAYLEN(termios.c_cc); we schlep c_line into c_cc on linux syscon termios NCCS 20 20 20 20 20 20 # ARRAYLEN(termios.c_cc); we schlep c_line into c_cc on linux
syscon termios VINTR 0+1 8 8 8 8 0 # termios.c_cc[VINTR]=𝑥 syscon termios VINTR 0+1 8 8 8 8 0 # termios.c_cc[VINTR]=𝑥
syscon termios VQUIT 1+1 9 9 9 9 1 # termios.c_cc[VQUIT]=𝑥 syscon termios VQUIT 1+1 9 9 9 9 1 # termios.c_cc[VQUIT]=𝑥
syscon termios VERASE 2+1 3 3 3 3 2 # termios.c_cc[VERASE]=𝑥 syscon termios VERASE 2+1 3 3 3 3 2 # termios.c_cc[VERASE]=𝑥
syscon termios VKILL 3+1 5 5 5 5 3 # termios.c_cc[VKILL]=𝑥 syscon termios VKILL 3+1 5 5 5 5 3 # termios.c_cc[VKILL]=𝑥
syscon termios VEOF 4+1 0 0 0 0 4 # termios.c_cc[VEOF]=𝑥 syscon termios VEOF 4+1 0 0 0 0 4 # termios.c_cc[VEOF]=𝑥
syscon termios VTIME 5+1 17 17 17 17 5 # termios.c_cc[VTIME]=𝑥 sets non-canonical read timeout to 𝑥×𝟷𝟶𝟶ms which is needed when entering escape sequences manually with the escape key
syscon termios VMIN 6+1 16 16 16 16 6 # termios.c_cc[VMIN]=𝑥 in non-canonical mode can be set to 0 for non-blocking reads, 1 for single character raw mode reads, or higher to buffer
syscon termios VSWTC 7+1 0 0 0 0 7 # termios.c_cc[VSWTC]=𝑥 syscon termios VSWTC 7+1 0 0 0 0 7 # termios.c_cc[VSWTC]=𝑥
syscon termios VSTART 8+1 12 12 12 12 8 # termios.c_cc[VSTART]=𝑥 syscon termios VSTART 8+1 12 12 12 12 8 # termios.c_cc[VSTART]=𝑥
syscon termios VSTOP 9+1 13 13 13 13 9 # termios.c_cc[VSTOP]=𝑥 syscon termios VSTOP 9+1 13 13 13 13 9 # termios.c_cc[VSTOP]=𝑥