diff --git a/examples/greenbean.c b/examples/greenbean.c index 72b761edd..29b3c3596 100644 --- a/examples/greenbean.c +++ b/examples/greenbean.c @@ -16,6 +16,7 @@ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/bits/atomic.h" #include "libc/calls/calls.h" #include "libc/calls/sigbits.h" #include "libc/calls/struct/sigset.h" @@ -27,6 +28,7 @@ #include "libc/log/check.h" #include "libc/log/log.h" #include "libc/mem/mem.h" +#include "libc/runtime/runtime.h" #include "libc/sock/goodsocket.internal.h" #include "libc/sock/sock.h" #include "libc/str/str.h" @@ -71,25 +73,25 @@ * Like redbean, greenbean has superior performance too, with an * advantage on benchmarks biased towards high connection counts * - * $ sudo wrk -c 300 -t 32 --latency http://127.0.0.1:8080/ - * Running 10s test @ http://127.0.0.1:8080/ + * $ sudo wrk -c 300 -t 32 --latency http://10.10.10.124:8080/ + * Running 10s test @ http://10.10.10.124:8080/ * 32 threads and 300 connections * Thread Stats Avg Stdev Max +/- Stdev - * Latency 36.21us 133.39us 8.10ms 98.52% - * Req/Sec 73.24k 28.92k 131.06k 47.49% + * Latency 1.07ms 8.27ms 138.55ms 98.58% + * Req/Sec 37.98k 12.61k 117.65k 80.11% * Latency Distribution - * 50% 22.00us - * 75% 29.00us - * 90% 40.00us - * 99% 333.00us - * 4356560 requests in 4.62s, 1.29GB read - * Requests/sec: 942663.73 - * Transfer/sec: 284.98MB + * 50% 200.00us + * 75% 227.00us + * 90% 303.00us + * 99% 32.46ms + * 10033090 requests in 8.31s, 2.96GB read + * Requests/sec: 1207983.58 + * Transfer/sec: 365.19MB * */ -#define THREADS 32 -#define HEARTBEAT 500 +#define THREADS 512 +#define HEARTBEAT 100 #define KEEPALIVE 5000 #define LOGGING 0 @@ -98,23 +100,106 @@ "Referrer-Policy: origin\r\n" \ "Cache-Control: private; max-age=0\r\n" -int workers; -int barrier; 
+//////////////////////////////////////////////////////////////////////////////// +// BEGIN: Chris Wellons's Public Domain GNU Atomics Library + +#define BARRIER_INC(x) __atomic_add_fetch(x, 1, __ATOMIC_SEQ_CST) +#define BARRIER_GET(x) __atomic_load_n(x, __ATOMIC_SEQ_CST) +#define ATOMIC_LOAD(q) __atomic_load_n(q, __ATOMIC_ACQUIRE) +#define ATOMIC_RLOAD(q) __atomic_load_n(q, __ATOMIC_RELAXED) +#define ATOMIC_STORE(q, v) __atomic_store_n(q, v, __ATOMIC_RELEASE) +#define ATOMIC_ADD(q, c) __atomic_add_fetch(q, c, __ATOMIC_RELEASE) +#define ATOMIC_AND(q, m) __atomic_and_fetch(q, m, __ATOMIC_RELEASE) +#define ATOMIC_CAS(q, e, d) \ + __atomic_compare_exchange_n(q, e, d, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED) + +// Return the array index for the next value to be pushed. The size of this +// array must be (1 << exp) elements. Write the value into this array index, +// then commit it. With a single-consumer queue, this element store need not +// be atomic. The value will appear in the queue after the commit. Returns +// -1 if the queue is full. +static int queue_push(uint32_t *q, int exp) { + uint32_t r = ATOMIC_LOAD(q); + int mask = (1u << exp) - 1; + int head = r & mask; + int tail = r >> 16 & mask; + int next = (head + 1u) & mask; + if (r & 0x8000) { // avoid overflow on commit + ATOMIC_AND(q, ~0x8000); + } + return next == tail ? -1 : head; +} + +// Commits and completes the push operation. Do this after storing into the +// array. This operation cannot fail. +static void queue_push_commit(uint32_t *q) { + ATOMIC_ADD(q, 1); +} + +// Return the array index for the next value to be popped. The size of this +// array must be (1 << exp) elements. Read from this array index, then +// commit the pop. This element load need not be atomic. The value will be +// removed from the queue after the commit. Returns -1 if the queue is +// empty. 
+static int queue_pop(uint32_t *q, int exp) { + uint32_t r = ATOMIC_LOAD(q); + int mask = (1u << exp) - 1; + int head = r & mask; + int tail = r >> 16 & mask; + return head == tail ? -1 : tail; +} + +// Commits and completes the pop operation. Do this after loading from the +// array. This operation cannot fail. +static void queue_pop_commit(uint32_t *q) { + ATOMIC_ADD(q, 0x10000); +} + +// Like queue_pop() but for multiple-consumer queues. The element load must +// be atomic since it is concurrent with the producer's push, though it can +// use a relaxed memory order. The loaded value must not be used unless the +// commit is successful. Stores a temporary "save" to be used at commit. +static int queue_mpop(uint32_t *q, int exp, uint32_t *save) { + uint32_t r = *save = ATOMIC_LOAD(q); + int mask = (1u << exp) - 1; + int head = r & mask; + int tail = r >> 16 & mask; + return head == tail ? -1 : tail; +} + +// Like queue_pop_commit() but for multiple-consumer queues. It may fail if +// another consumer pops concurrently, in which case the pop must be retried +// from the beginning. +static bool queue_mpop_commit(uint32_t *q, uint32_t save) { + return ATOMIC_CAS(q, &save, save + 0x10000); +} + +// Spin-lock barrier for n threads, where n is a power of two. +// Initialize *barrier to zero. 
+static void barrier_waitn(int *barrier, int n) { + int v = BARRIER_INC(barrier); + if (v & (n - 1)) { + for (v &= n; (BARRIER_GET(barrier) & n) == v;) { + donothing; + } + } +} + +// END: Chris Wellons's Public Domain GNU Atomics Library +//////////////////////////////////////////////////////////////////////////////// + +int barrier1; +int itsbegun; int closingtime; +int barrier2; +int itsdone; int Worker(void *id) { - int server, itsover, ready, yes = 1; + int server, yes = 1; - // announce to the main process this has spawned - kprintf(" #%.2ld", (intptr_t)id); - __atomic_add_fetch(&workers, 1, __ATOMIC_SEQ_CST); - - // wait for all threads to spawn before we proceed - for (;;) { - __atomic_load(&barrier, &ready, __ATOMIC_SEQ_CST); - if (ready) break; - __builtin_ia32_pause(); - } + kprintf(" %d", id); + barrier_waitn(&barrier1, THREADS); + itsbegun = true; // load balance incoming connections for port 8080 across all threads // hangup on any browser clients that lag for more than a few seconds @@ -131,7 +216,7 @@ int Worker(void *id) { CHECK_EQ(0, listen(server, 10)); // connection loop - for (;;) { + while (!closingtime) { struct tm tm; int64_t unixts; struct Url url; @@ -143,15 +228,8 @@ int Worker(void *id) { char inbuf[1500], outbuf[512], *p, *q; int clientip, client, inmsglen, outmsglen; - __atomic_load(&closingtime, &itsover, __ATOMIC_SEQ_CST); - if (itsover) break; - - if (!IsLinux() && - poll(&(struct pollfd){server, POLLIN}, 1, HEARTBEAT) < 1) { - continue; - } - // wait for client connection + if (poll(&(struct pollfd){server, POLLIN}, 1, HEARTBEAT) < 1) continue; clientaddrsize = sizeof(clientaddr); client = accept(server, &clientaddr, &clientaddrsize); @@ -163,7 +241,7 @@ int Worker(void *id) { // inherited by the accepted sockets, but using them also has the // side-effect that the listening socket fails with EAGAIN, every // several seconds. 
we can use that to our advantage to check for - // the ctrl-c shutdown event; otherwise, we retry the accept call + // the ctrl-c shutdown event; otherwise, we retry the accept call continue; } @@ -179,7 +257,7 @@ int Worker(void *id) { #if LOGGING // log the incoming http message clientip = ntohl(clientaddr.sin_addr.s_addr); - kprintf("#%.2ld get some %d.%d.%d.%d:%d %#.*s\n", (intptr_t)id, + kprintf("#%.4x get some %d.%d.%d.%d:%d %#.*s\n", (intptr_t)id, (clientip & 0xff000000) >> 030, (clientip & 0x00ff0000) >> 020, (clientip & 0x0000ff00) >> 010, (clientip & 0x000000ff) >> 000, ntohs(clientaddr.sin_port), msg.uri.b - msg.uri.a, @@ -239,8 +317,9 @@ int Worker(void *id) { // inform the parent that this clone has finished close(server); - kprintf(" #%.2ld", (intptr_t)id); - __atomic_sub_fetch(&workers, 1, __ATOMIC_SEQ_CST); + kprintf(" %d", id); + barrier_waitn(&barrier2, THREADS); + itsdone = true; return 0; } @@ -249,45 +328,20 @@ void OnCtrlC(int sig) { } int main(int argc, char *argv[]) { + /* ShowCrashReports(); */ int64_t loadtzdbearly; - int i, gotsome, haveleft, ready = 1; - - ShowCrashReports(); kprintf("welcome to greenbean\n"); gmtime(&loadtzdbearly); - - // spawn a bunch of threads - for (i = 0; i < THREADS; ++i) { + for (int i = 0; i < THREADS; ++i) { void *stack = mmap(0, 65536, PROT_READ | PROT_WRITE, MAP_STACK | MAP_ANONYMOUS, -1, 0); clone(Worker, stack, 65536, CLONE_THREAD | CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND, (void *)(intptr_t)i, 0, 0, 0, 0); } - - // wait for all threads to spawn - for (;;) { - __atomic_load(&workers, &gotsome, __ATOMIC_SEQ_CST); - if (workers == THREADS) break; - __builtin_ia32_pause(); - } - - // all threads are spawned so unleash the barrier - kprintf("\ngreenbean is ready to go\n"); + while (!ATOMIC_LOAD(&itsbegun)) usleep(HEARTBEAT * 1000); sigaction(SIGINT, &(struct sigaction){.sa_handler = OnCtrlC}, 0); - __atomic_store(&barrier, &ready, __ATOMIC_SEQ_CST); - - // main process does nothing until it's closing 
time - for (;;) { - __atomic_load(&workers, &haveleft, __ATOMIC_SEQ_CST); - if (!haveleft) break; - __builtin_ia32_pause(); - usleep(HEARTBEAT * 1000); - if (closingtime) { - kprintf("\rgreenbean is shutting down...\n"); - } - } - - kprintf("\n"); - kprintf("thank you for flying greenbean\n"); + kprintf("\nit's begun\n"); + while (!ATOMIC_LOAD(&itsdone)) usleep(HEARTBEAT * 1000); + kprintf("\nthank you for flying greenbean\n"); } diff --git a/libc/calls/strace.internal.h b/libc/calls/strace.internal.h index 1051c0044..fbfa984a6 100644 --- a/libc/calls/strace.internal.h +++ b/libc/calls/strace.internal.h @@ -8,7 +8,7 @@ #define _KERNTRACE 0 /* not configurable w/ flag yet */ #define _POLLTRACE 0 /* not configurable w/ flag yet */ #define _DATATRACE 1 /* not configurable w/ flag yet */ -#define _NTTRACE 0 /* not configurable w/ flag yet */ +#define _NTTRACE 1 /* not configurable w/ flag yet */ #define STRACE_PROLOGUE "%rSYS %5P %'18T " diff --git a/libc/runtime/winmain.greg.c b/libc/runtime/winmain.greg.c index 74bc3e64d..4d737dffa 100644 --- a/libc/runtime/winmain.greg.c +++ b/libc/runtime/winmain.greg.c @@ -121,6 +121,16 @@ forceinline void MakeLongDoubleLongAgain(void) { asm volatile("fldcw\t%0" : /* no outputs */ : "m"(x87cw)); } +// https://nullprogram.com/blog/2022/02/18/ +static inline char16_t *MyCommandLine(void) { + void *cmd; + asm("mov\t%%gs:(0x60),%0\n" + "mov\t0x20(%0),%0\n" + "mov\t0x78(%0),%0\n" + : "=r"(cmd)); + return cmd; +} + static inline size_t StrLen16(const char16_t *s) { size_t n; for (n = 0;; ++n) { @@ -271,7 +281,7 @@ __msabi textwindows int64_t WinMain(int64_t hInstance, int64_t hPrevInstance, #if !IsTiny() __wincrashearly = AddVectoredExceptionHandler(1, (void *)OnEarlyWinCrash); #endif - cmdline = GetCommandLine(); + cmdline = MyCommandLine(); #ifdef SYSDEBUG /* sloppy flag-only check for early initialization */ if (__strstr16(cmdline, u"--strace")) ++__strace; diff --git a/libc/sysv/consts.sh b/libc/sysv/consts.sh index 
2ad3f7c43..3be697aa8 100755 --- a/libc/sysv/consts.sh +++ b/libc/sysv/consts.sh @@ -1158,18 +1158,18 @@ syscon ms MS_INVALIDATE 2 2 2 4 2 0 # statvfs() flags # # group name GNU/Systemd XNU's Not UNIX! FreeBSD OpenBSD NetBSD The New Technology Commentary -syscon statvfs ST_NOSUID 2 2 2 2 2 0 # unix consensus syscon statvfs ST_RDONLY 1 1 1 1 1 0 # unix consensus +syscon statvfs ST_NOSUID 2 2 2 2 2 0 # unix consensus +syscon statvfs ST_NODEV 4 0 0 0 0x00000010 0 +syscon statvfs ST_NOEXEC 8 0 0 0 4 0 +syscon statvfs ST_SYNCHRONOUS 16 0 0 0 2 0 syscon statvfs ST_APPEND 0x0100 0 0 0 0 0 syscon statvfs ST_IMMUTABLE 0x0200 0 0 0 0 0 -syscon statvfs ST_MANDLOCK 0x40 0 0 0 0 0 +syscon statvfs ST_MANDLOCK 0x0040 0 0 0 0 0 syscon statvfs ST_NOATIME 0x0400 0 0 0x04000000 0 0 -syscon statvfs ST_NODEV 4 0 0 0 0x00000010 0 syscon statvfs ST_NODIRATIME 0x0800 0 0 0 0 0 -syscon statvfs ST_NOEXEC 8 0 0 0 4 0 +syscon statvfs ST_WRITE 0x0080 0 0 0 0 0 syscon statvfs ST_RELATIME 0x1000 0 0 0 0x00020000 0 -syscon statvfs ST_SYNCHRONOUS 0x10 0 0 0 2 0 -syscon statvfs ST_WRITE 0x80 0 0 0 0 0 # sendfile() flags # @@ -1442,7 +1442,7 @@ syscon termios IUTF8 0b0100000000000000 0b0100000000000000 0 0 0 0b010 # # group name GNU/Systemd XNU's Not UNIX! FreeBSD OpenBSD NetBSD The New Technology Commentary syscon termios OPOST 0b0000000000000001 0b000000000000000001 0b000000000000000001 0b0000000000000001 0b0000000000000001 0b0000000000000001 # termios.c_oflag&=~OPOST disables output processing magic, e.g. 
MULTICS newlines -syscon termios OLCUC 0b0000000000000010 0 0 0b0000000000100000 0 0b0000000000000010 # termios.c_oflag|=OLCUC maps a-z → A-Z output +syscon termios OLCUC 0b0000000000000010 0 0 0b0000000000100000 0 0b0000000000000010 # termios.c_oflag|=OLCUC maps a-z → A-Z output (SHOUTING) syscon termios ONLCR 0b0000000000000100 0b000000000000000010 0b000000000000000010 0b0000000000000010 0b0000000000000010 0b0000000000000100 # termios.c_oflag|=ONLCR map \n → \r\n output (MULTICS newline) and requires OPOST syscon termios OCRNL 0b0000000000001000 0b000000000000010000 0b000000000000010000 0b0000000000010000 0b0000000000010000 0b0000000000001000 # termios.c_oflag|=OCRNL maps \r → \n output syscon termios ONOCR 0b0000000000010000 0b000000000000100000 0b000000000000100000 0b0000000001000000 0b0000000001000000 0b0000000000010000 # termios.c_oflag|=ONOCR maps \r → ∅ output iff column 0 @@ -1478,14 +1478,14 @@ syscon termios FF1 0b1000000000000000 0b000100000000000000 0b0001000000000 # Teletypewriter Special Control Character Assignments # # group name GNU/Systemd XNU's Not UNIX! 
FreeBSD OpenBSD NetBSD The New Technology Commentary +syscon termios VMIN 6+1 16 16 16 16 6 # termios.c_cc[VMIN]=𝑥 in non-canonical mode can be set to 0 for non-blocking reads, 1 for single character raw mode reads, or higher to buffer +syscon termios VTIME 5+1 17 17 17 17 5 # termios.c_cc[VTIME]=𝑥 sets non-canonical read timeout to 𝑥×𝟷𝟶𝟶ms which is needed when entering escape sequences manually with the escape key syscon termios NCCS 20 20 20 20 20 20 # ARRAYLEN(termios.c_cc); we schlep c_line into c_cc on linux syscon termios VINTR 0+1 8 8 8 8 0 # termios.c_cc[VINTR]=𝑥 syscon termios VQUIT 1+1 9 9 9 9 1 # termios.c_cc[VQUIT]=𝑥 syscon termios VERASE 2+1 3 3 3 3 2 # termios.c_cc[VERASE]=𝑥 syscon termios VKILL 3+1 5 5 5 5 3 # termios.c_cc[VKILL]=𝑥 syscon termios VEOF 4+1 0 0 0 0 4 # termios.c_cc[VEOF]=𝑥 -syscon termios VTIME 5+1 17 17 17 17 5 # termios.c_cc[VTIME]=𝑥 sets non-canonical read timeout to 𝑥×𝟷𝟶𝟶ms which is needed when entering escape sequences manually with the escape key -syscon termios VMIN 6+1 16 16 16 16 6 # termios.c_cc[VMIN]=𝑥 in non-canonical mode can be set to 0 for non-blocking reads, 1 for single character raw mode reads, or higher to buffer syscon termios VSWTC 7+1 0 0 0 0 7 # termios.c_cc[VSWTC]=𝑥 syscon termios VSTART 8+1 12 12 12 12 8 # termios.c_cc[VSTART]=𝑥 syscon termios VSTOP 9+1 13 13 13 13 9 # termios.c_cc[VSTOP]=𝑥