diff --git a/libc/intrin/sched_yield.S b/libc/intrin/sched_yield.S index 556c45c25..9077234f3 100644 --- a/libc/intrin/sched_yield.S +++ b/libc/intrin/sched_yield.S @@ -89,7 +89,13 @@ sched_yield: ret #elif defined(__aarch64__) - mov x8,#0x7c + + mov x0,#0 + mov x1,#0 + mov x2,#0 + mov x3,#0 + mov x8,#0x7c // sched_yield() for linux + mov x16,#0x85d // select(0,0,0,0) for xnu svc 0 ret diff --git a/libc/runtime/clone.c b/libc/runtime/clone.c index 4ec9f3bdc..98c039fc3 100644 --- a/libc/runtime/clone.c +++ b/libc/runtime/clone.c @@ -18,12 +18,14 @@ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/sysv/consts/clone.h" #include "libc/assert.h" +#include "libc/atomic.h" #include "libc/calls/calls.h" #include "libc/calls/struct/ucontext-netbsd.internal.h" #include "libc/calls/syscall-sysv.internal.h" #include "libc/dce.h" #include "libc/errno.h" #include "libc/intrin/asan.internal.h" +#include "libc/intrin/atomic.h" #include "libc/intrin/describeflags.internal.h" #include "libc/intrin/kprintf.h" #include "libc/intrin/strace.internal.h" @@ -36,6 +38,7 @@ #include "libc/runtime/clone.internal.h" #include "libc/runtime/internal.h" #include "libc/runtime/runtime.h" +#include "libc/runtime/syslib.internal.h" #include "libc/sock/internal.h" #include "libc/stdalign.internal.h" #include "libc/str/str.h" @@ -50,8 +53,6 @@ #include "libc/thread/tls2.h" #include "libc/thread/xnu.internal.h" -#ifdef __x86_64__ - #define __NR_thr_new 455 #define __NR_clone_linux 56 #define __NR__lwp_create 309 @@ -62,10 +63,6 @@ #define LWP_DETACHED 0x00000040 #define LWP_SUSPENDED 0x00000080 -__msabi extern typeof(TlsSetValue) *const __imp_TlsSetValue; -__msabi extern typeof(ExitThread) *const __imp_ExitThread; -__msabi extern typeof(WakeByAddressAll) *const __imp_WakeByAddressAll; - struct CloneArgs { union { int tid; @@ -80,9 +77,15 @@ struct CloneArgs { void *arg; }; +#ifdef __x86_64__ + //////////////////////////////////////////////////////////////////////////////// // THE NEW TECHNOLOGY +__msabi extern typeof(TlsSetValue) *const __imp_TlsSetValue; +__msabi extern typeof(ExitThread) *const __imp_ExitThread; +__msabi extern typeof(WakeByAddressAll) *const __imp_WakeByAddressAll; + int WinThreadLaunch(void *arg, // rdi int tid, // rsi int (*func)(void *, int), // rdx @@ -143,12 +146,12 @@ static textwindows errno_t CloneWindows(int (*func)(void *, int), char *stk, //////////////////////////////////////////////////////////////////////////////// // XNU'S NOT UNIX -void XnuThreadThunk(void *pthread, // rdi - int machport, // rsi - void *(*func)(void *), // rdx - void *arg, // rcx - intptr_t *stack, // r8 - unsigned xnuflags); // r9 +void XnuThreadThunk(void *pthread, // rdi x0 + int machport, // rsi x1 + void *(*func)(void *), // rdx x2 + void *arg, // rcx x3 + intptr_t *stack, // r8 x4 + unsigned xnuflags); // r9 x5 asm("XnuThreadThunk:\n\t" "xor\t%ebp,%ebp\n\t" "mov\t%r8,%rsp\n\t" @@ -189,8 +192,7 @@ XnuThreadMain(void *pthread, // rdi // %r10 = uint32_t sem); asm volatile("movl\t$0,%0\n\t" // *wt->ztid = 0 "xor\t%%r10d,%%r10d\n\t" // sem = 0 - "syscall\n\t" // __bsdthread_terminate() - "ud2" + "syscall" // __bsdthread_terminate() : "=m"(*wt->ztid) : "a"(0x2000000 | 361), "D"(0), "S"(0), "d"(0L) : "rcx", "r10", "r11", "memory"); @@ -430,6 +432,52 @@ static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz, #endif /* __x86_64__ */ +#ifdef __aarch64__ + +//////////////////////////////////////////////////////////////////////////////// +// APPLE SILICON + +static void *SiliconThreadMain(void *arg) { + register struct CloneArgs *wt asm("x21") = arg; + asm volatile("ldr\tx28,%0" : /* no outputs */ : "m"(wt->tls)); + int tid = sys_gettid(); + *wt->ctid = tid; + *wt->ptid = tid; + register long x0 asm("x0") = (long)wt->arg; + register long x1 asm("x1") = (long)tid; + asm volatile("mov\tx19,x29\n\t" // save frame pointer + "mov\tx20,sp\n\t" // save stack pointer + "mov\tx29,#0\n\t" // reset backtrace + "mov\tsp,x21\n\t" // switch stack + "blr\t%2\n\t" // wt->func(wt->arg, tid) + "mov\tx29,x19\n\t" // restore frame pointer + "mov\tsp,x20" // restore stack pointer + : "+r"(x0) + : "r"(x1), "r"(wt->func) + : "x19", "x20", "memory"); + *wt->ztid = 0; + return 0; +} + +static errno_t CloneSilicon(int (*fn)(void *, int), char *stk, size_t stksz, + int flags, void *arg, void *tls, int *ptid, + int *ctid) { + pthread_t th; + struct CloneArgs *wt; + wt = (struct CloneArgs *)(((intptr_t)(stk + stksz) - + sizeof(struct CloneArgs)) & + -MAX(16, alignof(struct CloneArgs))); + wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid; + wt->ptid = flags & CLONE_PARENT_SETTID ? ptid : &wt->tid; + wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid; + wt->tls = flags & CLONE_SETTLS ? tls : 0; + wt->func = fn; + wt->arg = arg; + return __syslib->pthread_create(&th, 0, SiliconThreadMain, wt); +} + +#endif /* __aarch64__ */ + //////////////////////////////////////////////////////////////////////////////// // GNU/SYSTEMD @@ -605,9 +653,15 @@ errno_t clone(void *func, void *stk, size_t stksz, int flags, void *arg, CLONE_SIGHAND | CLONE_SYSVSEM)) { STRACE("cosmo clone() is picky about flags, see clone.c"); rc = EINVAL; -#ifdef __x86_64__ } else if (IsXnu()) { +#ifdef __x86_64__ rc = CloneXnu(func, stk, stksz, flags, arg, tls, ptid, ctid); +#elif defined(__aarch64__) + rc = CloneSilicon(func, stk, stksz, flags, arg, tls, ptid, ctid); +#else +#error "unsupported architecture" +#endif +#ifdef __x86_64__ } else if (IsFreebsd()) { rc = CloneFreebsd(func, stk, stksz, flags, arg, tls, ptid, ctid); } else if (IsNetbsd()) { diff --git a/third_party/ggml/llama.cc b/third_party/ggml/llama.cc index 32048ba7a..e18a2d727 100644 --- a/third_party/ggml/llama.cc +++ b/third_party/ggml/llama.cc @@ -1128,8 +1128,10 @@ static void llama_model_load_internal( const size_t mem_required_state = scale*MEM_REQ_KV_SELF().at(model.type); + if (verbose > 0) { fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); + } #ifdef GGML_USE_CUBLAS const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));