mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-03-02 23:18:44 +00:00
Get POSIX threads working on Apple Silicon
It's now possible to run a working ape-m1 o/aarch64/third_party/ggml/llama.com on Apple M1 hardware running XNU!
This commit is contained in:
parent
8fdb31681a
commit
b5eab2b0b7
3 changed files with 78 additions and 16 deletions
|
@ -89,7 +89,13 @@ sched_yield:
|
|||
ret
|
||||
|
||||
#elif defined(__aarch64__)
|
||||
mov x8,#0x7c
|
||||
|
||||
mov x0,#0
|
||||
mov x1,#0
|
||||
mov x2,#0
|
||||
mov x3,#0
|
||||
mov x8,#0x7c // sched_yield() for linux
|
||||
mov x16,#0x85d // select(0,0,0,0) for xnu
|
||||
svc 0
|
||||
ret
|
||||
|
||||
|
|
|
@ -18,12 +18,14 @@
|
|||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/sysv/consts/clone.h"
|
||||
#include "libc/assert.h"
|
||||
#include "libc/atomic.h"
|
||||
#include "libc/calls/calls.h"
|
||||
#include "libc/calls/struct/ucontext-netbsd.internal.h"
|
||||
#include "libc/calls/syscall-sysv.internal.h"
|
||||
#include "libc/dce.h"
|
||||
#include "libc/errno.h"
|
||||
#include "libc/intrin/asan.internal.h"
|
||||
#include "libc/intrin/atomic.h"
|
||||
#include "libc/intrin/describeflags.internal.h"
|
||||
#include "libc/intrin/kprintf.h"
|
||||
#include "libc/intrin/strace.internal.h"
|
||||
|
@ -36,6 +38,7 @@
|
|||
#include "libc/runtime/clone.internal.h"
|
||||
#include "libc/runtime/internal.h"
|
||||
#include "libc/runtime/runtime.h"
|
||||
#include "libc/runtime/syslib.internal.h"
|
||||
#include "libc/sock/internal.h"
|
||||
#include "libc/stdalign.internal.h"
|
||||
#include "libc/str/str.h"
|
||||
|
@ -50,8 +53,6 @@
|
|||
#include "libc/thread/tls2.h"
|
||||
#include "libc/thread/xnu.internal.h"
|
||||
|
||||
#ifdef __x86_64__
|
||||
|
||||
#define __NR_thr_new 455
|
||||
#define __NR_clone_linux 56
|
||||
#define __NR__lwp_create 309
|
||||
|
@ -62,10 +63,6 @@
|
|||
#define LWP_DETACHED 0x00000040
|
||||
#define LWP_SUSPENDED 0x00000080
|
||||
|
||||
__msabi extern typeof(TlsSetValue) *const __imp_TlsSetValue;
|
||||
__msabi extern typeof(ExitThread) *const __imp_ExitThread;
|
||||
__msabi extern typeof(WakeByAddressAll) *const __imp_WakeByAddressAll;
|
||||
|
||||
struct CloneArgs {
|
||||
union {
|
||||
int tid;
|
||||
|
@ -80,9 +77,15 @@ struct CloneArgs {
|
|||
void *arg;
|
||||
};
|
||||
|
||||
#ifdef __x86_64__
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// THE NEW TECHNOLOGY
|
||||
|
||||
__msabi extern typeof(TlsSetValue) *const __imp_TlsSetValue;
|
||||
__msabi extern typeof(ExitThread) *const __imp_ExitThread;
|
||||
__msabi extern typeof(WakeByAddressAll) *const __imp_WakeByAddressAll;
|
||||
|
||||
int WinThreadLaunch(void *arg, // rdi
|
||||
int tid, // rsi
|
||||
int (*func)(void *, int), // rdx
|
||||
|
@ -143,12 +146,12 @@ static textwindows errno_t CloneWindows(int (*func)(void *, int), char *stk,
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
// XNU'S NOT UNIX
|
||||
|
||||
void XnuThreadThunk(void *pthread, // rdi
|
||||
int machport, // rsi
|
||||
void *(*func)(void *), // rdx
|
||||
void *arg, // rcx
|
||||
intptr_t *stack, // r8
|
||||
unsigned xnuflags); // r9
|
||||
void XnuThreadThunk(void *pthread, // rdi x0
|
||||
int machport, // rsi x1
|
||||
void *(*func)(void *), // rdx x2
|
||||
void *arg, // rcx x3
|
||||
intptr_t *stack, // r8 x4
|
||||
unsigned xnuflags); // r9 x5
|
||||
asm("XnuThreadThunk:\n\t"
|
||||
"xor\t%ebp,%ebp\n\t"
|
||||
"mov\t%r8,%rsp\n\t"
|
||||
|
@ -189,8 +192,7 @@ XnuThreadMain(void *pthread, // rdi
|
|||
// %r10 = uint32_t sem);
|
||||
asm volatile("movl\t$0,%0\n\t" // *wt->ztid = 0
|
||||
"xor\t%%r10d,%%r10d\n\t" // sem = 0
|
||||
"syscall\n\t" // __bsdthread_terminate()
|
||||
"ud2"
|
||||
"syscall" // __bsdthread_terminate()
|
||||
: "=m"(*wt->ztid)
|
||||
: "a"(0x2000000 | 361), "D"(0), "S"(0), "d"(0L)
|
||||
: "rcx", "r10", "r11", "memory");
|
||||
|
@ -430,6 +432,52 @@ static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz,
|
|||
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
#ifdef __aarch64__
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// APPLE SILICON
|
||||
|
||||
static void *SiliconThreadMain(void *arg) {
|
||||
register struct CloneArgs *wt asm("x21") = arg;
|
||||
asm volatile("ldr\tx28,%0" : /* no outputs */ : "m"(wt->tls));
|
||||
int tid = sys_gettid();
|
||||
*wt->ctid = tid;
|
||||
*wt->ptid = tid;
|
||||
register long x0 asm("x0") = (long)wt->arg;
|
||||
register long x1 asm("x1") = (long)tid;
|
||||
asm volatile("mov\tx19,x29\n\t" // save frame pointer
|
||||
"mov\tx20,sp\n\t" // save stack pointer
|
||||
"mov\tx29,#0\n\t" // reset backtrace
|
||||
"mov\tsp,x21\n\t" // switch stack
|
||||
"blr\t%2\n\t" // wt->func(wt->arg, tid)
|
||||
"mov\tx29,x19\n\t" // restore frame pointer
|
||||
"mov\tsp,x20" // restore stack pointer
|
||||
: "+r"(x0)
|
||||
: "r"(x1), "r"(wt->func)
|
||||
: "x19", "x20", "memory");
|
||||
*wt->ztid = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static errno_t CloneSilicon(int (*fn)(void *, int), char *stk, size_t stksz,
|
||||
int flags, void *arg, void *tls, int *ptid,
|
||||
int *ctid) {
|
||||
pthread_t th;
|
||||
struct CloneArgs *wt;
|
||||
wt = (struct CloneArgs *)(((intptr_t)(stk + stksz) -
|
||||
sizeof(struct CloneArgs)) &
|
||||
-MAX(16, alignof(struct CloneArgs)));
|
||||
wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
|
||||
wt->ptid = flags & CLONE_PARENT_SETTID ? ptid : &wt->tid;
|
||||
wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
|
||||
wt->tls = flags & CLONE_SETTLS ? tls : 0;
|
||||
wt->func = fn;
|
||||
wt->arg = arg;
|
||||
return __syslib->pthread_create(&th, 0, SiliconThreadMain, wt);
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// GNU/SYSTEMD
|
||||
|
||||
|
@ -605,9 +653,15 @@ errno_t clone(void *func, void *stk, size_t stksz, int flags, void *arg,
|
|||
CLONE_SIGHAND | CLONE_SYSVSEM)) {
|
||||
STRACE("cosmo clone() is picky about flags, see clone.c");
|
||||
rc = EINVAL;
|
||||
#ifdef __x86_64__
|
||||
} else if (IsXnu()) {
|
||||
#ifdef __x86_64__
|
||||
rc = CloneXnu(func, stk, stksz, flags, arg, tls, ptid, ctid);
|
||||
#elif defined(__aarch64__)
|
||||
rc = CloneSilicon(func, stk, stksz, flags, arg, tls, ptid, ctid);
|
||||
#else
|
||||
#error "unsupported architecture"
|
||||
#endif
|
||||
#ifdef __x86_64__
|
||||
} else if (IsFreebsd()) {
|
||||
rc = CloneFreebsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
|
||||
} else if (IsNetbsd()) {
|
||||
|
|
2
third_party/ggml/llama.cc
vendored
2
third_party/ggml/llama.cc
vendored
|
@ -1128,8 +1128,10 @@ static void llama_model_load_internal(
|
|||
const size_t mem_required_state =
|
||||
scale*MEM_REQ_KV_SELF().at(model.type);
|
||||
|
||||
if (verbose > 0) {
|
||||
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
||||
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
||||
|
|
Loading…
Add table
Reference in a new issue