mirror of
				https://github.com/jart/cosmopolitan.git
				synced 2025-10-25 18:50:57 +00:00 
			
		
		
		
	Get POSIX threads working on Apple Silicon
It's now possible to successfully run
    ape-m1 o/aarch64/third_party/ggml/llama.com
on Apple M1 hardware running XNU!
			
			
This commit is contained in:
		
							parent
							
								
									8fdb31681a
								
							
						
					
					
						commit
						b5eab2b0b7
					
				
					 3 changed files with 78 additions and 16 deletions
				
			
		|  | @ -89,7 +89,13 @@ sched_yield: | ||||||
| 	ret | 	ret | ||||||
| 
 | 
 | ||||||
| #elif defined(__aarch64__) | #elif defined(__aarch64__) | ||||||
| 	mov	x8,#0x7c | 
 | ||||||
|  | 	mov	x0,#0 | ||||||
|  | 	mov	x1,#0 | ||||||
|  | 	mov	x2,#0 | ||||||
|  | 	mov	x3,#0 | ||||||
|  | 	mov	x8,#0x7c			// sched_yield() for linux | ||||||
|  | 	mov	x16,#0x85d			// select(0,0,0,0) for xnu | ||||||
| 	svc	0 | 	svc	0 | ||||||
| 	ret | 	ret | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -18,12 +18,14 @@ | ||||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||||
| #include "libc/sysv/consts/clone.h" | #include "libc/sysv/consts/clone.h" | ||||||
| #include "libc/assert.h" | #include "libc/assert.h" | ||||||
|  | #include "libc/atomic.h" | ||||||
| #include "libc/calls/calls.h" | #include "libc/calls/calls.h" | ||||||
| #include "libc/calls/struct/ucontext-netbsd.internal.h" | #include "libc/calls/struct/ucontext-netbsd.internal.h" | ||||||
| #include "libc/calls/syscall-sysv.internal.h" | #include "libc/calls/syscall-sysv.internal.h" | ||||||
| #include "libc/dce.h" | #include "libc/dce.h" | ||||||
| #include "libc/errno.h" | #include "libc/errno.h" | ||||||
| #include "libc/intrin/asan.internal.h" | #include "libc/intrin/asan.internal.h" | ||||||
|  | #include "libc/intrin/atomic.h" | ||||||
| #include "libc/intrin/describeflags.internal.h" | #include "libc/intrin/describeflags.internal.h" | ||||||
| #include "libc/intrin/kprintf.h" | #include "libc/intrin/kprintf.h" | ||||||
| #include "libc/intrin/strace.internal.h" | #include "libc/intrin/strace.internal.h" | ||||||
|  | @ -36,6 +38,7 @@ | ||||||
| #include "libc/runtime/clone.internal.h" | #include "libc/runtime/clone.internal.h" | ||||||
| #include "libc/runtime/internal.h" | #include "libc/runtime/internal.h" | ||||||
| #include "libc/runtime/runtime.h" | #include "libc/runtime/runtime.h" | ||||||
|  | #include "libc/runtime/syslib.internal.h" | ||||||
| #include "libc/sock/internal.h" | #include "libc/sock/internal.h" | ||||||
| #include "libc/stdalign.internal.h" | #include "libc/stdalign.internal.h" | ||||||
| #include "libc/str/str.h" | #include "libc/str/str.h" | ||||||
|  | @ -50,8 +53,6 @@ | ||||||
| #include "libc/thread/tls2.h" | #include "libc/thread/tls2.h" | ||||||
| #include "libc/thread/xnu.internal.h" | #include "libc/thread/xnu.internal.h" | ||||||
| 
 | 
 | ||||||
| #ifdef __x86_64__ |  | ||||||
| 
 |  | ||||||
| #define __NR_thr_new                      455 | #define __NR_thr_new                      455 | ||||||
| #define __NR_clone_linux                  56 | #define __NR_clone_linux                  56 | ||||||
| #define __NR__lwp_create                  309 | #define __NR__lwp_create                  309 | ||||||
|  | @ -62,10 +63,6 @@ | ||||||
| #define LWP_DETACHED                      0x00000040 | #define LWP_DETACHED                      0x00000040 | ||||||
| #define LWP_SUSPENDED                     0x00000080 | #define LWP_SUSPENDED                     0x00000080 | ||||||
| 
 | 
 | ||||||
| __msabi extern typeof(TlsSetValue) *const __imp_TlsSetValue; |  | ||||||
| __msabi extern typeof(ExitThread) *const __imp_ExitThread; |  | ||||||
| __msabi extern typeof(WakeByAddressAll) *const __imp_WakeByAddressAll; |  | ||||||
| 
 |  | ||||||
| struct CloneArgs { | struct CloneArgs { | ||||||
|   union { |   union { | ||||||
|     int tid; |     int tid; | ||||||
|  | @ -80,9 +77,15 @@ struct CloneArgs { | ||||||
|   void *arg; |   void *arg; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | #ifdef __x86_64__ | ||||||
|  | 
 | ||||||
| ////////////////////////////////////////////////////////////////////////////////
 | ////////////////////////////////////////////////////////////////////////////////
 | ||||||
| // THE NEW TECHNOLOGY
 | // THE NEW TECHNOLOGY
 | ||||||
| 
 | 
 | ||||||
|  | __msabi extern typeof(TlsSetValue) *const __imp_TlsSetValue; | ||||||
|  | __msabi extern typeof(ExitThread) *const __imp_ExitThread; | ||||||
|  | __msabi extern typeof(WakeByAddressAll) *const __imp_WakeByAddressAll; | ||||||
|  | 
 | ||||||
| int WinThreadLaunch(void *arg,                 // rdi
 | int WinThreadLaunch(void *arg,                 // rdi
 | ||||||
|                     int tid,                   // rsi
 |                     int tid,                   // rsi
 | ||||||
|                     int (*func)(void *, int),  // rdx
 |                     int (*func)(void *, int),  // rdx
 | ||||||
|  | @ -143,12 +146,12 @@ static textwindows errno_t CloneWindows(int (*func)(void *, int), char *stk, | ||||||
| ////////////////////////////////////////////////////////////////////////////////
 | ////////////////////////////////////////////////////////////////////////////////
 | ||||||
| // XNU'S NOT UNIX
 | // XNU'S NOT UNIX
 | ||||||
| 
 | 
 | ||||||
| void XnuThreadThunk(void *pthread,          // rdi
 | void XnuThreadThunk(void *pthread,          // rdi x0
 | ||||||
|                     int machport,           // rsi
 |                     int machport,           // rsi x1
 | ||||||
|                     void *(*func)(void *),  // rdx
 |                     void *(*func)(void *),  // rdx x2
 | ||||||
|                     void *arg,              // rcx
 |                     void *arg,              // rcx x3
 | ||||||
|                     intptr_t *stack,        // r8
 |                     intptr_t *stack,        // r8  x4
 | ||||||
|                     unsigned xnuflags);     // r9
 |                     unsigned xnuflags);     // r9  x5
 | ||||||
| asm("XnuThreadThunk:\n\t" | asm("XnuThreadThunk:\n\t" | ||||||
|     "xor\t%ebp,%ebp\n\t" |     "xor\t%ebp,%ebp\n\t" | ||||||
|     "mov\t%r8,%rsp\n\t" |     "mov\t%r8,%rsp\n\t" | ||||||
|  | @ -189,8 +192,7 @@ XnuThreadMain(void *pthread,                    // rdi | ||||||
|   //                                %r10 = uint32_t sem);
 |   //                                %r10 = uint32_t sem);
 | ||||||
|   asm volatile("movl\t$0,%0\n\t"         // *wt->ztid = 0
 |   asm volatile("movl\t$0,%0\n\t"         // *wt->ztid = 0
 | ||||||
|                "xor\t%%r10d,%%r10d\n\t"  // sem = 0
 |                "xor\t%%r10d,%%r10d\n\t"  // sem = 0
 | ||||||
|                "syscall\n\t"             // __bsdthread_terminate()
 |                "syscall"                 // __bsdthread_terminate()
 | ||||||
|                "ud2" |  | ||||||
|                : "=m"(*wt->ztid) |                : "=m"(*wt->ztid) | ||||||
|                : "a"(0x2000000 | 361), "D"(0), "S"(0), "d"(0L) |                : "a"(0x2000000 | 361), "D"(0), "S"(0), "d"(0L) | ||||||
|                : "rcx", "r10", "r11", "memory"); |                : "rcx", "r10", "r11", "memory"); | ||||||
|  | @ -430,6 +432,52 @@ static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz, | ||||||
| 
 | 
 | ||||||
| #endif /* __x86_64__ */ | #endif /* __x86_64__ */ | ||||||
| 
 | 
 | ||||||
|  | #ifdef __aarch64__ | ||||||
|  | 
 | ||||||
|  | ////////////////////////////////////////////////////////////////////////////////
 | ||||||
|  | // APPLE SILICON
 | ||||||
|  | 
 | ||||||
|  | static void *SiliconThreadMain(void *arg) { | ||||||
|  |   register struct CloneArgs *wt asm("x21") = arg; | ||||||
|  |   asm volatile("ldr\tx28,%0" : /* no outputs */ : "m"(wt->tls)); | ||||||
|  |   int tid = sys_gettid(); | ||||||
|  |   *wt->ctid = tid; | ||||||
|  |   *wt->ptid = tid; | ||||||
|  |   register long x0 asm("x0") = (long)wt->arg; | ||||||
|  |   register long x1 asm("x1") = (long)tid; | ||||||
|  |   asm volatile("mov\tx19,x29\n\t"  // save frame pointer
 | ||||||
|  |                "mov\tx20,sp\n\t"   // save stack pointer
 | ||||||
|  |                "mov\tx29,#0\n\t"   // reset backtrace
 | ||||||
|  |                "mov\tsp,x21\n\t"   // switch stack
 | ||||||
|  |                "blr\t%2\n\t"       // wt->func(wt->arg, tid)
 | ||||||
|  |                "mov\tx29,x19\n\t"  // restore frame pointer
 | ||||||
|  |                "mov\tsp,x20"       // restore stack pointer
 | ||||||
|  |                : "+r"(x0) | ||||||
|  |                : "r"(x1), "r"(wt->func) | ||||||
|  |                : "x19", "x20", "memory"); | ||||||
|  |   *wt->ztid = 0; | ||||||
|  |   return 0; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static errno_t CloneSilicon(int (*fn)(void *, int), char *stk, size_t stksz, | ||||||
|  |                             int flags, void *arg, void *tls, int *ptid, | ||||||
|  |                             int *ctid) { | ||||||
|  |   pthread_t th; | ||||||
|  |   struct CloneArgs *wt; | ||||||
|  |   wt = (struct CloneArgs *)(((intptr_t)(stk + stksz) - | ||||||
|  |                              sizeof(struct CloneArgs)) & | ||||||
|  |                             -MAX(16, alignof(struct CloneArgs))); | ||||||
|  |   wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid; | ||||||
|  |   wt->ptid = flags & CLONE_PARENT_SETTID ? ptid : &wt->tid; | ||||||
|  |   wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid; | ||||||
|  |   wt->tls = flags & CLONE_SETTLS ? tls : 0; | ||||||
|  |   wt->func = fn; | ||||||
|  |   wt->arg = arg; | ||||||
|  |   return __syslib->pthread_create(&th, 0, SiliconThreadMain, wt); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #endif /* __aarch64__ */ | ||||||
|  | 
 | ||||||
| ////////////////////////////////////////////////////////////////////////////////
 | ////////////////////////////////////////////////////////////////////////////////
 | ||||||
| // GNU/SYSTEMD
 | // GNU/SYSTEMD
 | ||||||
| 
 | 
 | ||||||
|  | @ -605,9 +653,15 @@ errno_t clone(void *func, void *stk, size_t stksz, int flags, void *arg, | ||||||
|                   CLONE_SIGHAND | CLONE_SYSVSEM)) { |                   CLONE_SIGHAND | CLONE_SYSVSEM)) { | ||||||
|     STRACE("cosmo clone() is picky about flags, see clone.c"); |     STRACE("cosmo clone() is picky about flags, see clone.c"); | ||||||
|     rc = EINVAL; |     rc = EINVAL; | ||||||
| #ifdef __x86_64__ |  | ||||||
|   } else if (IsXnu()) { |   } else if (IsXnu()) { | ||||||
|  | #ifdef __x86_64__ | ||||||
|     rc = CloneXnu(func, stk, stksz, flags, arg, tls, ptid, ctid); |     rc = CloneXnu(func, stk, stksz, flags, arg, tls, ptid, ctid); | ||||||
|  | #elif defined(__aarch64__) | ||||||
|  |     rc = CloneSilicon(func, stk, stksz, flags, arg, tls, ptid, ctid); | ||||||
|  | #else | ||||||
|  | #error "unsupported architecture" | ||||||
|  | #endif | ||||||
|  | #ifdef __x86_64__ | ||||||
|   } else if (IsFreebsd()) { |   } else if (IsFreebsd()) { | ||||||
|     rc = CloneFreebsd(func, stk, stksz, flags, arg, tls, ptid, ctid); |     rc = CloneFreebsd(func, stk, stksz, flags, arg, tls, ptid, ctid); | ||||||
|   } else if (IsNetbsd()) { |   } else if (IsNetbsd()) { | ||||||
|  |  | ||||||
							
								
								
									
										2
									
								
								third_party/ggml/llama.cc
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								third_party/ggml/llama.cc
									
										
									
									
										vendored
									
									
								
							|  | @ -1128,8 +1128,10 @@ static void llama_model_load_internal( | ||||||
|         const size_t mem_required_state = |         const size_t mem_required_state = | ||||||
|             scale*MEM_REQ_KV_SELF().at(model.type); |             scale*MEM_REQ_KV_SELF().at(model.type); | ||||||
| 
 | 
 | ||||||
|  |     if (verbose > 0) { | ||||||
|         fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__, |         fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__, | ||||||
|                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); |                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
| #ifdef GGML_USE_CUBLAS | #ifdef GGML_USE_CUBLAS | ||||||
|         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); |         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue