Get llama.com working on aarch64

This commit is contained in:
Justine Tunney 2023-05-09 22:41:57 -07:00
parent 4c093155a3
commit a0237a017c
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
19 changed files with 321 additions and 157 deletions

View file

@ -1361,7 +1361,6 @@ static privileged void AllowMmapExec(struct Filter *f) {
// The flags parameter must not have:
//
// - MAP_LOCKED (0x02000)
// - MAP_POPULATE (0x08000)
// - MAP_NONBLOCK (0x10000)
// - MAP_HUGETLB (0x40000)
//

View file

@ -57,6 +57,49 @@
#include "libc/thread/tls2.h"
#include "libc/vga/vga.internal.h"
// Fetches one integer vararg into x using the exact promoted type the
// caller passed, which matters on ABIs (e.g. aarch64) where the type
// given to va_arg must match the promoted argument type — unlike the
// old kgetint() helper, which always read an unsigned long slot.
//
//   x:  lvalue receiving the fetched (sign- or zero-extended) value
//   va: the va_list to advance
//   t:  width class: -3=bool, -2=char, -1=short, 0=int (default),
//       1=long, 2=long long
//   s:  true to sign-extend the sub-int classes
#define KGETINT(x, va, t, s)                 \
  switch (t) {                               \
    case -3:                                 \
      x = !!va_arg(va, int);                 \
      break;                                 \
    case -2:                                 \
      if (s) {                               \
        x = (signed char)va_arg(va, int);    \
      } else {                               \
        x = (unsigned char)va_arg(va, int);  \
      }                                      \
      break;                                 \
    case -1:                                 \
      if (s) {                               \
        x = (signed short)va_arg(va, int);   \
      } else {                               \
        x = (unsigned short)va_arg(va, int); \
      }                                      \
      break;                                 \
    case 0:                                  \
    default:                                 \
      if (s) {                               \
        x = va_arg(va, int);                 \
      } else {                               \
        x = va_arg(va, unsigned int);        \
      }                                      \
      break;                                 \
    case 1:                                  \
      if (s) {                               \
        x = va_arg(va, long);                \
      } else {                               \
        x = va_arg(va, unsigned long);       \
      }                                      \
      break;                                 \
    case 2:                                  \
      if (s) {                               \
        x = va_arg(va, long long);           \
      } else {                               \
        x = va_arg(va, unsigned long long);  \
      }                                      \
      break;                                 \
  }
extern _Hide struct SymbolTable *__symtab;
privileged static inline char *kadvance(char *p, char *e, long n) {
@ -80,23 +123,6 @@ privileged static char *kemitquote(char *p, char *e, signed char t,
return p;
}
// Pulls the next integer argument off the va_list, where t encodes the
// declared width (-3=bool, -2=char, -1=short, 0=int, 1=long, 2=llong,
// per the KGETINT width classes) and s selects sign- vs zero-extension
// for the sub-long cases.
//
// NOTE(review): this always calls va_arg(va, unsigned long) and then
// fixes up the width with shifts. That only works on ABIs where every
// integer argument is read back from a full GP-register-sized slot
// (x86-64 SysV); it is not a portable va_arg contract — e.g. aarch64
// requires the type passed to va_arg to match the promoted argument
// type, which is presumably why this is being replaced by KGETINT().
privileged static unsigned long long kgetint(va_list va, signed char t,
                                             bool s) {
  int bits;
  unsigned long long x;
  x = va_arg(va, unsigned long);
  if (t <= 0) {
    // count of high bits that are not part of the declared value
    bits = 64 - (32 >> MIN(5, -t));
    x <<= bits;
    if (s) {
      x = (signed long)x >> bits;  // arithmetic shift sign-extends
    } else {
      x >>= bits;  // logical shift zero-extends
    }
  }
  return x;
}
// Reports whether p falls inside [0x7f0000000000, 0x800000000000),
// the address range this runtime treats as kernel-ish pointers.
privileged static inline bool kiskernelpointer(const void *p) {
  intptr_t a = (intptr_t)p;
  return a >= 0x7f0000000000 && a < 0x800000000000;
}
@ -363,7 +389,7 @@ privileged static size_t kformat(char *b, size_t n, const char *fmt,
s = va_arg(va, int) ? "true" : "false";
goto FormatString;
}
x = kgetint(va, type, c == 'd');
KGETINT(x, va, type, c == 'd');
FormatDecimal:
if ((long long)x < 0 && c != 'u') {
x = -x;
@ -426,7 +452,7 @@ privileged static size_t kformat(char *b, size_t n, const char *fmt,
base = 1;
if (hash) hash = '0' | 'b' << 8;
BinaryNumber:
x = kgetint(va, type, false);
KGETINT(x, va, type, false);
FormatNumber:
i = 0;
m = (1 << base) - 1;

View file

@ -20,14 +20,9 @@
#include "libc/runtime/memtrack.internal.h"
#include "libc/thread/thread.h"
#ifdef __x86_64__
STATIC_YOINK("_init__mmi");
#endif
struct MemoryIntervals _mmi;
pthread_mutex_t __mmi_lock_obj; // recursive :'(
// Constructor that prepares the memory-interval tracker and its
// recursive lock exactly once, even if it gets invoked again later.
__attribute__((__constructor__)) void __mmi_init(void) {
  static bool once;
  if (!once) {
    once = true;
    _mmi.p = _mmi.s;
    _mmi.n = ARRAYLEN(_mmi.s);
    __mmi_lock_obj._type = PTHREAD_MUTEX_RECURSIVE;
  }
}

26
libc/intrin/mmi.init.S Normal file
View file

@ -0,0 +1,26 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2021 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/thread/thread.h"
#include "libc/macros.internal.h"
//	initializes the memory interval tracker early in the _init
//	chain; mirrors the old C __mmi_init() constructor, which set
//	_mmi.n, _mmi.p = _mmi.s, and made __mmi_lock_obj recursive.
//	NOTE(review): field offsets (+8, +16, +24, +4) are assumed to
//	match struct MemoryIntervals / pthread_mutex_t layout, and
//	OPEN_MAX is assumed equal to ARRAYLEN(_mmi.s) — confirm both.
	.init.start 200,_init__mmi
	movb	$OPEN_MAX,_mmi+8			// presumably _mmi.n
	movl	$_mmi+24,_mmi+16			// presumably _mmi.p = _mmi.s
	movb	$PTHREAD_MUTEX_RECURSIVE,__mmi_lock_obj+4(%rip)	// lock _type
	.init.end 200,_init__mmi

View file

@ -56,8 +56,6 @@ sys_clone_linux:
syscall
1: hlt // ctid was corrupted by program!
#elif defined(__aarch64__)
and x1,x1,#-16 // align stack
stp x5,x6,[x1,#-16]! // save func and arg
mov x8,x3 // swap x3 and x4
mov x3,x4 // swap x3 and x4
mov x4,x8 // swap x3 and x4
@ -65,8 +63,8 @@ sys_clone_linux:
svc #0
cbz x0,2f
ret
2: ldp x1,x0,[sp],#16 // child thread
blr x1
2: mov x0,x6 // child thread
blr x5
mov x8,#93 // __NR_exit
svc #0
#else

View file

@ -25,6 +25,7 @@
#include "libc/errno.h"
#include "libc/intrin/asan.internal.h"
#include "libc/intrin/describeflags.internal.h"
#include "libc/intrin/kprintf.h"
#include "libc/intrin/strace.internal.h"
#include "libc/limits.h"
#include "libc/macros.internal.h"
@ -452,7 +453,12 @@ static int CloneLinux(int (*func)(void *arg, int rc), char *stk, size_t stksz,
ctid = (int *)sp;
sp -= 8; // experiment
}
sp = sp & -16; // align the stack
// align the stack
#ifdef __aarch64__
sp = sp & -128; // for kernel 4.6 and earlier
#else
sp = sp & -16;
#endif
if ((rc = sys_clone_linux(flags, sp, ptid, ctid, tls, func, arg)) >= 0) {
// clone() is documented as setting ptid before return
return 0;
@ -577,6 +583,10 @@ errno_t clone(void *func, void *stk, size_t stksz, int flags, void *arg,
__enable_threads();
}
STRACE("clone(func=%t, stk=%p, stksz=%'zu, flags=%#x, arg=%p, ptid=%p, "
"tls=%p, ctid=%p)",
func, stk, stksz, flags, arg, ptid, tls, ctid);
if (!func) {
rc = EINVAL;
} else if (!IsTiny() &&

View file

@ -83,19 +83,19 @@ cosmo: push %rbp
call _init
// call constructors
ezlea __init_array_start,ax // static ctors in forward order
.weak __init_array_start // could be called multiple times
ezlea __init_array_end,cx // idempotency recommended
.weak __init_array_end // @see ape/ape.lds
ezlea __init_array_end,ax // static ctors in forward order
.weak __init_array_end // could be called multiple times
ezlea __init_array_start,cx // idempotency recommended
.weak __init_array_start // @see ape/ape.lds
1: cmp %rax,%rcx
je 2f
sub $8,%rax
push %rax
push %rcx
call .Largs
call *(%rax)
pop %rcx
pop %rax
add $8,%rax
jmp 1b
// call main()
@ -141,7 +141,6 @@ cosmo: push %rbp
push %rsi
// allocate stack
call __mmi_init
movabs $ape_stack_vaddr,%rdi
mov $ape_stack_memsz,%esi
mov $ape_stack_prot,%edx

View file

@ -16,8 +16,15 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/intrin/kprintf.h"
#include "libc/intrin/strace.internal.h"
#include "libc/macros.internal.h"
#include "libc/nexgen32e/rdtsc.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/memtrack.internal.h"
#include "libc/runtime/runtime.h"
#include "libc/thread/thread.h"
#include "libc/thread/tls.h"
#ifndef __x86_64__
int main(int, char **, char **) __attribute__((__weak__));
@ -40,35 +47,65 @@ typedef int init_f(int argc, char **argv, char **envp, unsigned long *auxv);
extern init_f __strace_init;
extern init_f *__init_array_start[] __attribute__((__weak__));
extern init_f *__init_array_end[] __attribute__((__weak__));
extern uintptr_t ape_idata_iat[] __attribute__((__weak__));
extern uintptr_t ape_idata_iatend[] __attribute__((__weak__));
extern pthread_mutex_t __mmi_lock_obj;
struct CosmoTib *tib;
void cosmo(long *sp) {
int argc;
init_f **fp;
uintptr_t *pp;
char **argv, **envp;
unsigned long *auxv;
// get startup timestamp as early as possible
// its used by --strace and also kprintf() %T
kStartTsc = rdtsc();
// extracts arguments from old sysv stack abi
argc = *sp;
argv = (char **)(sp + 1);
envp = (char **)(sp + 1 + argc + 1);
auxv = (unsigned long *)(sp + 1 + argc + 1);
for (;;) {
if (!*auxv++) {
break;
}
while (*auxv++) donothing;
// needed by kisdangerous()
__oldstack = (intptr_t)sp;
// make win32 imps noop
for (pp = ape_idata_iat; pp < ape_idata_iatend; ++pp) {
*pp = (uintptr_t)_missingno;
}
// initialize mmap() manager extremely early
_mmi.n = ARRAYLEN(_mmi.s);
_mmi.p = _mmi.s;
__mmi_lock_obj._type = PTHREAD_MUTEX_RECURSIVE;
#ifdef SYSDEBUG
// initialize --strace functionality
argc = __strace_init(argc, argv, envp, auxv);
#endif
// set helpful globals
__argc = argc;
__argv = argv;
__envp = envp;
__auxv = auxv;
environ = envp;
if (argc) program_invocation_name = argv[0];
// run initialization callbacks
_init();
for (fp = __init_array_start; fp < __init_array_end; ++fp) {
__enable_tls();
for (fp = __init_array_end; fp-- > __init_array_start;) {
(*fp)(argc, argv, envp, auxv);
}
// run program
exit(main(argc, argv, envp));
}
#endif /* __aarch64__ */
#endif /* __x86_64__ */

View file

@ -62,7 +62,9 @@ static privileged dontinline void FixupLockNops(void) {
// Switches the runtime into threaded mode (idempotent: returns at
// once if __threaded is already set). On x86_64, FixupLockNops() is
// presumably hot-patching lock-prefix nops before a second thread can
// exist — confirm against its definition.
// NOTE(review): the STRACE() call sits inside the #ifdef, so no
// strace line is emitted on non-x86 builds — verify that's intended.
void __enable_threads(void) {
  if (__threaded) return;
#ifdef __x86_64__
  STRACE("__enable_threads()");
  FixupLockNops();
#endif
  // nonzero tid doubles as the "threads enabled" flag
  __threaded = sys_gettid();
}

View file

@ -23,17 +23,15 @@
#include "libc/intrin/asancodes.h"
#include "libc/intrin/atomic.h"
#include "libc/intrin/weaken.h"
#include "libc/log/libfatal.internal.h"
#include "libc/macros.internal.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/runtime.h"
#include "libc/str/str.h"
#include "libc/thread/posixthread.internal.h"
#include "libc/thread/thread.h"
#include "libc/thread/tls.h"
#define _TLSZ ((intptr_t)_tls_size)
#define _TLDZ ((intptr_t)_tdata_size)
#define _TIBZ sizeof(struct CosmoTib)
#define I(x) ((uintptr_t)x)
extern unsigned char __tls_mov_nt_rax[];
extern unsigned char __tls_add_nt_rax[];
@ -41,20 +39,34 @@ extern unsigned char __tls_add_nt_rax[];
nsync_dll_list_ _pthread_list;
pthread_spinlock_t _pthread_lock;
static struct PosixThread _pthread_main;
_Alignas(TLS_ALIGNMENT) static char __static_tls[5008];
_Alignas(TLS_ALIGNMENT) static char __static_tls[6016];
/**
* Enables thread local storage for main process.
*
* %fs Linux/BSDs
* Here's the TLS memory layout on x86_64:
*
* __get_tls()
*
* _Thread_local __get_tls()
* %fs Linux/BSDs
* _Thread_local
*
* pad .tdata .tbss tib
*
*
* Windows/Mac %gs
*
* Here's the TLS memory layout on aarch64:
*
* %tpidr_el0
*
* _Thread_local
*
* tibdtv .tdata .tbss
*
*
* __get_tls()
*
* This function is always called by the core runtime to guarantee TLS
* is always available to your program. You must build your code using
* -mno-tls-direct-seg-refs if you want to use _Thread_local.
@ -81,10 +93,31 @@ _Alignas(TLS_ALIGNMENT) static char __static_tls[5008];
void __enable_tls(void) {
int tid;
size_t siz;
struct CosmoTib *tib;
char *mem, *tls;
struct CosmoTib *tib;
siz = ROUNDUP(_TLSZ + _TIBZ, _Alignof(__static_tls));
// Here's the layout we're currently using:
//
// .align PAGESIZE
// _tdata_start:
// .tdata
// _tdata_size = . - _tdata_start
// .align PAGESIZE
// _tbss_start:
// _tdata_start + _tbss_offset:
// .tbss
// .align TLS_ALIGNMENT
// _tbss_size = . - _tbss_start
// _tbss_end:
// _tbss_start + _tbss_size:
// _tdata_start + _tls_size:
//
_unassert(_tbss_start == _tdata_start + I(_tbss_offset));
_unassert(_tbss_start + I(_tbss_size) == _tdata_start + I(_tls_size));
#ifdef __x86_64__
siz = ROUNDUP(I(_tls_size) + sizeof(*tib), _Alignof(__static_tls));
if (siz <= sizeof(__static_tls)) {
// if tls requirement is small then use the static tls block
// which helps avoid a system call for appes with little tls
@ -103,14 +136,52 @@ void __enable_tls(void) {
if (IsAsan()) {
// poison the space between .tdata and .tbss
__asan_poison(mem + (intptr_t)_tdata_size,
(intptr_t)_tbss_offset - (intptr_t)_tdata_size,
__asan_poison(mem + I(_tdata_size), I(_tbss_offset) - I(_tdata_size),
kAsanProtected);
}
tib = (struct CosmoTib *)(mem + siz - sizeof(*tib));
tls = mem + siz - sizeof(*tib) - I(_tls_size);
#elif defined(__aarch64__)
siz = ROUNDUP(sizeof(*tib) + 2 * sizeof(void *) + I(_tls_size),
_Alignof(__static_tls));
if (siz <= sizeof(__static_tls)) {
mem = __static_tls;
} else {
_npassert(_weaken(_mapanon));
siz = ROUNDUP(siz, FRAMESIZE);
mem = _weaken(_mapanon)(siz);
_npassert(mem);
}
if (IsAsan()) {
// there's a roundup(pagesize) gap between .tdata and .tbss
// poison that empty space
__asan_poison(mem + sizeof(*tib) + 2 * sizeof(void *) + I(_tdata_size),
I(_tbss_offset) - I(_tdata_size), kAsanProtected);
}
tib = (struct CosmoTib *)mem;
tls = mem + sizeof(*tib) + 2 * sizeof(void *);
// Set the DTV.
//
// We don't support dynamic shared objects at the moment. The APE
// linker script will only produce a single PT_TLS program header
// therefore our job is relatively simple.
//
// @see musl/src/env/__init_tls.c
// @see https://chao-tic.github.io/blog/2018/12/25/tls
((uintptr_t *)tls)[-2] = 1;
((void **)tls)[-1] = tls;
#else
#error "unsupported architecture"
#endif /* __x86_64__ */
// initialize main thread tls memory
tib = (struct CosmoTib *)(mem + siz - _TIBZ);
tls = mem + siz - _TIBZ - _TLSZ;
tib->tib_self = tib;
tib->tib_self2 = tib;
tib->tib_errno = __errno;
@ -135,7 +206,9 @@ void __enable_tls(void) {
atomic_store_explicit(&_pthread_main.ptid, tid, memory_order_relaxed);
// copy in initialized data section
__repmovsb(tls, _tdata_start, _TLDZ);
if (I(_tdata_size)) {
memcpy(tls, _tdata_start, I(_tdata_size));
}
// ask the operating system to change the x86 segment register
__set_tls(tib);

View file

@ -27,6 +27,7 @@
int sys_set_tls();
void __set_tls(struct CosmoTib *tib) {
tib = __adj_tls(tib);
#ifdef __x86_64__
// ask the operating system to change the x86 segment register
int ax, dx;
@ -58,6 +59,6 @@ void __set_tls(struct CosmoTib *tib) {
"d"((uint32_t)(val >> 32)));
}
#else
asm volatile("msr\ttpidr_el0,%0" : /* no outputs */ : "r"(tib + 1));
asm volatile("msr\ttpidr_el0,%0" : /* no outputs */ : "r"(tib));
#endif
}

View file

@ -28,18 +28,30 @@
#include "libc/thread/spawn.h"
#include "libc/thread/tls.h"
#define I(x) ((intptr_t)x)
#define I(x) ((uintptr_t)x)
void Bzero(void *, size_t) asm("bzero"); // gcc bug
/**
* Allocates thread-local storage memory for new thread.
* @return buffer that must be released with free()
*/
char *_mktls(struct CosmoTib **out_tib) {
// Finishes a freshly allocated thread information block: zeroes it,
// points it at itself, inherits the trace flags and signal mask from
// the calling thread's tib, and marks the tid as unassigned (-1).
// Passes `mem` (the allocation start) back through for the caller.
static char *_mktls_finish(struct CosmoTib **out_tib, char *mem,
                           struct CosmoTib *tib) {
  struct CosmoTib *self = __get_tls();
  Bzero(tib, sizeof(*tib));
  tib->tib_self = tib;
  tib->tib_self2 = tib;
  tib->tib_ftrace = self->tib_ftrace;
  tib->tib_strace = self->tib_strace;
  tib->tib_sigmask = self->tib_sigmask;
  atomic_store_explicit(&tib->tib_tid, -1, memory_order_relaxed);
  if (out_tib) *out_tib = tib;
  return mem;
}
static char *_mktls_below(struct CosmoTib **out_tib) {
char *tls;
struct CosmoTib *neu, *old;
__require_tls();
struct CosmoTib *neu;
// allocate memory for tdata, tbss, and tib
tls = memalign(TLS_ALIGNMENT, I(_tls_size) + sizeof(struct CosmoTib));
@ -51,22 +63,67 @@ char *_mktls(struct CosmoTib **out_tib) {
kAsanProtected);
}
// initialize tdata and clear tbss
memmove(tls, _tdata_start, I(_tdata_size));
Bzero(tls + I(_tbss_offset), I(_tbss_size) + sizeof(struct CosmoTib));
// initialize .tdata
if (I(_tdata_size)) {
memmove(tls, _tdata_start, I(_tdata_size));
}
// clear .tbss
Bzero(tls + I(_tbss_offset), I(_tbss_size));
// set up thread information block
old = __get_tls();
neu = (struct CosmoTib *)(tls + I(_tls_size));
neu->tib_self = neu;
neu->tib_self2 = neu;
neu->tib_ftrace = old->tib_ftrace;
neu->tib_strace = old->tib_strace;
neu->tib_sigmask = old->tib_sigmask;
atomic_store_explicit(&neu->tib_tid, -1, memory_order_relaxed);
if (out_tib) {
*out_tib = neu;
}
return tls;
return _mktls_finish(out_tib, tls, (struct CosmoTib *)(tls + I(_tls_size)));
}
// Allocates tls memory with the tib placed *below* the tls data,
// i.e. the aarch64 layout used elsewhere in this file:
//
//     [tib][dtv: 2 words][.tdata ... .tbss]
//
// Returns the allocation start (release with free()) or NULL on
// allocation failure. (Fix: dropped the unused local `old`, which
// was declared but never read in this function.)
static char *_mktls_above(struct CosmoTib **out_tib) {
  size_t siz;
  char *mem, *dtv, *tls;
  struct CosmoTib *tib;

  // allocate one aligned slab holding tib + dtv + tdata + tbss
  siz = ROUNDUP(sizeof(struct CosmoTib) + 2 * sizeof(void *) + I(_tls_size),
                TLS_ALIGNMENT);
  mem = memalign(TLS_ALIGNMENT, siz);
  if (!mem) return 0;

  // carve the slab into its three regions
  tib = (struct CosmoTib *)mem;
  dtv = mem + sizeof(*tib);
  tls = dtv + 2 * sizeof(void *);

  // poison the alignment gap between .tdata and .tbss
  if (IsAsan()) {
    __asan_poison(tls + I(_tdata_size), I(_tbss_offset) - I(_tdata_size),
                  kAsanProtected);
  }

  // set the DTV; we only ever have a single PT_TLS module
  // (see musl __init_tls.c for the [generation, tls ptr] convention)
  ((uintptr_t *)dtv)[0] = 1;
  ((void **)dtv)[1] = tls;

  // initialize .tdata
  if (I(_tdata_size)) {
    memmove(tls, _tdata_start, I(_tdata_size));
  }

  // clear .tbss
  if (I(_tbss_size)) {
    Bzero(tls + I(_tbss_offset), I(_tbss_size));
  }

  // set up thread information block
  return _mktls_finish(out_tib, mem, tib);
}
/**
 * Allocates thread-local storage memory for new thread.
 * @return buffer that must be released with free()
 */
char *_mktls(struct CosmoTib **out_tib) {
  __require_tls();
#ifdef __x86_64__
  // x86_64 keeps the tib at the top, above .tdata/.tbss
  return _mktls_below(out_tib);
#else
  // aarch64 keeps the tib (and dtv) at the bottom, below the tls data
  return _mktls_above(out_tib);
#endif
}

View file

@ -255,9 +255,10 @@ static errno_t pthread_create_impl(pthread_t *thread,
if ((rc = clone(PosixThread, pt->attr.__stackaddr,
pt->attr.__stacksize - (IsOpenbsd() ? 16 : 0),
CLONE_VM | CLONE_THREAD | CLONE_FS | CLONE_FILES |
CLONE_SIGHAND | CLONE_SETTLS | CLONE_PARENT_SETTID |
CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
pt, &pt->ptid, pt->tib, &pt->tib->tib_tid))) {
CLONE_SIGHAND | CLONE_SYSVSEM | CLONE_SETTLS |
CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
CLONE_CHILD_CLEARTID,
pt, &pt->ptid, __adj_tls(pt->tib), &pt->tib->tib_tid))) {
pthread_spin_lock(&_pthread_lock);
_pthread_list = nsync_dll_remove_(_pthread_list, &pt->list);
pthread_spin_unlock(&_pthread_lock);

View file

@ -129,7 +129,7 @@ int _spawn(int fun(void *, int), void *arg, struct spawn *opt_out_thread) {
CLONE_VM | CLONE_THREAD | CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
CLONE_SETTLS | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
CLONE_CHILD_CLEARTID,
spawner, &th->ptid, th->tib, &th->tib->tib_tid);
spawner, &th->ptid, __adj_tls(th->tib), &th->tib->tib_tid);
if (rc) {
errno = rc;
_freestack(th->stk);

View file

@ -57,8 +57,10 @@ void __set_tls(struct CosmoTib *);
asm("mov\t%%fs:0,%0" : "=r"(_t) : /* no inputs */ : "memory"); \
_t; \
})
#else
#define __get_tls() ((struct CosmoTib *)__builtin_thread_pointer())
#define __adj_tls(tib) (tib)
#elif defined(__aarch64__)
#define __get_tls() ((struct CosmoTib *)__builtin_thread_pointer() - 1)
#define __adj_tls(tib) ((struct CosmoTib *)(tib) + 1)
#endif
COSMOPOLITAN_C_END_

View file

@ -37,7 +37,7 @@ static noasan inline void __set_tls_win32(void *tls) {
}
#elif defined(__aarch64__)
#define __get_tls_privileged() ((struct CosmoTib *)__builtin_thread_pointer())
#define __get_tls_privileged() __get_tls()
#define __get_tls_win32() ((struct CosmoTib *)0)
#define __set_tls_win32(tls) (void)0
#endif /* GNU x86-64 */

View file

@ -58,7 +58,7 @@ static inline uint64_t mul64(uint64_t a, uint64_t b)
*/
double sqrt(double x)
{
#ifdef __SSE2__
#if defined(__x86_64__) && defined(__SSE2__)
asm("sqrtsd\t%1,%0" : "=x"(x) : "x"(x));
return x;
@ -218,5 +218,5 @@ double sqrt(double x)
}
return y;
#endif /* __SSE2__ */
#endif /* __x86_64__ */
}

View file

@ -16,12 +16,12 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/intrin/kprintf.h"
#include "libc/calls/calls.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/fmt/fmt.h"
#include "libc/intrin/bits.h"
#include "libc/intrin/kprintf.h"
#include "libc/limits.h"
#include "libc/log/libfatal.internal.h"
#include "libc/macros.internal.h"

View file

@ -47,69 +47,7 @@ GGML (MIT License)\\n\
Copyright (c) 2023 Georgi Gerganov\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
#if defined(_WIN32)
typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;
// Emulates C11 atomic_store() on the Win32 build using
// InterlockedExchange() (the exchanged-out previous value is
// discarded).
static void atomic_store(atomic_int* ptr, LONG val) {
    InterlockedExchange(ptr, val);
}
// Emulates C11 atomic_load() on Win32 via a compare-exchange with
// equal comparand and desired value (0, 0), which reads *ptr without
// ever changing it.
static LONG atomic_load(atomic_int* ptr) {
    return InterlockedCompareExchange(ptr, 0, 0);
}
// Emulates C11 atomic_fetch_add() on Win32; returns the value *ptr
// held before the addition, matching the C11 contract.
static LONG atomic_fetch_add(atomic_int* ptr, LONG inc) {
    return InterlockedExchangeAdd(ptr, inc);
}
// Emulates C11 atomic_fetch_sub() on Win32 as an atomic add of the
// negated operand; returns the prior value of *ptr.
static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) {
    return atomic_fetch_add(ptr, -(dec));
}
typedef HANDLE pthread_t;
typedef DWORD thread_ret_t;
// Minimal pthread_create() shim over Win32 CreateThread().
// The attribute parameter is ignored. Returns 0 on success, or
// EAGAIN if the thread could not be created.
static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
    (void) unused;
    // thread_ret_t is DWORD here, so the cast to LPTHREAD_START_ROUTINE
    // matches CreateThread's expected DWORD(*)(LPVOID) signature
    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
    if (handle == NULL)
    {
        return EAGAIN;
    }
    *out = handle;
    return 0;
}
// pthread_join() shim: blocks until the thread's handle is signaled;
// the thread's return value is discarded (retval param ignored).
// NOTE(review): returns WaitForSingleObject's status code, not a
// pthreads errno value — callers must only test for nonzero failure.
static int pthread_join(pthread_t thread, void* unused) {
    (void) unused;
    return (int) WaitForSingleObject(thread, INFINITE);
}
// sched_yield() shim: Sleep(0) cedes the remainder of the current
// time slice to any other ready thread, per Win32 Sleep semantics.
static int sched_yield (void) {
    Sleep (0);
    return 0;
}
#else
typedef void* thread_ret_t;
#endif
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
#ifndef __FMA__
#define __FMA__
#endif
#ifndef __F16C__
#define __F16C__
#endif
#ifndef __SSE3__
#define __SSE3__
#endif
#endif
#ifdef __HAIKU__
#define static_assert(cond, msg) _Static_assert(cond, msg)
#endif
/*#define GGML_PERF*/
#define GGML_DEBUG 0