Moving all the threading operations to its own file
This commit is contained in:
parent
db0aba2d53
commit
b414599a72
5 changed files with 414 additions and 362 deletions
|
@ -1221,7 +1221,7 @@ add_library(ggml OBJECT
|
||||||
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
|
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
|
||||||
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
|
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
|
||||||
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
|
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
|
||||||
)
|
"ggml-threading.cpp" "ggml-threading.h")
|
||||||
|
|
||||||
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
|
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
|
||||||
target_compile_features (ggml PUBLIC c_std_11) # don't bump
|
target_compile_features (ggml PUBLIC c_std_11) # don't bump
|
||||||
|
|
280
ggml-threading.cpp
Normal file
280
ggml-threading.cpp
Normal file
|
@ -0,0 +1,280 @@
|
||||||
|
#include "ggml-threading.h"
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#define GGML_UNUSED(x) (void)(x)
|
||||||
|
|
||||||
|
//
// NUMA support
//

// upper bounds for the sysfs node/cpu scan in ggml_numa_init()
#define GGML_NUMA_MAX_NODES 8
#define GGML_NUMA_MAX_CPUS 512

// one NUMA node: the set of hardware threads that belong to it
struct ggml_numa_node {
    uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
    uint32_t n_cpus;                   // number of valid entries in cpus[]
};

// process-wide NUMA topology, filled in by ggml_numa_init()
struct ggml_numa_nodes {
    enum ggml_numa_strategy numa_strategy;            // thread-pinning strategy selected at init
    struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; // per-node CPU lists
    uint32_t n_nodes;      // nodes detected; 0 means "not initialized / NUMA disabled"
    uint32_t total_cpus;   // hardware threads on system
    uint32_t current_node; // node on which main process is executing
#if defined(__gnu_linux__)
    cpu_set_t cpuset; // cpuset from numactl
#else
    uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
#endif
};

// global NUMA state shared by the functions in this file
struct ggml_numa_nodes g_state_numa;
|
||||||
|
|
||||||
|
#if defined(__gnu_linux__)
// query the cpuset the current thread is allowed to run on
// (i.e. the affinity mask that numactl/taskset handed to the process)
static cpu_set_t ggml_get_numa_affinity(void) {
    cpu_set_t cpuset;
    pthread_t thread;
    thread = pthread_self();
    CPU_ZERO(&cpuset);
    // NOTE(review): return value ignored - on failure cpuset stays all-zero
    pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
    return cpuset;
}
#else
// no NUMA support outside of Linux at this time; keep the call site portable
static uint32_t ggml_get_numa_affinity(void) {
    return 0; // no NUMA support
}
#endif
|
||||||
|
|
||||||
|
void ggml_numa_zero() {
|
||||||
|
g_state_numa.n_nodes = 0;
|
||||||
|
g_state_numa.total_cpus = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Detect the system's NUMA topology (Linux only, via sysfs) and record it in
// g_state_numa; numa_flag selects the thread-pinning strategy later applied by
// set_numa_thread_affinity(). Repeated calls after a successful init are no-ops.
void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
    if (g_state_numa.n_nodes > 0) {
        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");

        return;
    }

#if defined(__gnu_linux__)
    struct stat st;
    char path[256];
    int rv;

    // set numa scheme
    g_state_numa.numa_strategy = numa_flag;

    GGML_PRINT_DEBUG("numa strategy %u\n", g_state_numa.numa_strategy);

    g_state_numa.cpuset = ggml_get_numa_affinity();

    // enumerate nodes: count existing /sys/devices/system/node/node<N> entries
    while (g_state_numa.n_nodes < GGML_NUMA_MAX_NODES) {
        rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state_numa.n_nodes);
        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        if (stat(path, &st) != 0) { break; }
        ++g_state_numa.n_nodes;
    }

    // enumerate CPUs: count existing /sys/devices/system/cpu/cpu<N> entries
    while (g_state_numa.total_cpus < GGML_NUMA_MAX_CPUS) {
        rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state_numa.total_cpus);
        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        if (stat(path, &st) != 0) { break; }
        ++g_state_numa.total_cpus;
    }

    GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state_numa.n_nodes, g_state_numa.total_cpus);

    // figure out which node we're on
    // NOTE(review): `uint` is a non-standard alias (from <sys/types.h>);
    // consider uint32_t here - getcpu() expects unsigned int
    uint current_cpu;
    int getcpu_ret = 0;
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__)
    getcpu_ret = getcpu(&current_cpu, &g_state_numa.current_node);
#else
    // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
#   if !defined(SYS_getcpu) && defined(SYS_get_cpu)
#       define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
#   endif
    getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state_numa.current_node);
#endif

    // if the topology could not be determined, disable NUMA handling entirely
    if (g_state_numa.n_nodes < 1 || g_state_numa.total_cpus < 1 || getcpu_ret != 0) {
        g_state_numa.n_nodes = 0;
        return;
    }

    GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state_numa.current_node, current_cpu);

    // record which CPUs belong to each node
    for (uint32_t n = 0; n < g_state_numa.n_nodes; ++n) {
        struct ggml_numa_node* node = &g_state_numa.nodes[n];
        GGML_PRINT_DEBUG("CPUs on node %u:", n);
        node->n_cpus = 0;
        for (uint32_t c = 0; c < g_state_numa.total_cpus; ++c) {
            rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
            GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
            if (stat(path, &st) == 0) {
                node->cpus[node->n_cpus++] = c;
                GGML_PRINT_DEBUG(" %u", c);
            }
        }
        GGML_PRINT_DEBUG("\n");
    }

    if (ggml_is_numa()) {
        // automatic NUMA balancing migrates pages behind our back; warn if on
        FILE* fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
        if (fptr != NULL) {
            char buf[42];
            if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
                GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
            }
            fclose(fptr);
        }
    }
#else
    GGML_UNUSED(numa_flag);
    // TODO
#endif
}
|
||||||
|
|
||||||
|
bool ggml_is_numa(void) {
|
||||||
|
return g_state_numa.n_nodes > 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////

#if defined(_WIN32)

// C11-atomics-style wrappers built on Win32 Interlocked* intrinsics
// (declared in ggml-threading.h; MSVC has no <stdatomic.h> for C)
void atomic_store(atomic_int* ptr, LONG val) {
    InterlockedExchange(ptr, val);
}
LONG atomic_load(atomic_int* ptr) {
    // compare-exchange with equal comparand/exchange leaves *ptr unchanged
    // and returns its current value
    return InterlockedCompareExchange(ptr, 0, 0);
}
LONG atomic_fetch_add(atomic_int* ptr, LONG inc) {
    return InterlockedExchangeAdd(ptr, inc);
}
LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) {
    return atomic_fetch_add(ptr, -(dec));
}

// minimal pthread shim on top of Win32 threads; `unused` mirrors the
// pthread_attr_t parameter and is ignored
int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
    (void)unused;
    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, arg, 0, NULL);
    if (handle == NULL)
    {
        return EAGAIN;
    }

    *out = handle;
    return 0;
}

int pthread_join(pthread_t thread, void* unused) {
    (void)unused;
    // NOTE(review): returns the WaitForSingleObject code, which is 0
    // (WAIT_OBJECT_0) on success - matching pthread_join's 0-on-success
    int ret = (int)WaitForSingleObject(thread, INFINITE);
    CloseHandle(thread);
    return ret;
}

// yield the remainder of this thread's time slice
int sched_yield(void) {
    Sleep(0);
    return 0;
}

#endif
|
||||||
|
|
||||||
|
// barrier via spin lock for g_state
static atomic_int g_state_barrier = 0;

// Enter the global critical section. Our increment "wins" only when it
// observes the counter at zero (nobody else inside); otherwise undo the
// increment, yield, and retry.
void ggml_critical_section_start(void) {
    while (atomic_fetch_add(&g_state_barrier, 1) > 0) {
        // wait for other threads to finish
        atomic_fetch_sub(&g_state_barrier, 1);
        sched_yield(); // TODO: reconsider this
    }
}

// TODO: make this somehow automatically executed
// some sort of "sentry" mechanism
void ggml_critical_section_end(void) {
    atomic_fetch_sub(&g_state_barrier, 1);
}
|
||||||
|
|
||||||
|
// Android's libc implementation "bionic" does not support setting affinity
|
||||||
|
#if defined(__gnu_linux__)
|
||||||
|
void set_numa_thread_affinity(int thread_n) {
|
||||||
|
if (!ggml_is_numa()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int node_num;
|
||||||
|
int rv;
|
||||||
|
size_t setsize = CPU_ALLOC_SIZE(g_state_numa.total_cpus);
|
||||||
|
|
||||||
|
switch (g_state_numa.numa_strategy) {
|
||||||
|
case GGML_NUMA_STRATEGY_DISTRIBUTE:
|
||||||
|
// run thread on node_num thread_n / (threads per node)
|
||||||
|
node_num = thread_n % g_state_numa.n_nodes;
|
||||||
|
break;
|
||||||
|
case GGML_NUMA_STRATEGY_ISOLATE:
|
||||||
|
// run thread on current_node
|
||||||
|
node_num = g_state_numa.current_node;
|
||||||
|
break;
|
||||||
|
case GGML_NUMA_STRATEGY_NUMACTL:
|
||||||
|
// use the cpuset that numactl gave us
|
||||||
|
rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state_numa.cpuset);
|
||||||
|
if (rv) {
|
||||||
|
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
default:
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_numa_node* node = &g_state_numa.nodes[node_num];
|
||||||
|
|
||||||
|
cpu_set_t* cpus = CPU_ALLOC(g_state_numa.total_cpus);
|
||||||
|
CPU_ZERO_S(setsize, cpus);
|
||||||
|
for (size_t i = 0; i < node->n_cpus; ++i) {
|
||||||
|
CPU_SET_S(node->cpus[i], setsize, cpus);
|
||||||
|
}
|
||||||
|
|
||||||
|
rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
|
||||||
|
if (rv) {
|
||||||
|
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
|
||||||
|
}
|
||||||
|
|
||||||
|
CPU_FREE(cpus);
|
||||||
|
}
|
||||||
|
|
||||||
|
void clear_numa_thread_affinity(void) {
|
||||||
|
if (!ggml_is_numa()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t setsize = CPU_ALLOC_SIZE(g_state_numa.total_cpus);
|
||||||
|
|
||||||
|
cpu_set_t* cpus = CPU_ALLOC(g_state_numa.total_cpus);
|
||||||
|
CPU_ZERO_S(setsize, cpus);
|
||||||
|
for (unsigned i = 0; i < g_state_numa.total_cpus; ++i) {
|
||||||
|
CPU_SET_S(i, setsize, cpus);
|
||||||
|
}
|
||||||
|
|
||||||
|
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
|
||||||
|
if (rv) {
|
||||||
|
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
|
||||||
|
}
|
||||||
|
|
||||||
|
CPU_FREE(cpus);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
// TODO: Windows etc.
|
||||||
|
// (the linux implementation may also work on BSD, someone should test)
|
||||||
|
void set_numa_thread_affinity(int thread_n) { GGML_UNUSED(thread_n); }
|
||||||
|
void clear_numa_thread_affinity(void) {}
|
||||||
|
#endif
|
120
ggml-threading.h
Normal file
120
ggml-threading.h
Normal file
|
@ -0,0 +1,120 @@
|
||||||
|
// ggml-threading.h: portable threading/atomics shims and NUMA thread-affinity
// helpers shared by the ggml core. On Windows a tiny pthread/stdatomic
// emulation layer is declared here (implemented in ggml-threading.cpp);
// elsewhere the real <pthread.h>/<stdatomic.h> are used.
//
// Fixes vs. the original: added an include guard; moved `extern "C" {` out of
// the `#if defined(_WIN32)` branch (it was only opened on Windows but closed
// unconditionally at the bottom, breaking every non-Windows C++ build) while
// keeping system includes outside the C-linkage region; `ggml_numa_zero`
// prototype now uses (void).
#pragma once

#include <stdint.h>
#include <llama.h> // for enum ggml_numa_strategy

#if defined(_WIN32)

#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>

#else

#include <pthread.h>
#include <stdatomic.h>

typedef void* thread_ret_t;

#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#endif

#ifdef __cplusplus
extern "C" {
#endif

#if defined(_WIN32)

// minimal C11-atomics lookalikes on top of Win32 Interlocked* (see .cpp)
typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;

extern void atomic_store(atomic_int* ptr, LONG val);
extern LONG atomic_load(atomic_int* ptr);
extern LONG atomic_fetch_add(atomic_int* ptr, LONG inc);
extern LONG atomic_fetch_sub(atomic_int* ptr, LONG dec);

// minimal pthread lookalikes on top of Win32 threads (see .cpp)
typedef HANDLE pthread_t;

typedef DWORD thread_ret_t;

extern int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg);
extern int pthread_join(pthread_t thread, void* unused);

extern int sched_yield(void);

#endif

typedef pthread_t ggml_thread_t;

// barrier via spin lock
extern void ggml_critical_section_start(void);
extern void ggml_critical_section_end(void);

// reset the global NUMA state (called once from ggml_init)
extern void ggml_numa_zero(void);

// detect the NUMA topology and select a thread-pinning strategy
extern void ggml_numa_init(enum ggml_numa_strategy numa_flag);

//
// thread data
//
// synchronization is done via busy loops
// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops
//

// NOTE(review): the ggml_lock_* macros below expand to UNUSED(x), but UNUSED
// is not defined in this header - they rely on the including .c defining it.

#ifdef __APPLE__

//#include <os/lock.h>
//
//typedef os_unfair_lock ggml_lock_t;
//
//#define ggml_lock_init(x)    UNUSED(x)
//#define ggml_lock_destroy(x) UNUSED(x)
//#define ggml_lock_lock       os_unfair_lock_lock
//#define ggml_lock_unlock     os_unfair_lock_unlock
//
//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT

typedef int ggml_lock_t;

#define ggml_lock_init(x)    UNUSED(x)
#define ggml_lock_destroy(x) UNUSED(x)
#define ggml_lock_lock(x)    UNUSED(x)
#define ggml_lock_unlock(x)  UNUSED(x)

#define GGML_LOCK_INITIALIZER 0

#define ggml_thread_create pthread_create
#define ggml_thread_join   pthread_join

#else

//typedef pthread_spinlock_t ggml_lock_t;

//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE)
//#define ggml_lock_destroy pthread_spin_destroy
//#define ggml_lock_lock    pthread_spin_lock
//#define ggml_lock_unlock  pthread_spin_unlock

typedef int ggml_lock_t;

#define ggml_lock_init(x)    UNUSED(x)
#define ggml_lock_destroy(x) UNUSED(x)
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
#define ggml_lock_lock(x)    _mm_pause()
#else
#define ggml_lock_lock(x)    UNUSED(x)
#endif
#define ggml_lock_unlock(x)  UNUSED(x)

#define GGML_LOCK_INITIALIZER 0

#define ggml_thread_create pthread_create
#define ggml_thread_join   pthread_join

#endif

// pin/unpin the calling thread according to the active NUMA strategy
extern void set_numa_thread_affinity(int thread_n);
extern void clear_numa_thread_affinity(void);

#ifdef __cplusplus
}
#endif
|
354
ggml.c
354
ggml.c
|
@ -50,69 +50,7 @@
|
||||||
#pragma warning(disable: 4996)
|
#pragma warning(disable: 4996)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(_WIN32)
|
#include "ggml-threading.h"
|
||||||
|
|
||||||
#define WIN32_LEAN_AND_MEAN
|
|
||||||
#ifndef NOMINMAX
|
|
||||||
#define NOMINMAX
|
|
||||||
#endif
|
|
||||||
#include <windows.h>
|
|
||||||
|
|
||||||
typedef volatile LONG atomic_int;
|
|
||||||
typedef atomic_int atomic_bool;
|
|
||||||
|
|
||||||
static void atomic_store(atomic_int * ptr, LONG val) {
|
|
||||||
InterlockedExchange(ptr, val);
|
|
||||||
}
|
|
||||||
static LONG atomic_load(atomic_int * ptr) {
|
|
||||||
return InterlockedCompareExchange(ptr, 0, 0);
|
|
||||||
}
|
|
||||||
static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
|
|
||||||
return InterlockedExchangeAdd(ptr, inc);
|
|
||||||
}
|
|
||||||
static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
|
|
||||||
return atomic_fetch_add(ptr, -(dec));
|
|
||||||
}
|
|
||||||
|
|
||||||
typedef HANDLE pthread_t;
|
|
||||||
|
|
||||||
typedef DWORD thread_ret_t;
|
|
||||||
static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) {
|
|
||||||
(void) unused;
|
|
||||||
HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
|
|
||||||
if (handle == NULL)
|
|
||||||
{
|
|
||||||
return EAGAIN;
|
|
||||||
}
|
|
||||||
|
|
||||||
*out = handle;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int pthread_join(pthread_t thread, void * unused) {
|
|
||||||
(void) unused;
|
|
||||||
int ret = (int) WaitForSingleObject(thread, INFINITE);
|
|
||||||
CloseHandle(thread);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int sched_yield (void) {
|
|
||||||
Sleep (0);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
#include <pthread.h>
|
|
||||||
#include <stdatomic.h>
|
|
||||||
|
|
||||||
typedef void * thread_ret_t;
|
|
||||||
|
|
||||||
#include <sys/types.h>
|
|
||||||
#include <sys/stat.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef pthread_t ggml_thread_t;
|
|
||||||
|
|
||||||
#ifdef GGML_USE_CPU_HBM
|
#ifdef GGML_USE_CPU_HBM
|
||||||
#include <hbwmalloc.h>
|
#include <hbwmalloc.h>
|
||||||
|
@ -2847,30 +2785,7 @@ static void ggml_setup_op_has_task_pass(void) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
// NUMA support
|
|
||||||
//
|
|
||||||
|
|
||||||
#define GGML_NUMA_MAX_NODES 8
|
|
||||||
#define GGML_NUMA_MAX_CPUS 512
|
|
||||||
|
|
||||||
struct ggml_numa_node {
|
|
||||||
uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
|
|
||||||
uint32_t n_cpus;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ggml_numa_nodes {
|
|
||||||
enum ggml_numa_strategy numa_strategy;
|
|
||||||
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
|
|
||||||
uint32_t n_nodes;
|
|
||||||
uint32_t total_cpus; // hardware threads on system
|
|
||||||
uint32_t current_node; // node on which main process is execting
|
|
||||||
#if defined(__gnu_linux__)
|
|
||||||
cpu_set_t cpuset; // cpuset from numactl
|
|
||||||
#else
|
|
||||||
uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
|
|
||||||
#endif
|
|
||||||
};
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// ggml state
|
// ggml state
|
||||||
|
@ -2878,139 +2793,10 @@ struct ggml_numa_nodes {
|
||||||
|
|
||||||
struct ggml_state {
|
struct ggml_state {
|
||||||
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
|
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
|
||||||
struct ggml_numa_nodes numa;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// global state
|
// global state
|
||||||
static struct ggml_state g_state;
|
static struct ggml_state g_state;
|
||||||
static atomic_int g_state_barrier = 0;
|
|
||||||
|
|
||||||
// barrier via spin lock
|
|
||||||
inline static void ggml_critical_section_start(void) {
|
|
||||||
int processing = atomic_fetch_add(&g_state_barrier, 1);
|
|
||||||
|
|
||||||
while (processing > 0) {
|
|
||||||
// wait for other threads to finish
|
|
||||||
atomic_fetch_sub(&g_state_barrier, 1);
|
|
||||||
sched_yield(); // TODO: reconsider this
|
|
||||||
processing = atomic_fetch_add(&g_state_barrier, 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: make this somehow automatically executed
|
|
||||||
// some sort of "sentry" mechanism
|
|
||||||
inline static void ggml_critical_section_end(void) {
|
|
||||||
atomic_fetch_sub(&g_state_barrier, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(__gnu_linux__)
|
|
||||||
static cpu_set_t ggml_get_numa_affinity(void) {
|
|
||||||
cpu_set_t cpuset;
|
|
||||||
pthread_t thread;
|
|
||||||
thread = pthread_self();
|
|
||||||
CPU_ZERO(&cpuset);
|
|
||||||
pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
|
|
||||||
return cpuset;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
static uint32_t ggml_get_numa_affinity(void) {
|
|
||||||
return 0; // no NUMA support
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
|
|
||||||
if (g_state.numa.n_nodes > 0) {
|
|
||||||
fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(__gnu_linux__)
|
|
||||||
struct stat st;
|
|
||||||
char path[256];
|
|
||||||
int rv;
|
|
||||||
|
|
||||||
// set numa scheme
|
|
||||||
g_state.numa.numa_strategy = numa_flag;
|
|
||||||
|
|
||||||
GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
|
|
||||||
|
|
||||||
g_state.numa.cpuset = ggml_get_numa_affinity();
|
|
||||||
|
|
||||||
// enumerate nodes
|
|
||||||
while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
|
|
||||||
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
|
|
||||||
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
|
|
||||||
if (stat(path, &st) != 0) { break; }
|
|
||||||
++g_state.numa.n_nodes;
|
|
||||||
}
|
|
||||||
|
|
||||||
// enumerate CPUs
|
|
||||||
while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
|
|
||||||
rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
|
|
||||||
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
|
|
||||||
if (stat(path, &st) != 0) { break; }
|
|
||||||
++g_state.numa.total_cpus;
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
|
|
||||||
|
|
||||||
// figure out which node we're on
|
|
||||||
uint current_cpu;
|
|
||||||
int getcpu_ret = 0;
|
|
||||||
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__)
|
|
||||||
getcpu_ret = getcpu(¤t_cpu, &g_state.numa.current_node);
|
|
||||||
#else
|
|
||||||
// old glibc doesn't have a wrapper for this call. Fall back on direct syscall
|
|
||||||
# if !defined(SYS_getcpu) && defined(SYS_get_cpu)
|
|
||||||
# define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
|
|
||||||
# endif
|
|
||||||
getcpu_ret = syscall(SYS_getcpu, ¤t_cpu, &g_state.numa.current_node);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
|
|
||||||
g_state.numa.n_nodes = 0;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
|
|
||||||
|
|
||||||
for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
|
|
||||||
struct ggml_numa_node * node = &g_state.numa.nodes[n];
|
|
||||||
GGML_PRINT_DEBUG("CPUs on node %u:", n);
|
|
||||||
node->n_cpus = 0;
|
|
||||||
for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
|
|
||||||
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
|
|
||||||
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
|
|
||||||
if (stat(path, &st) == 0) {
|
|
||||||
node->cpus[node->n_cpus++] = c;
|
|
||||||
GGML_PRINT_DEBUG(" %u", c);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
GGML_PRINT_DEBUG("\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ggml_is_numa()) {
|
|
||||||
FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
|
|
||||||
if (fptr != NULL) {
|
|
||||||
char buf[42];
|
|
||||||
if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
|
|
||||||
GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
|
|
||||||
}
|
|
||||||
fclose(fptr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
GGML_UNUSED(numa_flag);
|
|
||||||
// TODO
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ggml_is_numa(void) {
|
|
||||||
return g_state.numa.n_nodes > 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
void ggml_print_object(const struct ggml_object * obj) {
|
void ggml_print_object(const struct ggml_object * obj) {
|
||||||
GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
|
GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
|
||||||
|
@ -3342,12 +3128,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
||||||
|
|
||||||
g_state = (struct ggml_state) {
|
g_state = (struct ggml_state) {
|
||||||
/*.contexts =*/ { { 0 } },
|
/*.contexts =*/ { { 0 } },
|
||||||
/*.numa =*/ {
|
|
||||||
.n_nodes = 0,
|
|
||||||
.total_cpus = 0,
|
|
||||||
},
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
ggml_numa_zero();
|
||||||
|
|
||||||
for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
|
for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
|
||||||
g_state.contexts[i].used = false;
|
g_state.contexts[i].used = false;
|
||||||
}
|
}
|
||||||
|
@ -18893,138 +18677,6 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
|
||||||
memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
|
memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
// thread data
|
|
||||||
//
|
|
||||||
// synchronization is done via busy loops
|
|
||||||
// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops
|
|
||||||
//
|
|
||||||
|
|
||||||
#ifdef __APPLE__
|
|
||||||
|
|
||||||
//#include <os/lock.h>
|
|
||||||
//
|
|
||||||
//typedef os_unfair_lock ggml_lock_t;
|
|
||||||
//
|
|
||||||
//#define ggml_lock_init(x) UNUSED(x)
|
|
||||||
//#define ggml_lock_destroy(x) UNUSED(x)
|
|
||||||
//#define ggml_lock_lock os_unfair_lock_lock
|
|
||||||
//#define ggml_lock_unlock os_unfair_lock_unlock
|
|
||||||
//
|
|
||||||
//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT
|
|
||||||
|
|
||||||
typedef int ggml_lock_t;
|
|
||||||
|
|
||||||
#define ggml_lock_init(x) UNUSED(x)
|
|
||||||
#define ggml_lock_destroy(x) UNUSED(x)
|
|
||||||
#define ggml_lock_lock(x) UNUSED(x)
|
|
||||||
#define ggml_lock_unlock(x) UNUSED(x)
|
|
||||||
|
|
||||||
#define GGML_LOCK_INITIALIZER 0
|
|
||||||
|
|
||||||
#define ggml_thread_create pthread_create
|
|
||||||
#define ggml_thread_join pthread_join
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
//typedef pthread_spinlock_t ggml_lock_t;
|
|
||||||
|
|
||||||
//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE)
|
|
||||||
//#define ggml_lock_destroy pthread_spin_destroy
|
|
||||||
//#define ggml_lock_lock pthread_spin_lock
|
|
||||||
//#define ggml_lock_unlock pthread_spin_unlock
|
|
||||||
|
|
||||||
typedef int ggml_lock_t;
|
|
||||||
|
|
||||||
#define ggml_lock_init(x) UNUSED(x)
|
|
||||||
#define ggml_lock_destroy(x) UNUSED(x)
|
|
||||||
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
|
|
||||||
#define ggml_lock_lock(x) _mm_pause()
|
|
||||||
#else
|
|
||||||
#define ggml_lock_lock(x) UNUSED(x)
|
|
||||||
#endif
|
|
||||||
#define ggml_lock_unlock(x) UNUSED(x)
|
|
||||||
|
|
||||||
#define GGML_LOCK_INITIALIZER 0
|
|
||||||
|
|
||||||
#define ggml_thread_create pthread_create
|
|
||||||
#define ggml_thread_join pthread_join
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Android's libc implementation "bionic" does not support setting affinity
|
|
||||||
#if defined(__gnu_linux__)
|
|
||||||
static void set_numa_thread_affinity(int thread_n) {
|
|
||||||
if (!ggml_is_numa()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
int node_num;
|
|
||||||
int rv;
|
|
||||||
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
|
|
||||||
|
|
||||||
switch(g_state.numa.numa_strategy) {
|
|
||||||
case GGML_NUMA_STRATEGY_DISTRIBUTE:
|
|
||||||
// run thread on node_num thread_n / (threads per node)
|
|
||||||
node_num = thread_n % g_state.numa.n_nodes;
|
|
||||||
break;
|
|
||||||
case GGML_NUMA_STRATEGY_ISOLATE:
|
|
||||||
// run thread on current_node
|
|
||||||
node_num = g_state.numa.current_node;
|
|
||||||
break;
|
|
||||||
case GGML_NUMA_STRATEGY_NUMACTL:
|
|
||||||
// use the cpuset that numactl gave us
|
|
||||||
rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
|
|
||||||
if (rv) {
|
|
||||||
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
default:
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
|
|
||||||
|
|
||||||
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
|
|
||||||
CPU_ZERO_S(setsize, cpus);
|
|
||||||
for (size_t i = 0; i < node->n_cpus; ++i) {
|
|
||||||
CPU_SET_S(node->cpus[i], setsize, cpus);
|
|
||||||
}
|
|
||||||
|
|
||||||
rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
|
|
||||||
if (rv) {
|
|
||||||
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
|
|
||||||
}
|
|
||||||
|
|
||||||
CPU_FREE(cpus);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void clear_numa_thread_affinity(void) {
|
|
||||||
if (!ggml_is_numa()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
|
|
||||||
|
|
||||||
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
|
|
||||||
CPU_ZERO_S(setsize, cpus);
|
|
||||||
for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
|
|
||||||
CPU_SET_S(i, setsize, cpus);
|
|
||||||
}
|
|
||||||
|
|
||||||
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
|
|
||||||
if (rv) {
|
|
||||||
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
|
|
||||||
}
|
|
||||||
|
|
||||||
CPU_FREE(cpus);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
// TODO: Windows etc.
|
|
||||||
// (the linux implementation may also work on BSD, someone should test)
|
|
||||||
static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
|
|
||||||
static void clear_numa_thread_affinity(void) {}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
|
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
|
||||||
int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
|
int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
|
||||||
|
|
20
ggml.h
20
ggml.h
|
@ -689,16 +689,6 @@ extern "C" {
|
||||||
void * wdata;
|
void * wdata;
|
||||||
};
|
};
|
||||||
|
|
||||||
// numa strategies
|
|
||||||
enum ggml_numa_strategy {
|
|
||||||
GGML_NUMA_STRATEGY_DISABLED = 0,
|
|
||||||
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
|
|
||||||
GGML_NUMA_STRATEGY_ISOLATE = 2,
|
|
||||||
GGML_NUMA_STRATEGY_NUMACTL = 3,
|
|
||||||
GGML_NUMA_STRATEGY_MIRROR = 4,
|
|
||||||
GGML_NUMA_STRATEGY_COUNT
|
|
||||||
};
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// GUID
|
// GUID
|
||||||
//
|
//
|
||||||
|
@ -722,6 +712,16 @@ extern "C" {
|
||||||
// accepts a UTF-8 path, even on Windows
|
// accepts a UTF-8 path, even on Windows
|
||||||
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
|
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
|
||||||
|
|
||||||
|
// numa strategies
|
||||||
|
enum ggml_numa_strategy {
|
||||||
|
GGML_NUMA_STRATEGY_DISABLED = 0,
|
||||||
|
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
|
||||||
|
GGML_NUMA_STRATEGY_ISOLATE = 2,
|
||||||
|
GGML_NUMA_STRATEGY_NUMACTL = 3,
|
||||||
|
GGML_NUMA_STRATEGY_MIRROR = 4,
|
||||||
|
GGML_NUMA_STRATEGY_COUNT
|
||||||
|
};
|
||||||
|
|
||||||
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
||||||
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue