From db0aba2d53ccb6aa2c1f47f3c7985f4b938f7d7c Mon Sep 17 00:00:00 2001 From: Kunnis Date: Mon, 27 May 2024 16:26:29 -0500 Subject: [PATCH 1/2] adding in x64 targets. --- CMakePresets.json | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index ad1af7ecc..e2b7a79e3 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -1,4 +1,4 @@ -{ +{ "version": 4, "configurePresets": [ { @@ -40,6 +40,10 @@ { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] }, { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] }, - { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] } + { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }, + + { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] }, + { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] }, + { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] } ] } From b414599a720f4ed847e7f9f88051790d1671c4f6 Mon Sep 17 00:00:00 2001 From: Kunnis Date: Mon, 27 May 2024 18:37:46 -0500 Subject: [PATCH 2/2] Moving all the threading operations to it's own file --- CMakeLists.txt | 2 +- ggml-threading.cpp | 280 +++++++++++++++++++++++++++++++++++ ggml-threading.h | 120 +++++++++++++++ ggml.c | 354 +-------------------------------------------- ggml.h | 20 +-- 5 files changed, 414 insertions(+), 362 deletions(-) create mode 100644 ggml-threading.cpp create mode 100644 ggml-threading.h diff --git a/CMakeLists.txt b/CMakeLists.txt index c5add8239..0a2750588 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1221,7 +1221,7 @@ add_library(ggml OBJECT ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN} ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} - ) + "ggml-threading.cpp" "ggml-threading.h") target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES}) target_compile_features (ggml PUBLIC c_std_11) # don't bump diff --git a/ggml-threading.cpp b/ggml-threading.cpp new file mode 100644 index 000000000..6a7853e48 --- /dev/null +++ b/ggml-threading.cpp @@ -0,0 +1,280 @@ +#include "ggml-threading.h" +#include + +#define GGML_UNUSED(x) (void)(x) + +// +// NUMA support +// + +#define GGML_NUMA_MAX_NODES 8 +#define GGML_NUMA_MAX_CPUS 512 + +struct ggml_numa_node { + uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node + uint32_t n_cpus; +}; + +struct ggml_numa_nodes { + enum ggml_numa_strategy numa_strategy; + struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; + uint32_t n_nodes; + uint32_t total_cpus; // hardware threads on system + uint32_t current_node; // node on which main process is execting +#if defined(__gnu_linux__) + cpu_set_t cpuset; // cpuset from numactl +#else + uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype +#endif +}; + +struct ggml_numa_nodes g_state_numa; + +#if defined(__gnu_linux__) +static cpu_set_t ggml_get_numa_affinity(void) { + cpu_set_t cpuset; + pthread_t thread; + thread = pthread_self(); + CPU_ZERO(&cpuset); + pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset); + return cpuset; +} +#else +static uint32_t ggml_get_numa_affinity(void) { + return 0; // no NUMA support +} +#endif + +void ggml_numa_zero() { + g_state_numa.n_nodes = 0; + g_state_numa.total_cpus = 0; +} + +void ggml_numa_init(enum ggml_numa_strategy numa_flag) { + if (g_state_numa.n_nodes > 0) { + fprintf(stderr, "ggml_numa_init: NUMA already initialized\n"); + + return; + } + +#if defined(__gnu_linux__) + struct stat st; + char path[256]; + int rv; + + // set numa scheme + g_state_numa.numa_strategy = numa_flag; + + GGML_PRINT_DEBUG("numa strategy %u\n", g_state_numa.numa_strategy); + + g_state_numa.cpuset = ggml_get_numa_affinity(); + + // enumerate nodes + while (g_state_numa.n_nodes < GGML_NUMA_MAX_NODES) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state_numa.n_nodes); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) != 0) { break; } + ++g_state_numa.n_nodes; + } + + // enumerate CPUs + while (g_state_numa.total_cpus < GGML_NUMA_MAX_CPUS) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state_numa.total_cpus); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) != 0) { break; } + ++g_state_numa.total_cpus; + } + + GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state_numa.n_nodes, g_state_numa.total_cpus); + + // figure out which node we're on + uint current_cpu; + int getcpu_ret = 0; +#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__) + getcpu_ret = getcpu(¤t_cpu, &g_state_numa.current_node); +#else + // old glibc doesn't have a wrapper for this call. Fall back on direct syscall +# if !defined(SYS_getcpu) && defined(SYS_get_cpu) +# define SYS_getcpu SYS_get_cpu // some older glibc versions use this name +# endif + getcpu_ret = syscall(SYS_getcpu, ¤t_cpu, &g_state_numa.current_node); +#endif + + if (g_state_numa.n_nodes < 1 || g_state_numa.total_cpus < 1 || getcpu_ret != 0) { + g_state_numa.n_nodes = 0; + return; + } + + GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state_numa.current_node, current_cpu); + + for (uint32_t n = 0; n < g_state_numa.n_nodes; ++n) { + struct ggml_numa_node* node = &g_state_numa.nodes[n]; + GGML_PRINT_DEBUG("CPUs on node %u:", n); + node->n_cpus = 0; + for (uint32_t c = 0; c < g_state_numa.total_cpus; ++c) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) == 0) { + node->cpus[node->n_cpus++] = c; + GGML_PRINT_DEBUG(" %u", c); + } + } + GGML_PRINT_DEBUG("\n"); + } + + if (ggml_is_numa()) { + FILE* fptr = fopen("/proc/sys/kernel/numa_balancing", "r"); + if (fptr != NULL) { + char buf[42]; + if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) { + GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); + } + fclose(fptr); + } + } +#else + GGML_UNUSED(numa_flag); + // TODO +#endif +} + +bool ggml_is_numa(void) { + return g_state_numa.n_nodes > 1; +} + +//////////////////////////////////////////////////////////////////////////////// + +#if defined(_WIN32) + +void atomic_store(atomic_int* ptr, LONG val) { + InterlockedExchange(ptr, val); +} +LONG atomic_load(atomic_int* ptr) { + return InterlockedCompareExchange(ptr, 0, 0); +} +LONG atomic_fetch_add(atomic_int* ptr, LONG inc) { + return InterlockedExchangeAdd(ptr, inc); +} +LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) { + return atomic_fetch_add(ptr, -(dec)); +} + +int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { + (void)unused; + HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, arg, 0, NULL); + if (handle == NULL) + { + return EAGAIN; + } + + *out = handle; + return 0; +} + +int pthread_join(pthread_t thread, void* unused) { + (void)unused; + int ret = (int)WaitForSingleObject(thread, INFINITE); + CloseHandle(thread); + return ret; +} + +int sched_yield(void) { + Sleep(0); + return 0; +} + +#endif + +static atomic_int g_state_barrier = 0; + +// barrier via spin lock for g_state +void ggml_critical_section_start(void) { + int processing = atomic_fetch_add(&g_state_barrier, 1); + + while (processing > 0) { + // wait for other threads to finish + atomic_fetch_sub(&g_state_barrier, 1); + sched_yield(); // TODO: reconsider this + processing = atomic_fetch_add(&g_state_barrier, 1); + } +} + +// TODO: make this somehow automatically executed +// some sort of "sentry" mechanism +void ggml_critical_section_end(void) { + atomic_fetch_sub(&g_state_barrier, 1); +} + +// Android's libc implementation "bionic" does not support setting affinity +#if defined(__gnu_linux__) +void set_numa_thread_affinity(int thread_n) { + if (!ggml_is_numa()) { + return; + } + + int node_num; + int rv; + size_t setsize = CPU_ALLOC_SIZE(g_state_numa.total_cpus); + + switch (g_state_numa.numa_strategy) { + case GGML_NUMA_STRATEGY_DISTRIBUTE: + // run thread on node_num thread_n / (threads per node) + node_num = thread_n % g_state_numa.n_nodes; + break; + case GGML_NUMA_STRATEGY_ISOLATE: + // run thread on current_node + node_num = g_state_numa.current_node; + break; + case GGML_NUMA_STRATEGY_NUMACTL: + // use the cpuset that numactl gave us + rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state_numa.cpuset); + if (rv) { + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); + } + return; + default: + return; + } + + struct ggml_numa_node* node = &g_state_numa.nodes[node_num]; + + cpu_set_t* cpus = CPU_ALLOC(g_state_numa.total_cpus); + CPU_ZERO_S(setsize, cpus); + for (size_t i = 0; i < node->n_cpus; ++i) { + CPU_SET_S(node->cpus[i], setsize, cpus); + } + + rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); + if (rv) { + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); + } + + CPU_FREE(cpus); +} + +void clear_numa_thread_affinity(void) { + if (!ggml_is_numa()) { + return; + } + + size_t setsize = CPU_ALLOC_SIZE(g_state_numa.total_cpus); + + cpu_set_t* cpus = CPU_ALLOC(g_state_numa.total_cpus); + CPU_ZERO_S(setsize, cpus); + for (unsigned i = 0; i < g_state_numa.total_cpus; ++i) { + CPU_SET_S(i, setsize, cpus); + } + + int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); + if (rv) { + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); + } + + CPU_FREE(cpus); +} +#else +// TODO: Windows etc. +// (the linux implementation may also work on BSD, someone should test) +void set_numa_thread_affinity(int thread_n) { GGML_UNUSED(thread_n); } +void clear_numa_thread_affinity(void) {} +#endif diff --git a/ggml-threading.h b/ggml-threading.h new file mode 100644 index 000000000..d935d0061 --- /dev/null +++ b/ggml-threading.h @@ -0,0 +1,120 @@ +#include +#include + +#if defined(_WIN32) + +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef volatile LONG atomic_int; +typedef atomic_int atomic_bool; + +extern void atomic_store(atomic_int* ptr, LONG val); +extern LONG atomic_load(atomic_int* ptr); +extern LONG atomic_fetch_add(atomic_int* ptr, LONG inc); +extern LONG atomic_fetch_sub(atomic_int* ptr, LONG dec); + +typedef HANDLE pthread_t; + +typedef DWORD thread_ret_t; + +extern int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg); +extern int pthread_join(pthread_t thread, void* unused); + +extern int sched_yield(void); + +#else +#include +#include + +typedef void* thread_ret_t; + +#include +#include +#include + +#endif + +typedef pthread_t ggml_thread_t; + +// barrier via spin lock +extern void ggml_critical_section_start(void); +extern void ggml_critical_section_end(void); + +extern void ggml_numa_zero(); + +extern void ggml_numa_init(enum ggml_numa_strategy numa_flag); + + +// +// thread data +// +// synchronization is done via busy loops +// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops +// + +#ifdef __APPLE__ + +//#include +// +//typedef os_unfair_lock ggml_lock_t; +// +//#define ggml_lock_init(x) UNUSED(x) +//#define ggml_lock_destroy(x) UNUSED(x) +//#define ggml_lock_lock os_unfair_lock_lock +//#define ggml_lock_unlock os_unfair_lock_unlock +// +//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT + +typedef int ggml_lock_t; + +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) +#define ggml_lock_lock(x) UNUSED(x) +#define ggml_lock_unlock(x) UNUSED(x) + +#define GGML_LOCK_INITIALIZER 0 + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#else + +//typedef pthread_spinlock_t ggml_lock_t; + +//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) +//#define ggml_lock_destroy pthread_spin_destroy +//#define ggml_lock_lock pthread_spin_lock +//#define ggml_lock_unlock pthread_spin_unlock + +typedef int ggml_lock_t; + +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#define ggml_lock_lock(x) _mm_pause() +#else +#define ggml_lock_lock(x) UNUSED(x) +#endif +#define ggml_lock_unlock(x) UNUSED(x) + +#define GGML_LOCK_INITIALIZER 0 + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#endif + +extern void set_numa_thread_affinity(int thread_n); +extern void clear_numa_thread_affinity(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml.c b/ggml.c index 5145ceec9..d15b77f1e 100644 --- a/ggml.c +++ b/ggml.c @@ -50,69 +50,7 @@ #pragma warning(disable: 4996) #endif -#if defined(_WIN32) - -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX - #define NOMINMAX -#endif -#include - -typedef volatile LONG atomic_int; -typedef atomic_int atomic_bool; - -static void atomic_store(atomic_int * ptr, LONG val) { - InterlockedExchange(ptr, val); -} -static LONG atomic_load(atomic_int * ptr) { - return InterlockedCompareExchange(ptr, 0, 0); -} -static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) { - return InterlockedExchangeAdd(ptr, inc); -} -static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { - return atomic_fetch_add(ptr, -(dec)); -} - -typedef HANDLE pthread_t; - -typedef DWORD thread_ret_t; -static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) { - (void) unused; - HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); - if (handle == NULL) - { - return EAGAIN; - } - - *out = handle; - return 0; -} - -static int pthread_join(pthread_t thread, void * unused) { - (void) unused; - int ret = (int) WaitForSingleObject(thread, INFINITE); - CloseHandle(thread); - return ret; -} - -static int sched_yield (void) { - Sleep (0); - return 0; -} -#else -#include -#include - -typedef void * thread_ret_t; - -#include -#include -#include - -#endif - -typedef pthread_t ggml_thread_t; +#include "ggml-threading.h" #ifdef GGML_USE_CPU_HBM #include @@ -2847,30 +2785,7 @@ static void ggml_setup_op_has_task_pass(void) { } } -// -// NUMA support -// -#define GGML_NUMA_MAX_NODES 8 -#define GGML_NUMA_MAX_CPUS 512 - -struct ggml_numa_node { - uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node - uint32_t n_cpus; -}; - -struct ggml_numa_nodes { - enum ggml_numa_strategy numa_strategy; - struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; - uint32_t n_nodes; - uint32_t total_cpus; // hardware threads on system - uint32_t current_node; // node on which main process is execting -#if defined(__gnu_linux__) - cpu_set_t cpuset; // cpuset from numactl -#else - uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype -#endif -}; // // ggml state @@ -2878,139 +2793,10 @@ struct ggml_numa_nodes { struct ggml_state { struct ggml_context_container contexts[GGML_MAX_CONTEXTS]; - struct ggml_numa_nodes numa; }; // global state static struct ggml_state g_state; -static atomic_int g_state_barrier = 0; - -// barrier via spin lock -inline static void ggml_critical_section_start(void) { - int processing = atomic_fetch_add(&g_state_barrier, 1); - - while (processing > 0) { - // wait for other threads to finish - atomic_fetch_sub(&g_state_barrier, 1); - sched_yield(); // TODO: reconsider this - processing = atomic_fetch_add(&g_state_barrier, 1); - } -} - -// TODO: make this somehow automatically executed -// some sort of "sentry" mechanism -inline static void ggml_critical_section_end(void) { - atomic_fetch_sub(&g_state_barrier, 1); -} - -#if defined(__gnu_linux__) -static cpu_set_t ggml_get_numa_affinity(void) { - cpu_set_t cpuset; - pthread_t thread; - thread = pthread_self(); - CPU_ZERO(&cpuset); - pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset); - return cpuset; -} -#else -static uint32_t ggml_get_numa_affinity(void) { - return 0; // no NUMA support -} -#endif - -void ggml_numa_init(enum ggml_numa_strategy numa_flag) { - if (g_state.numa.n_nodes > 0) { - fprintf(stderr, "ggml_numa_init: NUMA already initialized\n"); - - return; - } - -#if defined(__gnu_linux__) - struct stat st; - char path[256]; - int rv; - - // set numa scheme - g_state.numa.numa_strategy = numa_flag; - - GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy); - - g_state.numa.cpuset = ggml_get_numa_affinity(); - - // enumerate nodes - while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) { - rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes); - GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); - if (stat(path, &st) != 0) { break; } - ++g_state.numa.n_nodes; - } - - // enumerate CPUs - while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) { - rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus); - GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); - if (stat(path, &st) != 0) { break; } - ++g_state.numa.total_cpus; - } - - GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus); - - // figure out which node we're on - uint current_cpu; - int getcpu_ret = 0; -#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__) - getcpu_ret = getcpu(¤t_cpu, &g_state.numa.current_node); -#else - // old glibc doesn't have a wrapper for this call. Fall back on direct syscall -# if !defined(SYS_getcpu) && defined(SYS_get_cpu) -# define SYS_getcpu SYS_get_cpu // some older glibc versions use this name -# endif - getcpu_ret = syscall(SYS_getcpu, ¤t_cpu, &g_state.numa.current_node); -#endif - - if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) { - g_state.numa.n_nodes = 0; - return; - } - - GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu); - - for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) { - struct ggml_numa_node * node = &g_state.numa.nodes[n]; - GGML_PRINT_DEBUG("CPUs on node %u:", n); - node->n_cpus = 0; - for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) { - rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c); - GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); - if (stat(path, &st) == 0) { - node->cpus[node->n_cpus++] = c; - GGML_PRINT_DEBUG(" %u", c); - } - } - GGML_PRINT_DEBUG("\n"); - } - - if (ggml_is_numa()) { - FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r"); - if (fptr != NULL) { - char buf[42]; - if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) { - GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); - } - fclose(fptr); - } - } -#else - GGML_UNUSED(numa_flag); - // TODO -#endif -} - -bool ggml_is_numa(void) { - return g_state.numa.n_nodes > 1; -} - -//////////////////////////////////////////////////////////////////////////////// void ggml_print_object(const struct ggml_object * obj) { GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n", @@ -3342,12 +3128,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { g_state = (struct ggml_state) { /*.contexts =*/ { { 0 } }, - /*.numa =*/ { - .n_nodes = 0, - .total_cpus = 0, - }, }; + ggml_numa_zero(); + for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) { g_state.contexts[i].used = false; } @@ -18893,138 +18677,6 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) { memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *)); } -// -// thread data -// -// synchronization is done via busy loops -// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops -// - -#ifdef __APPLE__ - -//#include -// -//typedef os_unfair_lock ggml_lock_t; -// -//#define ggml_lock_init(x) UNUSED(x) -//#define ggml_lock_destroy(x) UNUSED(x) -//#define ggml_lock_lock os_unfair_lock_lock -//#define ggml_lock_unlock os_unfair_lock_unlock -// -//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT - -typedef int ggml_lock_t; - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#define ggml_lock_lock(x) UNUSED(x) -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#else - -//typedef pthread_spinlock_t ggml_lock_t; - -//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) -//#define ggml_lock_destroy pthread_spin_destroy -//#define ggml_lock_lock pthread_spin_lock -//#define ggml_lock_unlock pthread_spin_unlock - -typedef int ggml_lock_t; - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) -#define ggml_lock_lock(x) _mm_pause() -#else -#define ggml_lock_lock(x) UNUSED(x) -#endif -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#endif - -// Android's libc implementation "bionic" does not support setting affinity -#if defined(__gnu_linux__) -static void set_numa_thread_affinity(int thread_n) { - if (!ggml_is_numa()) { - return; - } - - int node_num; - int rv; - size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); - - switch(g_state.numa.numa_strategy) { - case GGML_NUMA_STRATEGY_DISTRIBUTE: - // run thread on node_num thread_n / (threads per node) - node_num = thread_n % g_state.numa.n_nodes; - break; - case GGML_NUMA_STRATEGY_ISOLATE: - // run thread on current_node - node_num = g_state.numa.current_node; - break; - case GGML_NUMA_STRATEGY_NUMACTL: - // use the cpuset that numactl gave us - rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset); - if (rv) { - fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv)); - } - return; - default: - return; - } - - struct ggml_numa_node * node = &g_state.numa.nodes[node_num]; - - cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); - CPU_ZERO_S(setsize, cpus); - for (size_t i = 0; i < node->n_cpus; ++i) { - CPU_SET_S(node->cpus[i], setsize, cpus); - } - - rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); - if (rv) { - fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); - } - - CPU_FREE(cpus); -} - -static void clear_numa_thread_affinity(void) { - if (!ggml_is_numa()) { - return; - } - - size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); - - cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); - CPU_ZERO_S(setsize, cpus); - for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) { - CPU_SET_S(i, setsize, cpus); - } - - int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); - if (rv) { - fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); - } - - CPU_FREE(cpus); -} -#else -// TODO: Windows etc. -// (the linux implementation may also work on BSD, someone should test) -static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); } -static void clear_numa_thread_affinity(void) {} -#endif static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) { int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles; diff --git a/ggml.h b/ggml.h index f803ba724..d31083244 100644 --- a/ggml.h +++ b/ggml.h @@ -689,16 +689,6 @@ extern "C" { void * wdata; }; - // numa strategies - enum ggml_numa_strategy { - GGML_NUMA_STRATEGY_DISABLED = 0, - GGML_NUMA_STRATEGY_DISTRIBUTE = 1, - GGML_NUMA_STRATEGY_ISOLATE = 2, - GGML_NUMA_STRATEGY_NUMACTL = 3, - GGML_NUMA_STRATEGY_MIRROR = 4, - GGML_NUMA_STRATEGY_COUNT - }; - // // GUID // @@ -722,6 +712,16 @@ extern "C" { // accepts a UTF-8 path, even on Windows GGML_API FILE * ggml_fopen(const char * fname, const char * mode); + // numa strategies + enum ggml_numa_strategy { + GGML_NUMA_STRATEGY_DISABLED = 0, + GGML_NUMA_STRATEGY_DISTRIBUTE = 1, + GGML_NUMA_STRATEGY_ISOLATE = 2, + GGML_NUMA_STRATEGY_NUMACTL = 3, + GGML_NUMA_STRATEGY_MIRROR = 4, + GGML_NUMA_STRATEGY_COUNT + }; + GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node