Moving all the threading operations to its own file
This commit is contained in:
parent
db0aba2d53
commit
b414599a72
5 changed files with 414 additions and 362 deletions
|
@ -1221,7 +1221,7 @@ add_library(ggml OBJECT
|
||||||
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
|
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
|
||||||
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
|
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
|
||||||
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
|
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
|
||||||
)
|
"ggml-threading.cpp" "ggml-threading.h")
|
||||||
|
|
||||||
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
|
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
|
||||||
target_compile_features (ggml PUBLIC c_std_11) # don't bump
|
target_compile_features (ggml PUBLIC c_std_11) # don't bump
|
||||||
|
|
280
ggml-threading.cpp
Normal file
280
ggml-threading.cpp
Normal file
|
@ -0,0 +1,280 @@
|
||||||
|
#include "ggml-threading.h"
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#define GGML_UNUSED(x) (void)(x)
|
||||||
|
|
||||||
|
//
// NUMA support
//

// upper bounds for the sysfs node/cpu scan in ggml_numa_init()
#define GGML_NUMA_MAX_NODES 8
#define GGML_NUMA_MAX_CPUS 512

// one NUMA node: the set of hardware threads that belong to it
struct ggml_numa_node {
    uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
    uint32_t n_cpus;                   // number of valid entries in cpus[]
};

// process-wide NUMA topology, filled in by ggml_numa_init()
struct ggml_numa_nodes {
    enum ggml_numa_strategy numa_strategy;            // thread-pinning strategy selected at init
    struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; // per-node CPU lists
    uint32_t n_nodes;      // nodes detected; 0 means "not initialized / NUMA disabled"
    uint32_t total_cpus;   // hardware threads on system
    uint32_t current_node; // node on which main process is executing
#if defined(__gnu_linux__)
    cpu_set_t cpuset; // cpuset from numactl
#else
    uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
#endif
};

// global NUMA state shared by the functions in this file
struct ggml_numa_nodes g_state_numa;
|
||||||
|
|
||||||
|
#if defined(__gnu_linux__)
// query the cpuset the current thread is allowed to run on
// (i.e. the affinity mask that numactl/taskset handed to the process)
static cpu_set_t ggml_get_numa_affinity(void) {
    cpu_set_t cpuset;
    pthread_t thread;
    thread = pthread_self();
    CPU_ZERO(&cpuset);
    // NOTE(review): return value ignored - on failure cpuset stays all-zero
    pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
    return cpuset;
}
#else
// no NUMA support outside of Linux at this time; keep the call site portable
static uint32_t ggml_get_numa_affinity(void) {
    return 0; // no NUMA support
}
#endif
|
||||||
|
|
||||||
|
void ggml_numa_zero() {
|
||||||
|
g_state_numa.n_nodes = 0;
|
||||||
|
g_state_numa.total_cpus = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Detect the system's NUMA topology (Linux only, via sysfs) and record it in
// g_state_numa; numa_flag selects the thread-pinning strategy later applied by
// set_numa_thread_affinity(). Repeated calls after a successful init are no-ops.
void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
    if (g_state_numa.n_nodes > 0) {
        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");

        return;
    }

#if defined(__gnu_linux__)
    struct stat st;
    char path[256];
    int rv;

    // set numa scheme
    g_state_numa.numa_strategy = numa_flag;

    GGML_PRINT_DEBUG("numa strategy %u\n", g_state_numa.numa_strategy);

    g_state_numa.cpuset = ggml_get_numa_affinity();

    // enumerate nodes: count existing /sys/devices/system/node/node<N> entries
    while (g_state_numa.n_nodes < GGML_NUMA_MAX_NODES) {
        rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state_numa.n_nodes);
        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        if (stat(path, &st) != 0) { break; }
        ++g_state_numa.n_nodes;
    }

    // enumerate CPUs: count existing /sys/devices/system/cpu/cpu<N> entries
    while (g_state_numa.total_cpus < GGML_NUMA_MAX_CPUS) {
        rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state_numa.total_cpus);
        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        if (stat(path, &st) != 0) { break; }
        ++g_state_numa.total_cpus;
    }

    GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state_numa.n_nodes, g_state_numa.total_cpus);

    // figure out which node we're on
    // NOTE(review): `uint` is a non-standard alias (from <sys/types.h>);
    // consider uint32_t here - getcpu() expects unsigned int
    uint current_cpu;
    int getcpu_ret = 0;
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__)
    getcpu_ret = getcpu(&current_cpu, &g_state_numa.current_node);
#else
    // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
#   if !defined(SYS_getcpu) && defined(SYS_get_cpu)
#       define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
#   endif
    getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state_numa.current_node);
#endif

    // if the topology could not be determined, disable NUMA handling entirely
    if (g_state_numa.n_nodes < 1 || g_state_numa.total_cpus < 1 || getcpu_ret != 0) {
        g_state_numa.n_nodes = 0;
        return;
    }

    GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state_numa.current_node, current_cpu);

    // record which CPUs belong to each node
    for (uint32_t n = 0; n < g_state_numa.n_nodes; ++n) {
        struct ggml_numa_node* node = &g_state_numa.nodes[n];
        GGML_PRINT_DEBUG("CPUs on node %u:", n);
        node->n_cpus = 0;
        for (uint32_t c = 0; c < g_state_numa.total_cpus; ++c) {
            rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
            GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
            if (stat(path, &st) == 0) {
                node->cpus[node->n_cpus++] = c;
                GGML_PRINT_DEBUG(" %u", c);
            }
        }
        GGML_PRINT_DEBUG("\n");
    }

    if (ggml_is_numa()) {
        // automatic NUMA balancing migrates pages behind our back; warn if on
        FILE* fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
        if (fptr != NULL) {
            char buf[42];
            if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
                GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
            }
            fclose(fptr);
        }
    }
#else
    GGML_UNUSED(numa_flag);
    // TODO
#endif
}
|
||||||
|
|
||||||
|
bool ggml_is_numa(void) {
|
||||||
|
return g_state_numa.n_nodes > 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////

#if defined(_WIN32)

// C11-atomics-style wrappers built on Win32 Interlocked* intrinsics
// (declared in ggml-threading.h; MSVC has no <stdatomic.h> for C)
void atomic_store(atomic_int* ptr, LONG val) {
    InterlockedExchange(ptr, val);
}
LONG atomic_load(atomic_int* ptr) {
    // compare-exchange with equal comparand/exchange leaves *ptr unchanged
    // and returns its current value
    return InterlockedCompareExchange(ptr, 0, 0);
}
LONG atomic_fetch_add(atomic_int* ptr, LONG inc) {
    return InterlockedExchangeAdd(ptr, inc);
}
LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) {
    return atomic_fetch_add(ptr, -(dec));
}

// minimal pthread shim on top of Win32 threads; `unused` mirrors the
// pthread_attr_t parameter and is ignored
int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
    (void)unused;
    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, arg, 0, NULL);
    if (handle == NULL)
    {
        return EAGAIN;
    }

    *out = handle;
    return 0;
}

int pthread_join(pthread_t thread, void* unused) {
    (void)unused;
    // NOTE(review): returns the WaitForSingleObject code, which is 0
    // (WAIT_OBJECT_0) on success - matching pthread_join's 0-on-success
    int ret = (int)WaitForSingleObject(thread, INFINITE);
    CloseHandle(thread);
    return ret;
}

// yield the remainder of this thread's time slice
int sched_yield(void) {
    Sleep(0);
    return 0;
}

#endif
|
||||||
|
|
||||||
|
// barrier via spin lock for g_state
static atomic_int g_state_barrier = 0;

// Enter the global critical section. Our increment "wins" only when it
// observes the counter at zero (nobody else inside); otherwise undo the
// increment, yield, and retry.
void ggml_critical_section_start(void) {
    while (atomic_fetch_add(&g_state_barrier, 1) > 0) {
        // wait for other threads to finish
        atomic_fetch_sub(&g_state_barrier, 1);
        sched_yield(); // TODO: reconsider this
    }
}

// TODO: make this somehow automatically executed
// some sort of "sentry" mechanism
void ggml_critical_section_end(void) {
    atomic_fetch_sub(&g_state_barrier, 1);
}
|
||||||
|
|
||||||
|
// Android's libc implementation "bionic" does not support setting affinity
|
||||||
|
#if defined(__gnu_linux__)
|
||||||
|
void set_numa_thread_affinity(int thread_n) {
|
||||||
|
if (!ggml_is_numa()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int node_num;
|
||||||
|
int rv;
|
||||||
|
size_t setsize = CPU_ALLOC_SIZE(g_state_numa.total_cpus);
|
||||||
|
|
||||||
|
switch (g_state_numa.numa_strategy) {
|
||||||
|
case GGML_NUMA_STRATEGY_DISTRIBUTE:
|
||||||
|
// run thread on node_num thread_n / (threads per node)
|
||||||
|
node_num = thread_n % g_state_numa.n_nodes;
|
||||||
|
break;
|
||||||
|
case GGML_NUMA_STRATEGY_ISOLATE:
|
||||||
|
// run thread on current_node
|
||||||
|
node_num = g_state_numa.current_node;
|
||||||
|
break;
|
||||||
|
case GGML_NUMA_STRATEGY_NUMACTL:
|
||||||
|
// use the cpuset that numactl gave us
|
||||||
|
rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state_numa.cpuset);
|
||||||
|
if (rv) {
|
||||||
|
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
default:
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_numa_node* node = &g_state_numa.nodes[node_num];
|
||||||
|
|
||||||
|
cpu_set_t* cpus = CPU_ALLOC(g_state_numa.total_cpus);
|
||||||
|
CPU_ZERO_S(setsize, cpus);
|
||||||
|
for (size_t i = 0; i < node->n_cpus; ++i) {
|
||||||
|
CPU_SET_S(node->cpus[i], setsize, cpus);
|
||||||
|
}
|
||||||
|
|
||||||
|
rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
|
||||||
|
if (rv) {
|
||||||
|
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
|
||||||
|
}
|
||||||
|
|
||||||
|
CPU_FREE(cpus);
|
||||||
|
}
|
||||||
|
|
||||||
|
void clear_numa_thread_affinity(void) {
|
||||||
|
if (!ggml_is_numa()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t setsize = CPU_ALLOC_SIZE(g_state_numa.total_cpus);
|
||||||
|
|
||||||
|
cpu_set_t* cpus = CPU_ALLOC(g_state_numa.total_cpus);
|
||||||
|
CPU_ZERO_S(setsize, cpus);
|
||||||
|
for (unsigned i = 0; i < g_state_numa.total_cpus; ++i) {
|
||||||
|
CPU_SET_S(i, setsize, cpus);
|
||||||
|
}
|
||||||
|
|
||||||
|
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
|
||||||
|
if (rv) {
|
||||||
|
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
|
||||||
|
}
|
||||||
|
|
||||||
|
CPU_FREE(cpus);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
// TODO: Windows etc.
|
||||||
|
// (the linux implementation may also work on BSD, someone should test)
|
||||||
|
void set_numa_thread_affinity(int thread_n) { GGML_UNUSED(thread_n); }
|
||||||
|
void clear_numa_thread_affinity(void) {}
|
||||||
|
#endif
|
120
ggml-threading.h
Normal file
120
ggml-threading.h
Normal file
|
@ -0,0 +1,120 @@
|
||||||
|
// ggml-threading.h: portable threading/atomics shims and NUMA thread-affinity
// helpers shared by the ggml core. On Windows a tiny pthread/stdatomic
// emulation layer is declared here (implemented in ggml-threading.cpp);
// elsewhere the real <pthread.h>/<stdatomic.h> are used.
//
// Fixes vs. the original: added an include guard; moved `extern "C" {` out of
// the `#if defined(_WIN32)` branch (it was only opened on Windows but closed
// unconditionally at the bottom, breaking every non-Windows C++ build) while
// keeping system includes outside the C-linkage region; `ggml_numa_zero`
// prototype now uses (void).
#pragma once

#include <stdint.h>
#include <llama.h> // for enum ggml_numa_strategy

#if defined(_WIN32)

#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>

#else

#include <pthread.h>
#include <stdatomic.h>

typedef void* thread_ret_t;

#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#endif

#ifdef __cplusplus
extern "C" {
#endif

#if defined(_WIN32)

// minimal C11-atomics lookalikes on top of Win32 Interlocked* (see .cpp)
typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;

extern void atomic_store(atomic_int* ptr, LONG val);
extern LONG atomic_load(atomic_int* ptr);
extern LONG atomic_fetch_add(atomic_int* ptr, LONG inc);
extern LONG atomic_fetch_sub(atomic_int* ptr, LONG dec);

// minimal pthread lookalikes on top of Win32 threads (see .cpp)
typedef HANDLE pthread_t;

typedef DWORD thread_ret_t;

extern int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg);
extern int pthread_join(pthread_t thread, void* unused);

extern int sched_yield(void);

#endif

typedef pthread_t ggml_thread_t;

// barrier via spin lock
extern void ggml_critical_section_start(void);
extern void ggml_critical_section_end(void);

// reset the global NUMA state (called once from ggml_init)
extern void ggml_numa_zero(void);

// detect the NUMA topology and select a thread-pinning strategy
extern void ggml_numa_init(enum ggml_numa_strategy numa_flag);

//
// thread data
//
// synchronization is done via busy loops
// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops
//

// NOTE(review): the ggml_lock_* macros below expand to UNUSED(x), but UNUSED
// is not defined in this header - they rely on the including .c defining it.

#ifdef __APPLE__

//#include <os/lock.h>
//
//typedef os_unfair_lock ggml_lock_t;
//
//#define ggml_lock_init(x)    UNUSED(x)
//#define ggml_lock_destroy(x) UNUSED(x)
//#define ggml_lock_lock       os_unfair_lock_lock
//#define ggml_lock_unlock     os_unfair_lock_unlock
//
//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT

typedef int ggml_lock_t;

#define ggml_lock_init(x)    UNUSED(x)
#define ggml_lock_destroy(x) UNUSED(x)
#define ggml_lock_lock(x)    UNUSED(x)
#define ggml_lock_unlock(x)  UNUSED(x)

#define GGML_LOCK_INITIALIZER 0

#define ggml_thread_create pthread_create
#define ggml_thread_join   pthread_join

#else

//typedef pthread_spinlock_t ggml_lock_t;

//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE)
//#define ggml_lock_destroy pthread_spin_destroy
//#define ggml_lock_lock    pthread_spin_lock
//#define ggml_lock_unlock  pthread_spin_unlock

typedef int ggml_lock_t;

#define ggml_lock_init(x)    UNUSED(x)
#define ggml_lock_destroy(x) UNUSED(x)
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
#define ggml_lock_lock(x)    _mm_pause()
#else
#define ggml_lock_lock(x)    UNUSED(x)
#endif
#define ggml_lock_unlock(x)  UNUSED(x)

#define GGML_LOCK_INITIALIZER 0

#define ggml_thread_create pthread_create
#define ggml_thread_join   pthread_join

#endif

// pin/unpin the calling thread according to the active NUMA strategy
extern void set_numa_thread_affinity(int thread_n);
extern void clear_numa_thread_affinity(void);

#ifdef __cplusplus
}
#endif
|
354
ggml.c
354
ggml.c
|
@ -50,69 +50,7 @@
|
||||||
#pragma warning(disable: 4996)
|
#pragma warning(disable: 4996)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(_WIN32)
|
#include "ggml-threading.h"
|
||||||
|
|
||||||
#define WIN32_LEAN_AND_MEAN
|
|
||||||
#ifndef NOMINMAX
|
|
||||||
#define NOMINMAX
|
|
||||||
#endif
|
|
||||||
#include <windows.h>
|
|
||||||
|
|
||||||
typedef volatile LONG atomic_int;
|
|
||||||
typedef atomic_int atomic_bool;
|
|
||||||
|
|
||||||
static void atomic_store(atomic_int * ptr, LONG val) {
|
|
||||||
InterlockedExchange(ptr, val);
|
|
||||||
}
|
|
||||||
static LONG atomic_load(atomic_int * ptr) {
|
|
||||||
return InterlockedCompareExchange(ptr, 0, 0);
|
|
||||||
}
|
|
||||||
static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
|
|
||||||
return InterlockedExchangeAdd(ptr, inc);
|
|
||||||
}
|
|
||||||
static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
|
|
||||||
return atomic_fetch_add(ptr, -(dec));
|
|
||||||
}
|
|
||||||
|
|
||||||
typedef HANDLE pthread_t;
|
|
||||||
|
|
||||||
typedef DWORD thread_ret_t;
|
|
||||||
static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) {
|
|
||||||
(void) unused;
|
|
||||||
HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
|
|
||||||
if (handle == NULL)
|
|
||||||
{
|
|
||||||
return EAGAIN;
|
|
||||||
}
|
|
||||||
|
|
||||||
*out = handle;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int pthread_join(pthread_t thread, void * unused) {
|
|
||||||
(void) unused;
|
|
||||||
int ret = (int) WaitForSingleObject(thread, INFINITE);
|
|
||||||
CloseHandle(thread);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int sched_yield (void) {
|
|
||||||
Sleep (0);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
#include <pthread.h>
|
|
||||||
#include <stdatomic.h>
|
|
||||||
|
|
||||||
typedef void * thread_ret_t;
|
|
||||||
|
|
||||||
#include <sys/types.h>
|
|
||||||
#include <sys/stat.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef pthread_t ggml_thread_t;
|
|
||||||
|
|
||||||
#ifdef GGML_USE_CPU_HBM
|
#ifdef GGML_USE_CPU_HBM
|
||||||
#include <hbwmalloc.h>
|
#include <hbwmalloc.h>
|
||||||
|
@ -2847,30 +2785,7 @@ static void ggml_setup_op_has_task_pass(void) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
// NUMA support
|
|
||||||
//
|
|
||||||
|
|
||||||
#define GGML_NUMA_MAX_NODES 8
|
|
||||||
#define GGML_NUMA_MAX_CPUS 512
|
|
||||||
|
|
||||||
struct ggml_numa_node {
|
|
||||||
uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
|
|
||||||
uint32_t n_cpus;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ggml_numa_nodes {
|
|
||||||
enum ggml_numa_strategy numa_strategy;
|
|
||||||
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
|
|
||||||
uint32_t n_nodes;
|
|
||||||
uint32_t total_cpus; // hardware threads on system
|
|
||||||
uint32_t current_node; // node on which main process is execting
|
|
||||||
#if defined(__gnu_linux__)
|
|
||||||
cpu_set_t cpuset; // cpuset from numactl
|
|
||||||
#else
|
|
||||||
uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
|
|
||||||
#endif
|
|
||||||
};
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// ggml state
|
// ggml state
|
||||||
|
@ -2878,139 +2793,10 @@ struct ggml_numa_nodes {
|
||||||
|
|
||||||
struct ggml_state {
|
struct ggml_state {
|
||||||
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
|
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
|
||||||
struct ggml_numa_nodes numa;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// global state
|
// global state
|
||||||
static struct ggml_state g_state;
|
static struct ggml_state g_state;
|
||||||
static atomic_int g_state_barrier = 0;
|
|
||||||
|
|
||||||
// barrier via spin lock
|
|
||||||
inline static void ggml_critical_section_start(void) {
|
|
||||||
int processing = atomic_fetch_add(&g_state_barrier, 1);
|
|
||||||
|
|
||||||
while (processing > 0) {
|
|
||||||
// wait for other threads to finish
|
|
||||||
atomic_fetch_sub(&g_state_barrier, 1);
|
|
||||||
sched_yield(); // TODO: reconsider this
|
|
||||||
processing = atomic_fetch_add(&g_state_barrier, 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: make this somehow automatically executed
|
|
||||||
// some sort of "sentry" mechanism
|
|
||||||
inline static void ggml_critical_section_end(void) {
|
|
||||||
atomic_fetch_sub(&g_state_barrier, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(__gnu_linux__)
|
|
||||||
static cpu_set_t ggml_get_numa_affinity(void) {
|
|
||||||
cpu_set_t cpuset;
|
|
||||||
pthread_t thread;
|
|
||||||
thread = pthread_self();
|
|
||||||
CPU_ZERO(&cpuset);
|
|
||||||
pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
|
|
||||||
return cpuset;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
static uint32_t ggml_get_numa_affinity(void) {
|
|
||||||
return 0; // no NUMA support
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
|
|
||||||
if (g_state.numa.n_nodes > 0) {
|
|
||||||
fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(__gnu_linux__)
|
|
||||||
struct stat st;
|
|
||||||
char path[256];
|
|
||||||
int rv;
|
|
||||||
|
|
||||||
// set numa scheme
|
|
||||||
g_state.numa.numa_strategy = numa_flag;
|
|
||||||
|
|
||||||
GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
|
|
||||||
|
|
||||||
g_state.numa.cpuset = ggml_get_numa_affinity();
|
|
||||||
|
|
||||||
// enumerate nodes
|
|
||||||
while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
|
|
||||||
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
|
|
||||||
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
|
|
||||||
if (stat(path, &st) != 0) { break; }
|
|
||||||
++g_state.numa.n_nodes;
|
|
||||||
}
|
|
||||||
|
|
||||||
// enumerate CPUs
|
|
||||||
while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
|
|
||||||
rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
|
|
||||||
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
|
|
||||||
if (stat(path, &st) != 0) { break; }
|
|
||||||
++g_state.numa.total_cpus;
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
|
|
||||||
|
|
||||||
// figure out which node we're on
|
|
||||||
uint current_cpu;
|
|
||||||
int getcpu_ret = 0;
|
|
||||||
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__)
|
|
||||||
getcpu_ret = getcpu(¤t_cpu, &g_state.numa.current_node);
|
|
||||||
#else
|
|
||||||
// old glibc doesn't have a wrapper for this call. Fall back on direct syscall
|
|
||||||
# if !defined(SYS_getcpu) && defined(SYS_get_cpu)
|
|
||||||
# define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
|
|
||||||
# endif
|
|
||||||
getcpu_ret = syscall(SYS_getcpu, ¤t_cpu, &g_state.numa.current_node);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
|
|
||||||
g_state.numa.n_nodes = 0;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
|
|
||||||
|
|
||||||
for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
|
|
||||||
struct ggml_numa_node * node = &g_state.numa.nodes[n];
|
|
||||||
GGML_PRINT_DEBUG("CPUs on node %u:", n);
|
|
||||||
node->n_cpus = 0;
|
|
||||||
for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
|
|
||||||
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
|
|
||||||
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
|
|
||||||
if (stat(path, &st) == 0) {
|
|
||||||
node->cpus[node->n_cpus++] = c;
|
|
||||||
GGML_PRINT_DEBUG(" %u", c);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
GGML_PRINT_DEBUG("\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ggml_is_numa()) {
|
|
||||||
FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
|
|
||||||
if (fptr != NULL) {
|
|
||||||
char buf[42];
|
|
||||||
if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
|
|
||||||
GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
|
|
||||||
}
|
|
||||||
fclose(fptr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
GGML_UNUSED(numa_flag);
|
|
||||||
// TODO
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ggml_is_numa(void) {
|
|
||||||
return g_state.numa.n_nodes > 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
void ggml_print_object(const struct ggml_object * obj) {
|
void ggml_print_object(const struct ggml_object * obj) {
|
||||||
GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
|
GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
|
||||||
|
@ -3342,12 +3128,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
||||||
|
|
||||||
g_state = (struct ggml_state) {
|
g_state = (struct ggml_state) {
|
||||||
/*.contexts =*/ { { 0 } },
|
/*.contexts =*/ { { 0 } },
|
||||||
/*.numa =*/ {
|
|
||||||
.n_nodes = 0,
|
|
||||||
.total_cpus = 0,
|
|
||||||
},
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
ggml_numa_zero();
|
||||||
|
|
||||||
for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
|
for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
|
||||||
g_state.contexts[i].used = false;
|
g_state.contexts[i].used = false;
|
||||||
}
|
}
|
||||||
|
@ -18893,138 +18677,6 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
|
||||||
memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
|
memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
// thread data
|
|
||||||
//
|
|
||||||
// synchronization is done via busy loops
|
|
||||||
// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops
|
|
||||||
//
|
|
||||||
|
|
||||||
#ifdef __APPLE__
|
|
||||||
|
|
||||||
//#include <os/lock.h>
|
|
||||||
//
|
|
||||||
//typedef os_unfair_lock ggml_lock_t;
|
|
||||||
//
|
|
||||||
//#define ggml_lock_init(x) UNUSED(x)
|
|
||||||
//#define ggml_lock_destroy(x) UNUSED(x)
|
|
||||||
//#define ggml_lock_lock os_unfair_lock_lock
|
|
||||||
//#define ggml_lock_unlock os_unfair_lock_unlock
|
|
||||||
//
|
|
||||||
//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT
|
|
||||||
|
|
||||||
typedef int ggml_lock_t;
|
|
||||||
|
|
||||||
#define ggml_lock_init(x) UNUSED(x)
|
|
||||||
#define ggml_lock_destroy(x) UNUSED(x)
|
|
||||||
#define ggml_lock_lock(x) UNUSED(x)
|
|
||||||
#define ggml_lock_unlock(x) UNUSED(x)
|
|
||||||
|
|
||||||
#define GGML_LOCK_INITIALIZER 0
|
|
||||||
|
|
||||||
#define ggml_thread_create pthread_create
|
|
||||||
#define ggml_thread_join pthread_join
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
//typedef pthread_spinlock_t ggml_lock_t;
|
|
||||||
|
|
||||||
//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE)
|
|
||||||
//#define ggml_lock_destroy pthread_spin_destroy
|
|
||||||
//#define ggml_lock_lock pthread_spin_lock
|
|
||||||
//#define ggml_lock_unlock pthread_spin_unlock
|
|
||||||
|
|
||||||
typedef int ggml_lock_t;
|
|
||||||
|
|
||||||
#define ggml_lock_init(x) UNUSED(x)
|
|
||||||
#define ggml_lock_destroy(x) UNUSED(x)
|
|
||||||
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
|
|
||||||
#define ggml_lock_lock(x) _mm_pause()
|
|
||||||
#else
|
|
||||||
#define ggml_lock_lock(x) UNUSED(x)
|
|
||||||
#endif
|
|
||||||
#define ggml_lock_unlock(x) UNUSED(x)
|
|
||||||
|
|
||||||
#define GGML_LOCK_INITIALIZER 0
|
|
||||||
|
|
||||||
#define ggml_thread_create pthread_create
|
|
||||||
#define ggml_thread_join pthread_join
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Android's libc implementation "bionic" does not support setting affinity
|
|
||||||
#if defined(__gnu_linux__)
|
|
||||||
static void set_numa_thread_affinity(int thread_n) {
|
|
||||||
if (!ggml_is_numa()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
int node_num;
|
|
||||||
int rv;
|
|
||||||
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
|
|
||||||
|
|
||||||
switch(g_state.numa.numa_strategy) {
|
|
||||||
case GGML_NUMA_STRATEGY_DISTRIBUTE:
|
|
||||||
// run thread on node_num thread_n / (threads per node)
|
|
||||||
node_num = thread_n % g_state.numa.n_nodes;
|
|
||||||
break;
|
|
||||||
case GGML_NUMA_STRATEGY_ISOLATE:
|
|
||||||
// run thread on current_node
|
|
||||||
node_num = g_state.numa.current_node;
|
|
||||||
break;
|
|
||||||
case GGML_NUMA_STRATEGY_NUMACTL:
|
|
||||||
// use the cpuset that numactl gave us
|
|
||||||
rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
|
|
||||||
if (rv) {
|
|
||||||
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
default:
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
|
|
||||||
|
|
||||||
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
|
|
||||||
CPU_ZERO_S(setsize, cpus);
|
|
||||||
for (size_t i = 0; i < node->n_cpus; ++i) {
|
|
||||||
CPU_SET_S(node->cpus[i], setsize, cpus);
|
|
||||||
}
|
|
||||||
|
|
||||||
rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
|
|
||||||
if (rv) {
|
|
||||||
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
|
|
||||||
}
|
|
||||||
|
|
||||||
CPU_FREE(cpus);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void clear_numa_thread_affinity(void) {
|
|
||||||
if (!ggml_is_numa()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
|
|
||||||
|
|
||||||
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
|
|
||||||
CPU_ZERO_S(setsize, cpus);
|
|
||||||
for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
|
|
||||||
CPU_SET_S(i, setsize, cpus);
|
|
||||||
}
|
|
||||||
|
|
||||||
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
|
|
||||||
if (rv) {
|
|
||||||
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
|
|
||||||
}
|
|
||||||
|
|
||||||
CPU_FREE(cpus);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
// TODO: Windows etc.
|
|
||||||
// (the linux implementation may also work on BSD, someone should test)
|
|
||||||
static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
|
|
||||||
static void clear_numa_thread_affinity(void) {}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
|
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
|
||||||
int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
|
int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
|
||||||
|
|
20
ggml.h
20
ggml.h
|
@ -689,16 +689,6 @@ extern "C" {
|
||||||
void * wdata;
|
void * wdata;
|
||||||
};
|
};
|
||||||
|
|
||||||
// numa strategies
|
|
||||||
enum ggml_numa_strategy {
|
|
||||||
GGML_NUMA_STRATEGY_DISABLED = 0,
|
|
||||||
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
|
|
||||||
GGML_NUMA_STRATEGY_ISOLATE = 2,
|
|
||||||
GGML_NUMA_STRATEGY_NUMACTL = 3,
|
|
||||||
GGML_NUMA_STRATEGY_MIRROR = 4,
|
|
||||||
GGML_NUMA_STRATEGY_COUNT
|
|
||||||
};
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// GUID
|
// GUID
|
||||||
//
|
//
|
||||||
|
@ -722,6 +712,16 @@ extern "C" {
|
||||||
// accepts a UTF-8 path, even on Windows
|
// accepts a UTF-8 path, even on Windows
|
||||||
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
|
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
|
||||||
|
|
||||||
|
// numa strategies
|
||||||
|
enum ggml_numa_strategy {
|
||||||
|
GGML_NUMA_STRATEGY_DISABLED = 0,
|
||||||
|
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
|
||||||
|
GGML_NUMA_STRATEGY_ISOLATE = 2,
|
||||||
|
GGML_NUMA_STRATEGY_NUMACTL = 3,
|
||||||
|
GGML_NUMA_STRATEGY_MIRROR = 4,
|
||||||
|
GGML_NUMA_STRATEGY_COUNT
|
||||||
|
};
|
||||||
|
|
||||||
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
||||||
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue