From 0fe4b00de249194c134b72fd7a89c0550c4e84b7 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 26 Jun 2023 20:24:17 +0300
Subject: [PATCH] llama : allow to initialize backend with NUMA support

---
 examples/embedding/embedding.cpp   |  2 +-
 examples/main/main.cpp             |  6 +-----
 examples/perplexity/perplexity.cpp |  2 +-
 examples/quantize/quantize.cpp     |  2 +-
 examples/simple/simple.cpp         |  2 +-
 ggml.c                             | 26 ++++++++++++++++++--------
 llama.cpp                          |  6 +++++-
 llama.h                            |  3 ++-
 8 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 369eac1d1..3cd5bb794 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index c7193627a..bcdc98d61 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -5,7 +5,6 @@
 
 #include "common.h"
 #include "llama.h"
-#include "ggml.h"
 #include "build-info.h"
 
 #include <cassert>
@@ -106,10 +105,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
-    if (params.numa) {
-        ggml_numa_init();
-    }
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index b59f5971e..f8a6cb516 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 4e8e6f523..1eb0f75d6 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
     }
 
-    llama_init_backend();
+    llama_init_backend(false);
 
     // parse command line arguments
     const std::string fname_inp = argv[arg_idx];
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index fc45c9340..2d913cebb 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
     // Init LLM :
     //---------------------------------
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
diff --git a/ggml.c b/ggml.c
index 7ff6254c5..df8370960 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3879,14 +3879,12 @@ struct ggml_context_container {
 #define GGML_NUMA_MAX_NODES 8
 #define GGML_NUMA_MAX_CPUS 512
 
-struct ggml_numa_node
-{
+struct ggml_numa_node {
     uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
     uint32_t n_cpus;
 };
 
-struct ggml_numa_nodes
-{
+struct ggml_numa_nodes {
     struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
     uint32_t n_nodes;
     uint32_t total_cpus; // hardware threads on system
@@ -3923,13 +3921,18 @@ inline static void ggml_critical_section_end(void) {
     atomic_fetch_sub(&g_state_barrier, 1);
 }
 
-void ggml_numa_init(void)
-{
-    if (g_state.numa.n_nodes > 0) { return; }
+void ggml_numa_init(void) {
+    if (g_state.numa.n_nodes > 0) {
+        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
+
+        return;
+    }
+
 #ifdef __linux__
     struct stat st;
     char path[256];
     int rv;
+
     // enumerate nodes
     while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
         rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
@@ -3937,6 +3940,7 @@ void ggml_numa_init(void)
         if (stat(path, &st) != 0) { break; }
         ++g_state.numa.n_nodes;
     }
+
     // enumerate CPUs
     while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
         rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
@@ -3944,11 +3948,14 @@
         if (stat(path, &st) != 0) { break; }
         ++g_state.numa.total_cpus;
     }
+
     GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
+
     if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
         g_state.numa.n_nodes = 0;
         return;
     }
+
     for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
         struct ggml_numa_node * node = &g_state.numa.nodes[n];
         GGML_PRINT_DEBUG("CPUs on node %u:", n);
@@ -3963,6 +3970,7 @@
         }
         GGML_PRINT_DEBUG("\n");
     }
+
     if (ggml_is_numa()) {
         FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
         if (fptr != NULL) {
@@ -3978,7 +3986,9 @@
 #endif
 }
 
-bool ggml_is_numa(void) { return g_state.numa.n_nodes > 1; }
+bool ggml_is_numa(void) {
+    return g_state.numa.n_nodes > 1;
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 
diff --git a/llama.cpp b/llama.cpp
index c41c2a8a3..e932636fc 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -977,7 +977,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
-void llama_init_backend() {
+void llama_init_backend(bool numa) {
     ggml_time_init();
 
     // needed to initialize f16 tables
@@ -986,6 +986,10 @@
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
+
+    if (numa) {
+        ggml_numa_init();
+    }
 }
 
 int64_t llama_time_us() {
diff --git a/llama.h b/llama.h
index a833a7f4d..76239be25 100644
--- a/llama.h
+++ b/llama.h
@@ -140,8 +140,9 @@ extern "C" {
 
     // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
+    // If numa is true, use NUMA optimizations
    // Call once at the start of the program
-    LLAMA_API void llama_init_backend();
+    LLAMA_API void llama_init_backend(bool numa);
 
     LLAMA_API int64_t llama_time_us();
 
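
Usage note (not part of the patch): a minimal caller-side sketch of the updated API under the new signature. The "--numa" command-line flag below is a hypothetical stand-in for the params.numa option that the examples forward.

    // usage_sketch.cpp -- illustrative only, not included in this patch
    #include <string>

    #include "llama.h"

    int main(int argc, char ** argv) {
        // hypothetical flag: enable NUMA optimizations when "--numa" is passed,
        // mirroring how the examples forward params.numa
        bool numa = false;
        for (int i = 1; i < argc; ++i) {
            if (std::string(argv[i]) == "--numa") {
                numa = true;
            }
        }

        // call once at the start of the program; with numa == true this now
        // also runs ggml_numa_init() inside llama_init_backend()
        llama_init_backend(numa);

        // ... load a model, create a context, run inference ...

        return 0;
    }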