llama : allow to initialize backend with NUMA support
parent 8f98035e0a
commit 0fe4b00de2

8 changed files with 30 additions and 19 deletions
@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
@@ -5,7 +5,6 @@
 
 #include "common.h"
 #include "llama.h"
-#include "ggml.h"
 #include "build-info.h"
 
 #include <cassert>
@@ -106,10 +105,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
-    if (params.numa) {
-        ggml_numa_init();
-    }
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
     }
 
-    llama_init_backend();
+    llama_init_backend(false);
 
     // parse command line arguments
     const std::string fname_inp = argv[arg_idx];
@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
     // Init LLM :
     //---------------------------------
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
ggml.c (26 changed lines)
@@ -3879,14 +3879,12 @@ struct ggml_context_container {
 #define GGML_NUMA_MAX_NODES 8
 #define GGML_NUMA_MAX_CPUS 512
 
-struct ggml_numa_node
-{
+struct ggml_numa_node {
     uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
     uint32_t n_cpus;
 };
 
-struct ggml_numa_nodes
-{
+struct ggml_numa_nodes {
     struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
     uint32_t n_nodes;
     uint32_t total_cpus; // hardware threads on system
@@ -3923,13 +3921,18 @@ inline static void ggml_critical_section_end(void) {
     atomic_fetch_sub(&g_state_barrier, 1);
 }
 
-void ggml_numa_init(void)
-{
-    if (g_state.numa.n_nodes > 0) { return; }
+void ggml_numa_init(void) {
+    if (g_state.numa.n_nodes > 0) {
+        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
+
+        return;
+    }
+
 #ifdef __linux__
     struct stat st;
     char path[256];
     int rv;
+
     // enumerate nodes
     while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
         rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
@@ -3937,6 +3940,7 @@ void ggml_numa_init(void)
         if (stat(path, &st) != 0) { break; }
         ++g_state.numa.n_nodes;
     }
+
     // enumerate CPUs
     while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
         rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
@@ -3944,11 +3948,14 @@ void ggml_numa_init(void)
         if (stat(path, &st) != 0) { break; }
         ++g_state.numa.total_cpus;
     }
+
     GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
+
     if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
         g_state.numa.n_nodes = 0;
         return;
     }
+
     for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
         struct ggml_numa_node * node = &g_state.numa.nodes[n];
         GGML_PRINT_DEBUG("CPUs on node %u:", n);
@@ -3963,6 +3970,7 @@ void ggml_numa_init(void)
         }
         GGML_PRINT_DEBUG("\n");
     }
+
     if (ggml_is_numa()) {
         FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
         if (fptr != NULL) {
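Between this hunk and the next, the unchanged context (not shown in the diff) reads the value from /proc/sys/kernel/numa_balancing and warns when the kernel's automatic NUMA balancing is enabled, since kernel-driven page migration can work against ggml's explicit per-node thread placement. A minimal sketch of that kind of check, assuming standard C stdio; this is illustrative, not copied from the commit:

// hypothetical sketch of the numa_balancing check, not verbatim from ggml.c
FILE * fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
if (fptr != NULL) {
    char buf[16];
    // the pseudo-file contains "0" when balancing is disabled, non-zero otherwise
    if (fgets(buf, sizeof(buf), fptr) != NULL && buf[0] != '0') {
        GGML_PRINT_DEBUG("numa_balancing is enabled, performance may be degraded\n");
    }
    fclose(fptr);
}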
@@ -3978,7 +3986,9 @@ void ggml_numa_init(void)
 #endif
 }
 
-bool ggml_is_numa(void) { return g_state.numa.n_nodes > 1; }
+bool ggml_is_numa(void) {
+    return g_state.numa.n_nodes > 1;
+}
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -977,7 +977,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
-void llama_init_backend() {
+void llama_init_backend(bool numa) {
     ggml_time_init();
 
     // needed to initialize f16 tables
@@ -986,6 +986,10 @@ void llama_init_backend() {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
+
+    if (numa) {
+        ggml_numa_init();
+    }
 }
 
 int64_t llama_time_us() {
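Taken together, the two hunks above leave llama_init_backend looking roughly like this. This is a sketch assembled from the diff; the ggml_init_params line is assumed unchanged context that the hunks do not show:

void llama_init_backend(bool numa) {
    ggml_time_init();

    // needed to initialize f16 tables
    {
        struct ggml_init_params params = { 0, NULL, false }; // assumed context, not shown in the diff
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }

    if (numa) {
        ggml_numa_init(); // only touch NUMA state when the caller asks for it
    }
}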
llama.h (3 changed lines)
@@ -140,8 +140,9 @@ extern "C" {
 
     // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
+    // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_init_backend();
+    LLAMA_API void llama_init_backend(bool numa);
 
     LLAMA_API int64_t llama_time_us();
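As the example hunks at the top show, each program now passes its own NUMA preference through this single entry point. A minimal caller sketch, assuming the examples' common.h helpers (gpt_params, gpt_params_parse) and the params.numa flag used above:

#include "common.h"
#include "llama.h"

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    // must be called once, before any model is loaded
    llama_init_backend(params.numa);

    // ... load the model, create a context, run inference ...
    return 0;
}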