From 0fe4b00de249194c134b72fd7a89c0550c4e84b7 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 26 Jun 2023 20:24:17 +0300
Subject: [PATCH] llama : allow to initialize backend with NUMA support

---
 examples/embedding/embedding.cpp   |  2 +-
 examples/main/main.cpp             |  6 +-----
 examples/perplexity/perplexity.cpp |  2 +-
 examples/quantize/quantize.cpp     |  2 +-
 examples/simple/simple.cpp         |  2 +-
 ggml.c                             | 26 ++++++++++++++++++--------
 llama.cpp                          |  6 +++++-
 llama.h                            |  3 ++-
 8 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 369eac1d1..3cd5bb794 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index c7193627a..bcdc98d61 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -5,7 +5,6 @@
 
 #include "common.h"
 #include "llama.h"
-#include "ggml.h"
 #include "build-info.h"
 
 #include <cassert>
@@ -106,10 +105,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
-    if (params.numa) {
-        ggml_numa_init();
-    }
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index b59f5971e..f8a6cb516 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 4e8e6f523..1eb0f75d6 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
     }
 
-    llama_init_backend();
+    llama_init_backend(false);
 
     // parse command line arguments
     const std::string fname_inp = argv[arg_idx];
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index fc45c9340..2d913cebb 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
     // Init LLM :
     //---------------------------------
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
diff --git a/ggml.c b/ggml.c
index 7ff6254c5..df8370960 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3879,14 +3879,12 @@ struct ggml_context_container {
 #define GGML_NUMA_MAX_NODES 8
 #define GGML_NUMA_MAX_CPUS 512
 
-struct ggml_numa_node
-{
+struct ggml_numa_node {
     uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
     uint32_t n_cpus;
 };
 
-struct ggml_numa_nodes
-{
+struct ggml_numa_nodes {
     struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
     uint32_t n_nodes;
     uint32_t total_cpus; // hardware threads on system
@@ -3923,13 +3921,18 @@ inline static void ggml_critical_section_end(void) {
     atomic_fetch_sub(&g_state_barrier, 1);
 }
 
-void ggml_numa_init(void)
-{
-    if (g_state.numa.n_nodes > 0) { return; }
+void ggml_numa_init(void) {
+    if (g_state.numa.n_nodes > 0) {
+        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
+
+        return;
+    }
+
 #ifdef __linux__
     struct stat st;
     char path[256];
     int rv;
+
     // enumerate nodes
     while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
         rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
@@ -3937,6 +3940,7 @@ void ggml_numa_init(void)
         if (stat(path, &st) != 0) { break; }
         ++g_state.numa.n_nodes;
     }
+
     // enumerate CPUs
     while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
         rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
@@ -3944,11 +3948,14 @@
         if (stat(path, &st) != 0) { break; }
         ++g_state.numa.total_cpus;
     }
+
     GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
+
     if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
         g_state.numa.n_nodes = 0;
         return;
     }
+
     for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
         struct ggml_numa_node * node = &g_state.numa.nodes[n];
         GGML_PRINT_DEBUG("CPUs on node %u:", n);
@@ -3963,6 +3970,7 @@
         }
         GGML_PRINT_DEBUG("\n");
     }
+
     if (ggml_is_numa()) {
         FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
         if (fptr != NULL) {
@@ -3978,7 +3986,9 @@
 #endif
 }
 
-bool ggml_is_numa(void) { return g_state.numa.n_nodes > 1; }
+bool ggml_is_numa(void) {
+    return g_state.numa.n_nodes > 1;
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 
diff --git a/llama.cpp b/llama.cpp
index c41c2a8a3..e932636fc 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -977,7 +977,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
-void llama_init_backend() {
+void llama_init_backend(bool numa) {
     ggml_time_init();
 
     // needed to initialize f16 tables
@@ -986,6 +986,10 @@
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
+
+    if (numa) {
+        ggml_numa_init();
+    }
 }
 
 int64_t llama_time_us() {
diff --git a/llama.h b/llama.h
index a833a7f4d..76239be25 100644
--- a/llama.h
+++ b/llama.h
@@ -140,8 +140,9 @@ extern "C" {
 
     // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
+    // If numa is true, use NUMA optimizations
    // Call once at the start of the program
-    LLAMA_API void llama_init_backend();
+    LLAMA_API void llama_init_backend(bool numa);
 
     LLAMA_API int64_t llama_time_us();
 
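
Usage note (not part of the patch): a minimal caller-side sketch of the updated API under the new signature. The "--numa" command-line flag below is a hypothetical stand-in for the params.numa option that the examples forward.

    // usage_sketch.cpp -- illustrative only, not included in this patch
    #include <string>

    #include "llama.h"

    int main(int argc, char ** argv) {
        // hypothetical flag: enable NUMA optimizations when "--numa" is passed,
        // mirroring how the examples forward params.numa
        bool numa = false;
        for (int i = 1; i < argc; ++i) {
            if (std::string(argv[i]) == "--numa") {
                numa = true;
            }
        }

        // call once at the start of the program; with numa == true this now
        // also runs ggml_numa_init() inside llama_init_backend()
        llama_init_backend(numa);

        // ... load a model, create a context, run inference ...

        return 0;
    }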