From 2f5bb462fd8179f484e6b04f119116f0a7bce8c4 Mon Sep 17 00:00:00 2001
From: zrm <trustiosity.zrm@gmail.com>
Date: Sun, 18 Jun 2023 11:59:27 -0400
Subject: [PATCH] move numa state to g_state

---
 ggml.c       | 176 ++++++++++++++++++++++++++-------------------------
 llama-util.h |   5 +-
 2 files changed, 91 insertions(+), 90 deletions(-)

diff --git a/ggml.c b/ggml.c
index 4fe0707c8..b76dc5e9a 100644
--- a/ggml.c
+++ b/ggml.c
@@ -480,86 +480,6 @@ int64_t ggml_cycles_per_ms(void) {
 #define ggml_perf_cycles_per_ms() 0
 #endif
 
-//
-// NUMA support
-//
-
-#define GGML_NUMA_MAX_NODES 8
-#define GGML_NUMA_MAX_CPUS 512
-
-struct ggml_numa_node
-{
-    uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
-    uint32_t n_cpus;
-};
-
-struct ggml_numa_nodes
-{
-    struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
-    uint32_t n_nodes;
-    uint32_t total_cpus; // hardware threads on system
-};
-
-struct ggml_numa_nodes ggml_numa = {
-    .n_nodes = 0,
-    .total_cpus = 0,
-};
-
-void ggml_numa_init(void)
-{
-    if (ggml_numa.n_nodes > 0) { return; }
-#ifdef __linux__
-    struct stat st;
-    char path[256];
-    int rv;
-    // enumerate nodes
-    while (ggml_numa.n_nodes < GGML_NUMA_MAX_NODES) {
-        rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", ggml_numa.n_nodes);
-        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
-        if (stat(path, &st) != 0) { break; }
-        ++ggml_numa.n_nodes;
-    }
-    // enumerate CPUs
-    while (ggml_numa.total_cpus < GGML_NUMA_MAX_CPUS) {
-        rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", ggml_numa.total_cpus);
-        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
-        if (stat(path, &st) != 0) { break; }
-        ++ggml_numa.total_cpus;
-    }
-    GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", ggml_numa.n_nodes, ggml_numa.total_cpus);
-    if (ggml_numa.n_nodes < 1 || ggml_numa.total_cpus < 1) {
-        ggml_numa.n_nodes = 0;
-        return;
-    }
-    for (uint32_t n = 0; n < ggml_numa.n_nodes; ++n) {
-        struct ggml_numa_node *node = &ggml_numa.nodes[n];
-        GGML_PRINT_DEBUG("CPUs on node %u:", n);
-        for (uint32_t c = 0; c < ggml_numa.total_cpus; ++c) {
-            rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
-            GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
-            if (stat(path, &st) == 0) {
-                node->cpus[node->n_cpus++] = c;
-                GGML_PRINT_DEBUG(" %u", c);
-            }
-        }
-        GGML_PRINT_DEBUG("\n");
-    }
-    if (ggml_is_numa()) {
-        FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
-        if (fptr != NULL) {
-            char buf[42];
-            if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
-                GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
-            }
-            fclose(fptr);
-        }
-    }
-#else
-    // TODO
-#endif
-}
-
-bool ggml_is_numa(void) { return ggml_numa.n_nodes > 1; }
 
 //
 // cache line
@@ -3750,12 +3670,33 @@ struct ggml_compute_params {
     void * wdata;
 };
 
+//
+// NUMA support
+//
+
+#define GGML_NUMA_MAX_NODES 8
+#define GGML_NUMA_MAX_CPUS 512
+
+struct ggml_numa_node
+{
+    uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
+    uint32_t n_cpus;
+};
+
+struct ggml_numa_nodes
+{
+    struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
+    uint32_t n_nodes;
+    uint32_t total_cpus; // hardware threads on system
+};
+
 //
 // ggml state
 //
 
 struct ggml_state {
     struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
+    struct ggml_numa_nodes numa;
 };
 
 // global state
@@ -3780,6 +3721,63 @@ inline static void ggml_critical_section_end(void) {
     atomic_fetch_sub(&g_state_barrier, 1);
 }
 
+void ggml_numa_init(void)
+{
+    if (g_state.numa.n_nodes > 0) { return; }
+#ifdef __linux__
+    struct stat st;
+    char path[256];
+    int rv;
+    // enumerate nodes
+    while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
+        rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
+        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+        if (stat(path, &st) != 0) { break; }
+        ++g_state.numa.n_nodes;
+    }
+    // enumerate CPUs
+    while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
+        rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
+        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+        if (stat(path, &st) != 0) { break; }
+        ++g_state.numa.total_cpus;
+    }
+    GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
+    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
+        g_state.numa.n_nodes = 0;
+        return;
+    }
+    for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
+        struct ggml_numa_node * node = &g_state.numa.nodes[n];
+        GGML_PRINT_DEBUG("CPUs on node %u:", n);
+        node->n_cpus = 0;
+        for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
+            rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
+            GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+            if (stat(path, &st) == 0) {
+                node->cpus[node->n_cpus++] = c;
+                GGML_PRINT_DEBUG(" %u", c);
+            }
+        }
+        GGML_PRINT_DEBUG("\n");
+    }
+    if (ggml_is_numa()) {
+        FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
+        if (fptr != NULL) {
+            char buf[42];
+            if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
+                GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
+            }
+            fclose(fptr);
+        }
+    }
+#else
+    // TODO
+#endif
+}
+
+bool ggml_is_numa(void) { return g_state.numa.n_nodes > 1; }
+
 ////////////////////////////////////////////////////////////////////////////////
 
 void ggml_print_object(const struct ggml_object * obj) {
@@ -3995,6 +3993,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
             g_state = (struct ggml_state) {
                 /*.contexts =*/ { { 0 } },
+                /*.numa =*/ {
+                    .n_nodes = 0,
+                    .total_cpus = 0,
+                },
             };
 
             for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
@@ -14056,10 +14058,10 @@ void set_numa_thread_affinity(int thread_n, int n_threads)
 {
     if (!ggml_is_numa()) { return; }
     // run thread on node_num thread_n / (threads per node)
-    int node_num = thread_n / ((n_threads + ggml_numa.n_nodes - 1) / ggml_numa.n_nodes);
-    struct ggml_numa_node *node = &ggml_numa.nodes[node_num];
-    size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus);
-    cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus);
+    int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
+    struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
+    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
+    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
     CPU_ZERO_S(setsize, cpus);
     for (size_t i = 0; i < node->n_cpus; ++i) {
         CPU_SET_S(node->cpus[i], setsize, cpus);
@@ -14074,10 +14076,10 @@ void set_numa_thread_affinity(int thread_n, int n_threads)
 void clear_numa_thread_affinity(void)
 {
     if (!ggml_is_numa()) { return; }
-    size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus);
-    cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus);
+    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
+    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
     CPU_ZERO_S(setsize, cpus);
-    for (unsigned i = 0; i < ggml_numa.total_cpus; ++i) {
+    for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
         CPU_SET_S(i, setsize, cpus);
     }
     int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
diff --git a/llama-util.h b/llama-util.h
index bb7155036..3d47be073 100644
--- a/llama-util.h
+++ b/llama-util.h
@@ -16,6 +16,8 @@
 #include <vector>
 #include <stdexcept>
 
+#include "ggml.h"
+
 #ifdef __has_include
     #if __has_include(<unistd.h>)
         #include <unistd.h>
@@ -163,9 +165,6 @@ static std::string llama_format_win_err(DWORD err) {
 }
 #endif
 
-extern "C" {
-bool ggml_is_numa();
-}
 struct llama_mmap {
     void * addr;
     size_t size;