llama : allow to initialize backend with NUMA support

Georgi Gerganov 2023-06-26 20:24:17 +03:00
parent 8f98035e0a
commit 0fe4b00de2
GPG key ID: 449E073F9DC10735
8 changed files with 30 additions and 19 deletions


@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend();
+    llama_init_backend(params.numa);

     llama_model * model;
     llama_context * ctx;


@@ -5,7 +5,6 @@
 #include "common.h"
 #include "llama.h"
-#include "ggml.h"
 #include "build-info.h"

 #include <cassert>
@@ -106,10 +105,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend();
-    if (params.numa) {
-        ggml_numa_init();
-    }
+    llama_init_backend(params.numa);

     llama_model * model;
     llama_context * ctx;


@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_init_backend();
+    llama_init_backend(params.numa);

     llama_model * model;
     llama_context * ctx;


@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
     }

-    llama_init_backend();
+    llama_init_backend(false);

     // parse command line arguments
     const std::string fname_inp = argv[arg_idx];


@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
     // Init LLM :
     //---------------------------------

-    llama_init_backend();
+    llama_init_backend(params.numa);

     llama_model * model;
     llama_context * ctx;

ggml.c

@@ -3879,14 +3879,12 @@ struct ggml_context_container {
 #define GGML_NUMA_MAX_NODES 8
 #define GGML_NUMA_MAX_CPUS 512

-struct ggml_numa_node
-{
+struct ggml_numa_node {
     uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
     uint32_t n_cpus;
 };

-struct ggml_numa_nodes
-{
+struct ggml_numa_nodes {
     struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
     uint32_t n_nodes;
     uint32_t total_cpus; // hardware threads on system
@@ -3923,13 +3921,18 @@ inline static void ggml_critical_section_end(void) {
     atomic_fetch_sub(&g_state_barrier, 1);
 }

-void ggml_numa_init(void)
-{
-    if (g_state.numa.n_nodes > 0) { return; }
+void ggml_numa_init(void) {
+    if (g_state.numa.n_nodes > 0) {
+        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
+
+        return;
+    }
+
 #ifdef __linux__
     struct stat st;
     char path[256];
     int rv;

     // enumerate nodes
     while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
         rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
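
With the new guard, a repeated call is now reported instead of returning silently as before. A small usage sketch (assumes ggml.h from this commit, which declares ggml_numa_init()):

#include "ggml.h"

int main(void) {
    ggml_numa_init();   // probes /sys and records the NUMA topology (Linux only)
    ggml_numa_init();   // if the first call found any node, this prints
                        // "ggml_numa_init: NUMA already initialized" to stderr
    return 0;
}
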
@@ -3937,6 +3940,7 @@ void ggml_numa_init(void)
         if (stat(path, &st) != 0) { break; }
         ++g_state.numa.n_nodes;
     }
+
     // enumerate CPUs
     while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
         rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
@@ -3944,11 +3948,14 @@ void ggml_numa_init(void)
         if (stat(path, &st) != 0) { break; }
         ++g_state.numa.total_cpus;
     }
+
     GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
+
     if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
         g_state.numa.n_nodes = 0;
         return;
     }
+
     for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
         struct ggml_numa_node * node = &g_state.numa.nodes[n];
         GGML_PRINT_DEBUG("CPUs on node %u:", n);
@@ -3963,6 +3970,7 @@ void ggml_numa_init(void)
         }
         GGML_PRINT_DEBUG("\n");
     }
+
     if (ggml_is_numa()) {
         FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
         if (fptr != NULL) {
@@ -3978,7 +3986,9 @@ void ggml_numa_init(void)
 #endif
 }

-bool ggml_is_numa(void) { return g_state.numa.n_nodes > 1; }
+bool ggml_is_numa(void) {
+    return g_state.numa.n_nodes > 1;
+}

 ////////////////////////////////////////////////////////////////////////////////
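
For context, ggml_numa_init() detects the topology purely by stat()-ing sysfs entries, and ggml_is_numa() just checks whether more than one node was found. A standalone sketch of the same probing logic (not the ggml code itself; Linux only, caps mirror GGML_NUMA_MAX_NODES / GGML_NUMA_MAX_CPUS):

// Standalone sketch of the /sys probing that ggml_numa_init() performs;
// handy for checking what ggml will detect on a given machine.
#include <stdio.h>
#include <sys/stat.h>

int main(void) {
    struct stat st;
    char path[256];

    unsigned n_nodes = 0;
    while (n_nodes < 8) {           // GGML_NUMA_MAX_NODES
        snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", n_nodes);
        if (stat(path, &st) != 0) { break; }   // node directory missing -> done
        ++n_nodes;
    }

    unsigned total_cpus = 0;
    while (total_cpus < 512) {      // GGML_NUMA_MAX_CPUS
        snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", total_cpus);
        if (stat(path, &st) != 0) { break; }
        ++total_cpus;
    }

    printf("found %u numa nodes, %u CPUs\n", n_nodes, total_cpus);
    printf("ggml_is_numa() would return %s\n", n_nodes > 1 ? "true" : "false");
    return 0;
}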


@@ -977,7 +977,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }

-void llama_init_backend() {
+void llama_init_backend(bool numa) {
     ggml_time_init();

     // needed to initialize f16 tables
@@ -986,6 +986,10 @@ void llama_init_backend() {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
+
+    if (numa) {
+        ggml_numa_init();
+    }
 }

 int64_t llama_time_us() {
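
Putting the two llama.cpp hunks together, the backend init now reads roughly as follows (reconstructed from the diff context above; the f16-table block is unchanged and elided):

void llama_init_backend(bool numa) {
    ggml_time_init();

    // needed to initialize f16 tables
    {
        /* ... unchanged ggml_init()/ggml_free() round-trip ... */
    }

    if (numa) {
        ggml_numa_init();
    }
}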


@@ -140,8 +140,9 @@
     // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
+    // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_init_backend();
+    LLAMA_API void llama_init_backend(bool numa);

     LLAMA_API int64_t llama_time_us();
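
A minimal sketch of a program written against the updated header (hypothetical standalone caller; the --numa flag is a stand-in for however the application exposes the choice):

// Hypothetical caller of the new API; only llama_init_backend() is from llama.h.
#include "llama.h"

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

int main(int argc, char ** argv) {
    bool numa = false;
    for (int i = 1; i < argc; ++i) {
        if (strcmp(argv[i], "--numa") == 0) {
            numa = true;   // opt in to NUMA optimizations
        }
    }

    // Call once at the start of the program (see the header comment above).
    llama_init_backend(numa);

    fprintf(stderr, "backend initialized (numa=%d)\n", numa);

    // ... load model, create context, run inference ...
    return 0;
}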