llama : allow to initialize backend with NUMA support
parent 8f98035e0a
commit 0fe4b00de2

8 changed files with 30 additions and 19 deletions
@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
@@ -5,7 +5,6 @@
 
 #include "common.h"
 #include "llama.h"
-#include "ggml.h"
 #include "build-info.h"
 
 #include <cassert>
@@ -106,10 +105,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
-    if (params.numa) {
-        ggml_numa_init();
-    }
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
     }
 
-    llama_init_backend();
+    llama_init_backend(false);
 
     // parse command line arguments
     const std::string fname_inp = argv[arg_idx];
@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
     // Init LLM :
     //---------------------------------
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
ggml.c (26 changed lines)
@@ -3879,14 +3879,12 @@ struct ggml_context_container {
 #define GGML_NUMA_MAX_NODES 8
 #define GGML_NUMA_MAX_CPUS 512
 
-struct ggml_numa_node
-{
+struct ggml_numa_node {
     uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
     uint32_t n_cpus;
 };
 
-struct ggml_numa_nodes
-{
+struct ggml_numa_nodes {
     struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
     uint32_t n_nodes;
     uint32_t total_cpus; // hardware threads on system
@@ -3923,13 +3921,18 @@ inline static void ggml_critical_section_end(void) {
     atomic_fetch_sub(&g_state_barrier, 1);
 }
 
-void ggml_numa_init(void)
-{
-    if (g_state.numa.n_nodes > 0) { return; }
+void ggml_numa_init(void) {
+    if (g_state.numa.n_nodes > 0) {
+        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
+
+        return;
+    }
+
 #ifdef __linux__
     struct stat st;
     char path[256];
     int rv;
+
     // enumerate nodes
     while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
         rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
@@ -3937,6 +3940,7 @@ void ggml_numa_init(void)
         if (stat(path, &st) != 0) { break; }
         ++g_state.numa.n_nodes;
     }
+
     // enumerate CPUs
     while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
         rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
@@ -3944,11 +3948,14 @@ void ggml_numa_init(void)
         if (stat(path, &st) != 0) { break; }
         ++g_state.numa.total_cpus;
     }
+
     GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
+
     if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
         g_state.numa.n_nodes = 0;
         return;
     }
+
     for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
         struct ggml_numa_node * node = &g_state.numa.nodes[n];
         GGML_PRINT_DEBUG("CPUs on node %u:", n);
@@ -3963,6 +3970,7 @@ void ggml_numa_init(void)
         }
         GGML_PRINT_DEBUG("\n");
     }
+
     if (ggml_is_numa()) {
         FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
         if (fptr != NULL) {
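Between this hunk and the next, the unchanged context (not shown in the diff) reads the value from /proc/sys/kernel/numa_balancing and warns when the kernel's automatic NUMA balancing is enabled, since kernel-driven page migration can work against ggml's explicit per-node thread placement. A minimal sketch of that kind of check, assuming standard C stdio; this is illustrative, not copied from the commit:

// hypothetical sketch of the numa_balancing check, not verbatim from ggml.c
FILE * fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
if (fptr != NULL) {
    char buf[16];
    // the pseudo-file contains "0" when balancing is disabled, non-zero otherwise
    if (fgets(buf, sizeof(buf), fptr) != NULL && buf[0] != '0') {
        GGML_PRINT_DEBUG("numa_balancing is enabled, performance may be degraded\n");
    }
    fclose(fptr);
}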
@@ -3978,7 +3986,9 @@ void ggml_numa_init(void)
 #endif
 }
 
-bool ggml_is_numa(void) { return g_state.numa.n_nodes > 1; }
+bool ggml_is_numa(void) {
+    return g_state.numa.n_nodes > 1;
+}
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -977,7 +977,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
-void llama_init_backend() {
+void llama_init_backend(bool numa) {
     ggml_time_init();
 
     // needed to initialize f16 tables
@@ -986,6 +986,10 @@ void llama_init_backend() {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
+
+    if (numa) {
+        ggml_numa_init();
+    }
 }
 
 int64_t llama_time_us() {
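Taken together, the two hunks above leave llama_init_backend looking roughly like this. This is a sketch assembled from the diff; the ggml_init_params line is assumed unchanged context that the hunks do not show:

void llama_init_backend(bool numa) {
    ggml_time_init();

    // needed to initialize f16 tables
    {
        struct ggml_init_params params = { 0, NULL, false }; // assumed context, not shown in the diff
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }

    if (numa) {
        ggml_numa_init(); // only touch NUMA state when the caller asks for it
    }
}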
llama.h (3 changed lines)
@@ -140,8 +140,9 @@ extern "C" {
 
     // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
+    // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_init_backend();
+    LLAMA_API void llama_init_backend(bool numa);
 
     LLAMA_API int64_t llama_time_us();
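As the example hunks at the top show, each program now passes its own NUMA preference through this single entry point. A minimal caller sketch, assuming the examples' common.h helpers (gpt_params, gpt_params_parse) and the params.numa flag used above:

#include "common.h"
#include "llama.h"

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    // must be called once, before any model is loaded
    llama_init_backend(params.numa);

    // ... load the model, create a context, run inference ...
    return 0;
}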