diff --git a/examples/common.cpp b/examples/common.cpp
index 1308f8410..aad21898d 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -288,6 +288,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
             params.mem_test = true;
+        } else if (arg == "--numa") {
+            params.numa = true;
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -421,6 +423,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_mmap_supported()) {
         fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+    fprintf(stderr, "  --numa                attempt optimizations that help on some NUMA systems\n");
+    fprintf(stderr, "                        if run without this previously, it is recommended to drop the system page cache before using this\n");
+    fprintf(stderr, "                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, "                        number of layers to store in VRAM\n");
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
diff --git a/examples/common.h b/examples/common.h
index 2b66382a6..9d74bd7b8 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -70,6 +70,7 @@ struct gpt_params {
     bool use_mmap          = true;  // use mmap for faster loads
     bool use_mlock         = false; // use mlock to keep model in memory
     bool mem_test          = false; // compute maximum memory usage
+    bool numa              = false; // attempt optimizations that help on some NUMA systems
     bool verbose_prompt    = false; // print prompt tokens before generation
 };
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 47b418d97..100e9d65b 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -5,6 +5,7 @@
 
 #include "common.h"
 #include "llama.h"
+#include "ggml.h"
 #include "build-info.h"
 
 #include <cassert>
@@ -97,6 +98,9 @@ int main(int argc, char ** argv) {
     }
 
     llama_init_backend();
+    if (params.numa) {
+        ggml_numa_init();
+    }
 
     llama_context * ctx;
     g_ctx = &ctx;
diff --git a/llama.cpp b/llama.cpp
index 468f96cc1..4cbc8d6b6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -851,7 +851,6 @@ bool llama_mlock_supported() {
 
 void llama_init_backend() {
     ggml_time_init();
-    ggml_numa_init();
 
     // needed to initialize f16 tables
     {