make --numa a param

zrm 2023-06-17 15:03:14 -04:00
parent 8502d5178e
commit bf83dcb279
4 changed files with 10 additions and 1 deletion

View file

@@ -288,6 +288,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
             params.mem_test = true;
+        } else if (arg == "--numa") {
+            params.numa = true;
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -421,6 +423,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_mmap_supported()) {
         fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+    fprintf(stderr, " --numa attempt optimizations that help on some NUMA systems\n");
+    fprintf(stderr, " if run without this previously, it is recommended to drop the system page cache before using this\n");
+    fprintf(stderr, " see https://github.com/ggerganov/llama.cpp/issues/1437\n");
     fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, " number of layers to store in VRAM\n");
     fprintf(stderr, " --mtest compute maximum memory usage\n");
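
Taken together, the two hunks above teach gpt_params_parse a --numa flag and document it in the usage text. Purely as an illustration of that parsing pattern, here is a self-contained sketch; params_sketch is a stand-in for the real gpt_params so the snippet compiles on its own, everything else mirrors the branch added above:

#include <cstdio>
#include <cstring>

// Stand-in for gpt_params; only the field this commit adds.
struct params_sketch {
    bool numa = false;
};

int main(int argc, char ** argv) {
    params_sketch params;
    for (int i = 1; i < argc; i++) {
        // Mirrors the new "--numa" branch added to gpt_params_parse above.
        if (std::strcmp(argv[i], "--numa") == 0) {
            params.numa = true;
        }
    }
    std::fprintf(stderr, "numa optimizations: %s\n", params.numa ? "on" : "off");
    return 0;
}

Running the sketch with --numa on the command line prints "numa optimizations: on"; without the flag it stays off, matching the new default in gpt_params.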

View file

@@ -70,6 +70,7 @@ struct gpt_params {
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool mem_test = false; // compute maximum memory usage
+    bool numa = false; // attempt optimizations that help on some NUMA systems
     bool verbose_prompt = false; // print prompt tokens before generation
 };

View file

@@ -5,6 +5,7 @@
 #include "common.h"
 #include "llama.h"
+#include "ggml.h"
 #include "build-info.h"
 #include <cassert>
@@ -97,6 +98,9 @@ int main(int argc, char ** argv) {
     }
     llama_init_backend();
+    if (params.numa) {
+        ggml_numa_init();
+    }
     llama_context * ctx;
     g_ctx = &ctx;
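
With this change NUMA setup becomes opt-in: main() calls ggml_numa_init() only when the new flag is set, and llama_init_backend() stops doing it unconditionally (see the last hunk below). A condensed sketch of the resulting start-up order, assuming the common.h / llama.h / ggml.h headers from this revision are on the include path; everything not shown in the hunk above is elided:

#include "common.h"
#include "llama.h"
#include "ggml.h"

int main(int argc, char ** argv) {
    gpt_params params;
    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    llama_init_backend();   // backend init, now without any NUMA work
    if (params.numa) {
        ggml_numa_init();   // only touch NUMA when the user asked for it
    }

    // ... model loading and generation continue as before ...
    return 0;
}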

View file

@@ -851,7 +851,6 @@ bool llama_mlock_supported() {
 void llama_init_backend() {
     ggml_time_init();
-    ggml_numa_init();
     // needed to initialize f16 tables
     {