Fixed a number of issues with the move from BOOL to ggml_numa_strategies. Added a note about mirror mode not being implemented yet
This commit is contained in:
parent
60b80b0e8a
commit
c43808c625
10 changed files with 37 additions and 24 deletions
|
@ -659,11 +659,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
break;
|
||||
} else {
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "interleave" || value == "" ) { params.numa = LLAMA_NUMA_STRATEGY_INTERLEAVE; }
|
||||
else if (value == "isolate") { params.numa = LLAMA_NUMA_STRATEGY_ISOLATE; }
|
||||
else if (value == "numactl") { params.numa = LLAMA_NUMA_STRATEGY_NUMACTL; }
|
||||
/**/ if (value == "interleave" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_INTERLEAVE; }
|
||||
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
|
||||
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
|
||||
#ifdef GGUF_NUMA_MIRROR
|
||||
else if (value == "mirror") { params.numa = LLAMA_NUMA_STRATEGY_MIRROR; }
|
||||
else if (value == "mirror") { params.numa = GGML_NUMA_STRATEGY_MIRROR; }
|
||||
#endif
|
||||
else { invalid_param = true; break; }
|
||||
}
|
||||
|
@ -1012,7 +1012,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
|
||||
printf(" - numactl: use the CPU map provided my numactl\n");
|
||||
#ifdef GGML_NUMA_MIRROR
|
||||
printf(" - mirror: attempt to mirror GGUF data buffer on each node's local memory to increase throughput.\n");
|
||||
printf(" - mirror: NOT YET IMPLEMENTED - attempt to mirror GGUF data buffer on each node's local memory to increase throughput.\n");
|
||||
#endif
|
||||
printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
|
||||
printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
|
||||
|
|
|
@ -76,7 +76,7 @@ struct gpt_params {
|
|||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
|
||||
int32_t numa = LLAMA_NUMA_STRATEGY_DISABLED;
|
||||
ggml_numa_strategies numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||
|
||||
// // sampling parameters
|
||||
struct llama_sampling_params sparams;
|
||||
|
|
|
@ -1151,7 +1151,7 @@ int main(int argc, char ** argv) {
|
|||
if (!params.verbose) {
|
||||
llama_log_set(llama_null_log_callback, NULL);
|
||||
}
|
||||
bool numa = false;
|
||||
enum ggml_numa_strategies numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||
llama_backend_init(numa);
|
||||
|
||||
// initialize printer
|
||||
|
|
|
@ -274,7 +274,7 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
|
|||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jboolean numa) {
|
||||
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jint32 numa) {
|
||||
llama_backend_init(numa);
|
||||
}
|
||||
|
||||
|
|
|
@ -237,7 +237,7 @@ int main(int argc, char ** argv) {
|
|||
params.imatrix = &imatrix_data;
|
||||
}
|
||||
|
||||
llama_backend_init(false);
|
||||
llama_backend_init(GGML_NUMA_STRATEGY_DISABLED);
|
||||
|
||||
// parse command line arguments
|
||||
const std::string fname_inp = argv[arg_idx];
|
||||
|
|
|
@ -1821,7 +1821,13 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|||
{
|
||||
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
}
|
||||
printf(" --numa attempt optimizations that help on some NUMA systems\n");
|
||||
printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
|
||||
printf(" - interleave: (default) spread execution evenly over all nodes\n");
|
||||
printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
|
||||
printf(" - numactl: use the CPU map provided my numactl\n");
|
||||
#ifdef GGML_NUMA_MIRROR
|
||||
printf(" - mirror: NOT YET IMPLEMENTED - attempt to mirror GGUF data buffer on each node's local memory to increase throughput.\n");
|
||||
#endif
|
||||
if (llama_supports_gpu_offload()) {
|
||||
printf(" -ngl N, --n-gpu-layers N\n");
|
||||
printf(" number of layers to store in VRAM\n");
|
||||
|
@ -2228,9 +2234,25 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
{
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--numa") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
} else {
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "interleave" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_INTERLEAVE; }
|
||||
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
|
||||
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
|
||||
#ifdef GGUF_NUMA_MIRROR
|
||||
else if (value == "mirror") { params.numa = GGML_NUMA_STRATEGY_MIRROR; }
|
||||
#endif
|
||||
else { invalid_param = true; break; }
|
||||
}
|
||||
}
|
||||
|
||||
else if (arg == "--numa")
|
||||
{
|
||||
params.numa = true;
|
||||
params.numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||
}
|
||||
else if (arg == "--embedding")
|
||||
{
|
||||
|
|
|
@ -17,7 +17,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
|
||||
|
||||
llama_backend_init(false);
|
||||
llama_backend_init(GGML_NUMA_STRATEGY_DISABLED);
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
model_params.vocab_only = true;
|
||||
|
|
2
ggml.c
2
ggml.c
|
@ -25,7 +25,7 @@
|
|||
#include <signal.h>
|
||||
|
||||
#ifdef GGML_NUMA_MIRROR
|
||||
#include <numanor.h>
|
||||
#include <numa.h>
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
|
|
|
@ -10327,7 +10327,7 @@ bool llama_mlock_supported(void) {
|
|||
return llama_supports_mlock();
|
||||
}
|
||||
|
||||
void llama_backend_init(uint32_t numa) {
|
||||
void llama_backend_init(enum ggml_numa_strategies numa) {
|
||||
ggml_time_init();
|
||||
|
||||
// needed to initialize f16 tables
|
||||
|
|
11
llama.h
11
llama.h
|
@ -111,15 +111,6 @@ extern "C" {
|
|||
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
|
||||
};
|
||||
|
||||
enum llama_numa_strategies {
|
||||
LLAMA_NUMA_STRATEGY_DISABLED = 0,
|
||||
LLAMA_NUMA_STRATEGY_INTERLEAVE = 1,
|
||||
LLAMA_NUMA_STRATEGY_ISOLATE = 2,
|
||||
LLAMA_NUMA_STRATEGY_NUMACTL = 3,
|
||||
LLAMA_NUMA_STRATEGY_MIRROR = 4,
|
||||
LLAMA_NUMA_STRATEGY_MAX_VALUE = LLAMA_NUMA_STRATEGY_MIRROR,
|
||||
};
|
||||
|
||||
enum llama_split_mode {
|
||||
LLAMA_SPLIT_NONE = 0, // single GPU
|
||||
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
||||
|
@ -313,7 +304,7 @@ extern "C" {
|
|||
// Initialize the llama + ggml backend
|
||||
// If numa is true, use NUMA optimizations
|
||||
// Call once at the start of the program
|
||||
LLAMA_API void llama_backend_init(uint32_t numa);
|
||||
LLAMA_API void llama_backend_init(enum ggml_numa_strategies numa);
|
||||
|
||||
// Call once at the end of the program - currently only used for MPI
|
||||
LLAMA_API void llama_backend_free(void);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue