diff --git a/common/common.cpp b/common/common.cpp
index 55fec9211..0f5fc11a7 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -659,11 +659,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             } else {
                 std::string value(argv[i]);
-                /**/ if (value == "interleave" || value == "" ) { params.numa = LLAMA_NUMA_STRATEGY_INTERLEAVE; }
-                else if (value == "isolate") { params.numa = LLAMA_NUMA_STRATEGY_ISOLATE; }
-                else if (value == "numactl") { params.numa = LLAMA_NUMA_STRATEGY_NUMACTL; }
+                /**/ if (value == "interleave" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_INTERLEAVE; }
+                else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+                else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
 #ifdef GGUF_NUMA_MIRROR
-                else if (value == "mirror") { params.numa = LLAMA_NUMA_STRATEGY_MIRROR; }
+                else if (value == "mirror") { params.numa = GGML_NUMA_STRATEGY_MIRROR; }
 #endif
                 else { invalid_param = true; break; }
             }
@@ -1012,7 +1012,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
     printf(" - numactl: use the CPU map provided my numactl\n");
 #ifdef GGML_NUMA_MIRROR
-    printf(" - mirror: attempt to mirror GGUF data buffer on each node's local memory to increase throughput.\n");
+    printf(" - mirror: NOT YET IMPLEMENTED - attempt to mirror GGUF data buffer on each node's local memory to increase throughput.\n");
 #endif
     printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
     printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
diff --git a/common/common.h b/common/common.h
index 2c864c04c..58500d920 100644
--- a/common/common.h
+++ b/common/common.h
@@ -76,7 +76,7 @@ struct gpt_params {
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
-    int32_t numa = LLAMA_NUMA_STRATEGY_DISABLED;
+    ggml_numa_strategies numa = GGML_NUMA_STRATEGY_DISABLED;
 
     // // sampling parameters
     struct llama_sampling_params sparams;
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index ddb0ba064..2a4728612 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1151,7 +1151,7 @@ int main(int argc, char ** argv) {
     if (!params.verbose) {
         llama_log_set(llama_null_log_callback, NULL);
     }
-    bool numa = false;
+    enum ggml_numa_strategies numa = GGML_NUMA_STRATEGY_DISABLED;
     llama_backend_init(numa);
 
     // initialize printer
diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/app/src/main/cpp/llama-android.cpp
index d5e705dce..e2c2dc836 100644
--- a/examples/llama.android/app/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp
@@ -274,7 +274,7 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jboolean numa) {
+Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jint numa) {
     llama_backend_init(numa);
 }
 
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 85f403ffc..5f1e3e71b 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -237,7 +237,7 @@ int main(int argc, char ** argv) {
         params.imatrix = &imatrix_data;
     }
 
-    llama_backend_init(false);
+    llama_backend_init(GGML_NUMA_STRATEGY_DISABLED);
 
     // parse command line arguments
     const std::string fname_inp = argv[arg_idx];
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index fc7e723a1..3f3295ee4 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1821,7 +1821,13 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     {
         printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
-    printf(" --numa attempt optimizations that help on some NUMA systems\n");
+    printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
+    printf(" - interleave: (default) spread execution evenly over all nodes\n");
+    printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
+    printf(" - numactl: use the CPU map provided by numactl\n");
+#ifdef GGML_NUMA_MIRROR
+    printf(" - mirror: NOT YET IMPLEMENTED - attempt to mirror GGUF data buffer on each node's local memory to increase throughput.\n");
+#endif
     if (llama_supports_gpu_offload()) {
         printf(" -ngl N, --n-gpu-layers N\n");
         printf(" number of layers to store in VRAM\n");
@@ -2228,9 +2234,25 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
     {
         params.use_mmap = false;
     }
+    else if (arg == "--numa") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        } else {
+            std::string value(argv[i]);
+            /**/ if (value == "interleave" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_INTERLEAVE; }
+            else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+            else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+#ifdef GGUF_NUMA_MIRROR
+            else if (value == "mirror") { params.numa = GGML_NUMA_STRATEGY_MIRROR; }
+#endif
+            else { invalid_param = true; break; }
+        }
+    }
     else if (arg == "--numa")
     {
-        params.numa = true;
+        params.numa = GGML_NUMA_STRATEGY_DISABLED;
     }
     else if (arg == "--embedding")
     {
diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp
index 4ff8e3fa7..9fdcfc9dc 100644
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -17,7 +17,7 @@ int main(int argc, char ** argv) {
 
     const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
 
-    llama_backend_init(false);
+    llama_backend_init(GGML_NUMA_STRATEGY_DISABLED);
 
     llama_model_params model_params = llama_model_default_params();
     model_params.vocab_only = true;
diff --git a/ggml.c b/ggml.c
index 6922934e3..48e156a5b 100644
--- a/ggml.c
+++ b/ggml.c
@@ -25,7 +25,7 @@
 #include
 
 #ifdef GGML_NUMA_MIRROR
-#include
+#include
 #endif
 
 #ifdef GGML_USE_METAL
diff --git a/llama.cpp b/llama.cpp
index 4358aae43..e3dc329e3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -10327,7 +10327,7 @@ bool llama_mlock_supported(void) {
     return llama_supports_mlock();
 }
 
-void llama_backend_init(uint32_t numa) {
+void llama_backend_init(enum ggml_numa_strategies numa) {
     ggml_time_init();
 
     // needed to initialize f16 tables
diff --git a/llama.h b/llama.h
index 378730b42..70b8f5d68 100644
--- a/llama.h
+++ b/llama.h
@@ -111,15 +111,6 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
     };
 
-    enum llama_numa_strategies {
-        LLAMA_NUMA_STRATEGY_DISABLED = 0,
-        LLAMA_NUMA_STRATEGY_INTERLEAVE = 1,
-        LLAMA_NUMA_STRATEGY_ISOLATE = 2,
-        LLAMA_NUMA_STRATEGY_NUMACTL = 3,
-        LLAMA_NUMA_STRATEGY_MIRROR = 4,
-        LLAMA_NUMA_STRATEGY_MAX_VALUE = LLAMA_NUMA_STRATEGY_MIRROR,
-    };
-
     enum llama_split_mode {
         LLAMA_SPLIT_NONE = 0, // single GPU
         LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
@@ -313,7 +304,7 @@ extern "C" {
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_backend_init(uint32_t numa);
+    LLAMA_API void llama_backend_init(enum ggml_numa_strategies numa);
 
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
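
Usage note (not part of the patch): a minimal sketch of how a caller would use the changed API, assuming the ggml_numa_strategies enum is visible through llama.h as declared above. The parse_numa_strategy() helper is illustrative only; its string-to-enum mapping mirrors the parsing added to common.cpp and server.cpp in this diff.

// numa_init_example.cpp - illustrative sketch, not part of the patch
#include <string>
#include "llama.h"

// Hypothetical helper: map a "--numa" argument string to the new enum,
// accepting the same values as the patched argument parsers.
static enum ggml_numa_strategies parse_numa_strategy(const std::string & value) {
    if (value == "interleave" || value.empty()) { return GGML_NUMA_STRATEGY_INTERLEAVE; }
    if (value == "isolate")                     { return GGML_NUMA_STRATEGY_ISOLATE;    }
    if (value == "numactl")                     { return GGML_NUMA_STRATEGY_NUMACTL;    }
#ifdef GGML_NUMA_MIRROR
    if (value == "mirror")                      { return GGML_NUMA_STRATEGY_MIRROR;     } // not yet implemented per the usage text
#endif
    return GGML_NUMA_STRATEGY_DISABLED;
}

int main(int argc, char ** argv) {
    // Default matches the old llama_backend_init(false) behaviour.
    enum ggml_numa_strategies numa = GGML_NUMA_STRATEGY_DISABLED;
    if (argc > 1) {
        numa = parse_numa_strategy(argv[1]);
    }

    llama_backend_init(numa); // new signature: takes the strategy enum instead of a bool/uint32_t

    // ... load a model and run inference here ...

    llama_backend_free();
    return 0;
}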