From 0fb40ae755ee002669db52ae2faa4fca5bc319e6 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 14 Feb 2024 09:46:06 +0000 Subject: [PATCH] split numa init out from llama_backend_init and created llama_numa_init. Updated all code paths and samples --- examples/batched-bench/batched-bench.cpp | 3 ++- examples/batched.swift/Sources/main.swift | 2 +- examples/batched/batched.cpp | 3 ++- examples/beam-search/beam-search.cpp | 3 ++- examples/embedding/embedding.cpp | 3 ++- examples/imatrix/imatrix.cpp | 3 ++- examples/infill/infill.cpp | 3 ++- examples/llama-bench/llama-bench.cpp | 3 +-- .../llama.android/app/src/main/cpp/llama-android.cpp | 4 ++-- .../llama.swiftui/llama.cpp.swift/LibLlama.swift | 2 +- examples/llava/llava-cli.cpp | 3 ++- examples/lookahead/lookahead.cpp | 3 ++- examples/lookup/lookup.cpp | 3 ++- examples/main/main.cpp | 3 ++- examples/parallel/parallel.cpp | 3 ++- examples/passkey/passkey.cpp | 3 ++- examples/perplexity/perplexity.cpp | 3 ++- examples/quantize/quantize.cpp | 2 +- examples/server/server.cpp | 3 ++- examples/simple/simple.cpp | 3 ++- examples/speculative/speculative.cpp | 3 ++- examples/tokenize/tokenize.cpp | 2 +- ggml.c | 6 +++--- llama.cpp | 12 +++++++----- llama.h | 5 ++++- tests/test-autorelease.cpp | 2 +- tests/test-model-load-cancel.cpp | 2 +- tests/test-tokenizer-0-falcon.cpp | 2 +- tests/test-tokenizer-0-llama.cpp | 2 +- tests/test-tokenizer-1-bpe.cpp | 2 +- tests/test-tokenizer-1-llama.cpp | 2 +- 31 files changed, 59 insertions(+), 39 deletions(-) diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index b52d68457..55dfd9784 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -82,7 +82,8 @@ int main(int argc, char ** argv) { // init LLM - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); // initialize the model diff --git a/examples/batched.swift/Sources/main.swift 
b/examples/batched.swift/Sources/main.swift index cdbce8435..d75c503d5 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -17,7 +17,7 @@ let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(argu let n_len: Int = 32 // init LLM -llama_backend_init(GGML_NUMA_STRATEGY_DISABLED) +llama_backend_init() defer { llama_backend_free() } diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index b1775e0b0..eab636692 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -50,7 +50,8 @@ int main(int argc, char ** argv) { // init LLM - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); // initialize the model diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp index 679b382e1..866c6d7a6 100644 --- a/examples/beam-search/beam-search.cpp +++ b/examples/beam-search/beam-search.cpp @@ -119,7 +119,8 @@ int main(int argc, char ** argv) // Init LLM : //--------------------------------- - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); llama_model * model; llama_context * ctx; diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index b4688cf51..acff715e9 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -74,7 +74,8 @@ int main(int argc, char ** argv) { params.prompt = gpt_random_prompt(rng); } - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); llama_model * model; llama_context * ctx; diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index bc9f6fa68..f21bc48f3 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -568,7 +568,8 @@ int main(int argc, char ** argv) { params.prompt = gpt_random_prompt(rng); } - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); 
llama_model_params mparams = llama_model_params_from_gpt_params(params); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 72fb133b4..92c67b7cf 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -202,7 +202,8 @@ int main(int argc, char ** argv) { std::mt19937 rng(params.seed); LOG("%s: llama backend init\n", __func__); - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); llama_model * model; llama_context * ctx; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 2a4728612..11410f8ae 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1151,8 +1151,7 @@ int main(int argc, char ** argv) { if (!params.verbose) { llama_log_set(llama_null_log_callback, NULL); } - enum ggml_numa_strategies numa = GGML_NUMA_STRATEGY_DISABLED; - llama_backend_init(numa); + llama_backend_init(); // initialize printer std::unique_ptr p; diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/app/src/main/cpp/llama-android.cpp index e2c2dc836..2beb1e0d5 100644 --- a/examples/llama.android/app/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp @@ -274,8 +274,8 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb extern "C" JNIEXPORT void JNICALL -Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jint32 numa) { - llama_backend_init(numa); +Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) { + llama_backend_init(); } extern "C" diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 2470415dc..58fcf40c6 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -51,7 +51,7 @@ actor LlamaContext { } static func create_context(path: String) throws -> 
LlamaContext { - llama_backend_init(GGML_NUMA_STRATEGY_DISABLED) + llama_backend_init() var model_params = llama_model_default_params() #if targetEnvironment(simulator) diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 031e9806d..98f0fd3b2 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -196,7 +196,8 @@ static struct llava_context * llava_init(gpt_params * params) { auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); - llama_backend_init(params->numa); + llama_backend_init(); + llama_numa_init(params->numa); llama_model_params model_params = llama_model_params_from_gpt_params(*params); diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index e55a15a1b..e2551e7a4 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -54,7 +54,8 @@ int main(int argc, char ** argv) { #endif // LOG_DISABLE_LOGS // init llama.cpp - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); llama_model * model = NULL; llama_context * ctx = NULL; diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 18235b8a1..b53fae110 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -31,7 +31,8 @@ int main(int argc, char ** argv){ #endif // LOG_DISABLE_LOGS // init llama.cpp - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); llama_model * model = NULL; llama_context * ctx = NULL; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index e8ab8cbae..f5d2f4893 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -185,7 +185,8 @@ int main(int argc, char ** argv) { } LOG("%s: llama backend init\n", __func__); - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); llama_model * model; llama_context * ctx; diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 
d2e074d9e..7d11fcd59 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -122,7 +122,8 @@ int main(int argc, char ** argv) { #endif // LOG_DISABLE_LOGS // init llama.cpp - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); llama_model * model = NULL; llama_context * ctx = NULL; diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 5c0022832..e12a1cdf1 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -71,7 +71,8 @@ int main(int argc, char ** argv) { // init LLM - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); // initialize the model diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index b2c131d4c..67d2d3293 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1809,7 +1809,8 @@ int main(int argc, char ** argv) { params.prompt = gpt_random_prompt(rng); } - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); llama_model * model; llama_context * ctx; diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 5f1e3e71b..4a5c504e3 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -237,7 +237,7 @@ int main(int argc, char ** argv) { params.imatrix = &imatrix_data; } - llama_backend_init(GGML_NUMA_STRATEGY_DISABLED); + llama_backend_init(); // parse command line arguments const std::string fname_inp = argv[arg_idx]; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 0c9851e96..923891839 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2492,7 +2492,8 @@ int main(int argc, char **argv) params.model_alias = params.model; } - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER}, {"commit", LLAMA_COMMIT}}); 
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 9cfde8308..39e2d8ea4 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -31,7 +31,8 @@ int main(int argc, char ** argv) { // init LLM - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); // initialize the model diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 7b3af01f3..3848791d4 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -50,7 +50,8 @@ int main(int argc, char ** argv) { #endif // LOG_DISABLE_LOGS // init llama.cpp - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); llama_model * model_tgt = NULL; llama_model * model_dft = NULL; diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index 9fdcfc9dc..d95a92475 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -17,7 +17,7 @@ int main(int argc, char ** argv) { const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids"; - llama_backend_init(GGML_NUMA_STRATEGY_DISABLED); + llama_backend_init(); llama_model_params model_params = llama_model_default_params(); model_params.vocab_only = true; diff --git a/ggml.c b/ggml.c index a635b3ecc..c131575b9 100644 --- a/ggml.c +++ b/ggml.c @@ -16663,7 +16663,7 @@ typedef pthread_t ggml_thread_t; // Android's libc implementation "bionic" does not support setting affinity #if defined(__linux__) && !defined(__BIONIC__) -static void set_numa_thread_affinity(int thread_n, int n_threads) { +static void set_numa_thread_affinity(int thread_n) { if (!ggml_is_numa()) { return; } @@ -16731,7 +16731,7 @@ static void clear_numa_thread_affinity(void) { #else // TODO: Windows etc. 
// (the linux implementation may also work on BSD, someone should test) -static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); } +static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); } static void clear_numa_thread_affinity(void) {} #endif @@ -17031,7 +17031,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { const int n_threads = state->shared->n_threads; - set_numa_thread_affinity(state->ith, n_threads); + set_numa_thread_affinity(state->ith); int node_n = -1; int task_phase = GGML_TASK_FINALIZE; diff --git a/llama.cpp b/llama.cpp index 81f414d3c..23817e09f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11156,7 +11156,7 @@ bool llama_mlock_supported(void) { return llama_supports_mlock(); } -void llama_backend_init(enum ggml_numa_strategies numa) { +void llama_backend_init(void) { ggml_time_init(); // needed to initialize f16 tables @@ -11166,15 +11166,17 @@ void llama_backend_init(enum ggml_numa_strategies numa) { ggml_free(ctx); } - if (numa > 0) { - ggml_numa_init(numa); - } - #ifdef GGML_USE_MPI ggml_mpi_backend_init(); #endif } +void llama_numa_init(enum ggml_numa_strategies numa) { + if (numa > 0) { + ggml_numa_init(numa); + } +} + void llama_backend_free(void) { #ifdef GGML_USE_MPI ggml_mpi_backend_free(); diff --git a/llama.h b/llama.h index fe9e05f1d..a20b1f8f1 100644 --- a/llama.h +++ b/llama.h @@ -306,7 +306,10 @@ extern "C" { // Initialize the llama + ggml backend // If numa is true, use NUMA optimizations // Call once at the start of the program - LLAMA_API void llama_backend_init(enum ggml_numa_strategies numa); + LLAMA_API void llama_backend_init(void); + + // Optional: apply a NUMA strategy via ggml_numa_init (no-op unless numa > 0) + LLAMA_API void llama_numa_init(enum ggml_numa_strategies numa); // Call once at the end of the program - currently only used for MPI LLAMA_API void llama_backend_free(void); diff --git a/tests/test-autorelease.cpp b/tests/test-autorelease.cpp index fef6683c4..57fa00011 100644 --- 
a/tests/test-autorelease.cpp +++ b/tests/test-autorelease.cpp @@ -12,7 +12,7 @@ int main(int argc, char ** argv) { auto * model_path = get_model_or_exit(argc, argv); std::thread([&model_path]() { - llama_backend_init(GGML_NUMA_STRATEGY_DISABLED); + llama_backend_init(); auto * model = llama_load_model_from_file(model_path, llama_model_default_params()); auto * ctx = llama_new_context_with_model(model, llama_context_default_params()); llama_free(ctx); diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index 69c5815fd..858535c3c 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -14,7 +14,7 @@ int main(int argc, char *argv[] ) { fprintf(stderr, "using '%s'\n", model_path); fclose(file); - llama_backend_init(GGML_NUMA_STRATEGY_DISABLED); + llama_backend_init(); auto params = llama_model_params{}; params.use_mmap = false; params.progress_callback = [](float progress, void * ctx){ diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp index 50bd06557..472b0b3a8 100644 --- a/tests/test-tokenizer-0-falcon.cpp +++ b/tests/test-tokenizer-0-falcon.cpp @@ -61,7 +61,7 @@ int main(int argc, char **argv) { llama_model * model; llama_context * ctx; - llama_backend_init(GGML_NUMA_STRATEGY_DISABLED); + llama_backend_init(); // load the vocab { diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp index 5a7577992..0a16cd7eb 100644 --- a/tests/test-tokenizer-0-llama.cpp +++ b/tests/test-tokenizer-0-llama.cpp @@ -60,7 +60,7 @@ int main(int argc, char **argv) { llama_model * model; llama_context * ctx; - llama_backend_init(GGML_NUMA_STRATEGY_DISABLED); + llama_backend_init(); // load the vocab { diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp index dcbfbdd3e..3596ce55a 100644 --- a/tests/test-tokenizer-1-bpe.cpp +++ b/tests/test-tokenizer-1-bpe.cpp @@ -25,7 +25,7 @@ int main(int argc, char **argv) { llama_model * model; llama_context * 
ctx; - llama_backend_init(GGML_NUMA_STRATEGY_DISABLED); + llama_backend_init(); // load the vocab { diff --git a/tests/test-tokenizer-1-llama.cpp b/tests/test-tokenizer-1-llama.cpp index 9a36c9e79..9333f8686 100644 --- a/tests/test-tokenizer-1-llama.cpp +++ b/tests/test-tokenizer-1-llama.cpp @@ -25,7 +25,7 @@ int main(int argc, char **argv) { llama_model * model; llama_context * ctx; - llama_backend_init(GGML_NUMA_STRATEGY_DISABLED); + llama_backend_init(); // load the vocab {