diff --git a/common/common.cpp b/common/common.cpp
index 044f218b4..6dea8e3d2 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1099,8 +1099,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.kv_overrides = params.kv_overrides.data();
     }
 
-    mparams.n_threads = params.cpuparams.n_threads;
-
     return mparams;
 }
 
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index ce66a4733..fc9571c82 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -189,7 +189,7 @@ extern "C" {
     // Set the number of threads for the backend
     typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
     // Get additional buffer types provided by the device (returns a NULL-terminated array)
-    typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device, int n_threads);
+    typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
     // Set the abort callback for the backend
     typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
     // Get a list of feature flags supported by the backend (returns a NULL-terminated array)
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index 38447b6bd..bba18303b 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -325,9 +325,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         # Fetch KleidiAI sources:
         include(FetchContent)
 
-        set(KLEIDIAI_COMMIT_SHA "v1.2.0")
-        set(KLEIDIAI_DOWNLOAD_URL "https://gitlab.arm.com/kleidi/kleidiai/-/archive/${KLEIDIAI_COMMIT_SHA}/kleidiai-${KLEIDIAI_COMMIT_SHA}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5 "cebcb660079bf15626e7bdaecd18f49c")
+        set(KLEIDIAI_COMMIT_TAG "v1.2.0")
+        set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
+        set(KLEIDIAI_ARCHIVE_MD5 "6634fefce7357ecfee9eace2068bc68b")
 
         if (POLICY CMP0135)
             cmake_policy(SET CMP0135 NEW)
diff --git a/ggml/src/ggml-cpu/ggml-cpu-traits.cpp b/ggml/src/ggml-cpu/ggml-cpu-traits.cpp
index 14536fe1b..62a0712da 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-traits.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu-traits.cpp
@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
 }  // namespace ggml::cpu
 
 bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type(params->nth)) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
         if (extra && extra->context) {
             auto buf_extra     = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -23,7 +23,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
 }
 
 bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type(n_threads)) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
         if (extra && extra->context) {
             auto buf_extra     = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
diff --git a/ggml/src/ggml-cpu/ggml-cpu-traits.h b/ggml/src/ggml-cpu/ggml-cpu-traits.h
index eba2d379b..99a6186b1 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-traits.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-traits.h
@@ -33,6 +33,6 @@ class extra_buffer_type {
 }  // namespace ggml::cpu
 
 // implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type(int n_threads);
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
 
 #endif
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index 399f3f0f3..b79d979db 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -33,8 +33,8 @@
 
 // ggml-backend interface
 
-std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type(int n_threads) {
-    static std::vector<ggml_backend_buffer_type_t> bufts = [n_threads]() {
+std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type() {
+    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
         std::vector<ggml_backend_buffer_type_t> bufts;
 
 #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
@@ -44,8 +44,8 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
 #endif
 
 #ifdef GGML_USE_CPU_KLEIDIAI
-        if (ggml_backend_cpu_kleidiai_buffer_type(n_threads)) {
-            bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type(n_threads));
+        if (ggml_backend_cpu_kleidiai_buffer_type()) {
+            bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type());
         }
 #endif
@@ -58,21 +58,19 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
         bufts.push_back(NULL);
 
         return bufts;
-
-        GGML_UNUSED(n_threads);
     }();
 
     return bufts;
 }
 
-static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device, int n_threads) {
-    return ggml_backend_cpu_get_extra_buffers_type(n_threads).data();
+static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
+    return ggml_backend_cpu_get_extra_buffers_type().data();
 
     GGML_UNUSED(device);
 }
 
 static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type(-1)) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
         if (extra && extra == buft) return true;
     }
     return false;
@@ -387,7 +385,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
     }
 
     // extra_buffer_op?
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type(-1)) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
         if (extra) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             if (buf_extra && buf_extra->supports_op(dev, op)) {
@@ -577,7 +575,7 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         features.push_back({ "OPENMP", "1" });
     #endif
     #ifdef GGML_USE_CPU_KLEIDIAI
-        features.push_back({ "KLEIDIAI_REPACK", "1" });
+        features.push_back({ "KLEIDIAI", "1" });
    #endif
    #ifdef GGML_USE_CPU_AARCH64
        features.push_back({ "AARCH64_REPACK", "1" });
diff --git a/ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp b/ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp
index 504996146..32eadbf49 100644
--- a/ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp
+++ b/ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp
@@ -34,25 +34,25 @@ struct ggml_kleidiai_context {
     ggml_kleidiai_kernels * kernels;
 } static ctx = { NULL };
 
-static void init_kleidiai_context(int n_threads) {
+static void init_kleidiai_context(void) {
 
     static bool initialized = false;
 
     if (!initialized) {
-        GGML_ASSERT(n_threads > 0);
-
         initialized = true;
+        const char *env_var = getenv("GGML_KLEIDIAI_SME");
+        int sme_enabled = 0;
 
         cpu_feature features = (ggml_cpu_has_dotprod()     ? CPU_FEATURE_DOTPROD : CPU_FEATURE_NONE) |
                                (ggml_cpu_has_matmul_int8() ? CPU_FEATURE_I8MM    : CPU_FEATURE_NONE) |
                                (ggml_cpu_has_sve()         ? CPU_FEATURE_SVE     : CPU_FEATURE_NONE);
-#if defined(__APPLE__)
-        if (n_threads == 1) {
+        if (env_var) {
+            sme_enabled = atoi(env_var);
+        }
+
+        if (sme_enabled != 0) {
             features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
         }
-#else
-        features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
-#endif
         ctx.kernels = ggml_kleidiai_select_kernels(features);
     }
 }
@@ -162,6 +162,8 @@ public:
         ctx.kernels->rhs_info.pack_func(1, n, k, nr, kr, sr, k_q4_0_block_size, (const uint8_t *)data, NULL, tensor->data, 0, &params);
 
         return 0;
+
+        GGML_UNUSED(data_size);
     }
 };
 
@@ -223,7 +225,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
             op->src[0]->type == GGML_TYPE_Q4_0 &&
             op->src[0]->buffer &&
             (ggml_n_dims(op->src[0]) == 2) &&
-            op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type(-1) && ctx.kernels
+            op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels
             ) {
             if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                 return false;
@@ -237,7 +239,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
 
     ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
         if (op->op == GGML_OP_MUL_MAT) {
-            if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type(-1)) {
+            if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
                 return (ggml::cpu::tensor_traits *) op->src[0]->extra;
             }
         }
@@ -246,7 +248,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
 };
 }  // namespace ggml::cpu::kleidiai
 
-ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(int n_threads) {
+ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(void) {
     static ggml::cpu::kleidiai::extra_buffer_type ctx;
     static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_kleidiai = {
         /* .iface    = */ {
@@ -261,7 +263,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(int n_threads)
         /* .context  = */ &ctx,
     };
 
-    init_kleidiai_context(n_threads);
+    init_kleidiai_context();
 
     return &ggml_backend_cpu_buffer_type_kleidiai;
 }
diff --git a/ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.h b/ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.h
index 166c3f1a1..aca221e8e 100644
--- a/ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.h
+++ b/ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.h
@@ -11,7 +11,7 @@
 extern "C" {
 #endif
 
-ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(int n_threads);
+ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(void);
 
 #ifdef __cplusplus
 }
diff --git a/include/llama.h b/include/llama.h
index bb3aa8674..3b75e7607 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -304,8 +304,6 @@ extern "C" {
         bool use_mmap;      // use mmap if possible
         bool use_mlock;     // force system to keep model in RAM
         bool check_tensors; // validate model tensor data
-
-        int n_threads;
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 512faee18..75073bf61 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -445,8 +445,7 @@ llama_model_loader::llama_model_loader(
         std::vector<std::string> & splits,
         bool use_mmap,
         bool check_tensors,
-        const struct llama_model_kv_override * param_overrides_p,
-        int n_threads) {
+        const struct llama_model_kv_override * param_overrides_p) {
     int trace = 0;
     if (getenv("LLAMA_TRACE")) {
        trace = atoi(getenv("LLAMA_TRACE"));
@@ -684,7 +683,6 @@ llama_model_loader::llama_model_loader(
 
     this->use_mmap      = use_mmap;
     this->check_tensors = check_tensors;
-    this->n_threads     = n_threads;
 }
 
 std::string llama_model_loader::get_arch_name() const {
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
index 49cb18a3d..fe35404b2 100644
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@@ -77,8 +77,6 @@ struct llama_model_loader {
 
     llama_mmaps mappings;
 
-    int n_threads;
-
     std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
 
     std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
@@ -97,8 +95,7 @@ struct llama_model_loader {
         std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
         bool use_mmap,
         bool check_tensors,
-        const struct llama_model_kv_override * param_overrides_p,
-        int n_threads);
+        const struct llama_model_kv_override * param_overrides_p);
 
     template<typename T>
     typename std::enable_if<std::is_integral<T>::value, bool>::type
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 199ecdcab..031b4c30b 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -247,7 +247,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
 }
 
 // CPU: ACCEL -> CPU extra -> GPU host -> CPU
-static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, int n_threads) {
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
     buft_list_t buft_list;
 
     // add ACCEL buffer types
@@ -268,7 +268,7 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
     auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
         ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
     if (ggml_backend_dev_get_extra_bufts_fn) {
-        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev, n_threads);
+        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
         while (extra_bufts && *extra_bufts) {
             buft_list.emplace_back(cpu_dev, *extra_bufts);
             ++extra_bufts;
@@ -1264,7 +1264,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const bool use_mmap_buffer = true;
 
     // build a list of buffer types for the CPU and GPU devices
-    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.n_threads);
+    pimpl->cpu_buft_list = make_cpu_buft_list(devices);
     for (auto * dev : devices) {
         buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
         // add CPU buffer types as a fallback
@@ -3768,7 +3768,6 @@ struct llama_model_params llama_model_default_params() {
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
         /*.check_tensors               =*/ false,
-        /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS,
     };
 
 #ifdef GGML_USE_METAL
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 0ebb7504f..fb7982655 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -527,7 +527,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
 
     std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nthread);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
     ml.init_mappings(false); // no prefetching
 
     llama_model model(llama_model_default_params());
diff --git a/src/llama.cpp b/src/llama.cpp
index 179460a4f..e8cfe5012 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -40,7 +40,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.n_threads);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);
 
         ml.print_info();
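
Notes on the interface change: extra buffer types are now resolved without an n_threads argument, and the KleidiAI SME micro-kernels are gated by the GGML_KLEIDIAI_SME environment variable (any value that atoi() parses as non-zero enables them, per init_kleidiai_context above) instead of being inferred from the thread count on Apple platforms. Below is a minimal sketch, not part of the patch, of how a consumer of the public registry API calls the updated hook after this change; the main() wrapper and the printing loop are illustrative only:

// Sketch: resolve and call the updated ggml_backend_dev_get_extra_bufts
// hook, which after this patch takes only the device (no n_threads).
#include "ggml-backend.h"
#include <cstdio>

int main() {
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (!cpu_dev) {
        return 1;
    }
    ggml_backend_reg_t cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

    auto get_extra_bufts = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
    if (get_extra_bufts) {
        // The hook returns a NULL-terminated array, per the comment in ggml-backend.h.
        for (ggml_backend_buffer_type_t * it = get_extra_bufts(cpu_dev); it && *it; ++it) {
            printf("extra buffer type: %s\n", ggml_backend_buft_name(*it));
        }
    }
    return 0;
}

With GGML_KLEIDIAI_SME=1 set in the environment (and a build with KleidiAI enabled, i.e. GGML_USE_CPU_KLEIDIAI defined, on SME-capable hardware), the KleidiAI context initializes with CPU_FEATURE_SME; with the variable unset or zero, it falls back to the dotprod/i8mm/SVE feature set.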