diff --git a/src/llama.cpp b/src/llama.cpp index 3dc36d536..15823f004 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -13767,17 +13767,6 @@ static int llama_encode_internal( GGML_ASSERT(strcmp(embd->name, "result_norm") == 0); - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well - // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering - // with the BLAS calls. need a better solution - // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is - // being processed then Accelerate/BLAS will not be involved, so capping would limit performance. - if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { - n_threads = std::min(4, n_threads); - } - ggml_backend_sched_alloc_graph(lctx.sched, gf); llama_set_inputs(lctx, u_batch);