From 1d1cb01b0110b88869034b4acef57cf177b1edf0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?=
Date: Tue, 2 Jul 2024 11:22:58 +0200
Subject: [PATCH] llama : remove obsolete code

---
 src/llama.cpp | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 3dc36d536..15823f004 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13767,17 +13767,6 @@ static int llama_encode_internal(

     GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);

-    // for big prompts, if BLAS is enabled, it is better to use only one thread
-    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-    //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-    //       with the BLAS calls. need a better solution
-    // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
-    //                   being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-        n_threads = std::min(4, n_threads);
-    }
-
     ggml_backend_sched_alloc_graph(lctx.sched, gf);

     llama_set_inputs(lctx, u_batch);