llama : remove obsolete code
parent 78675f35ee
commit 1d1cb01b01
1 changed file with 0 additions and 11 deletions
@@ -13767,17 +13767,6 @@ static int llama_encode_internal(
 
     GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
 
-    // for big prompts, if BLAS is enabled, it is better to use only one thread
-    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-    //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-    //       with the BLAS calls. need a better solution
-    // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
-    //                   being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-        n_threads = std::min(4, n_threads);
-    }
-
     ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
     llama_set_inputs(lctx, u_batch);
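The block removed above was a CPU-side thread cap: for big prompts processed with CPU BLAS, most of the work happens inside the BLAS matmul and extra threads only spin-wait on it, so n_threads was clamped to 4 (MoE models, where hparams.n_expert > 0, skip BLAS on this path and were exempt). A minimal standalone sketch of that heuristic, using hypothetical stand-ins (cpu_has_blas, cpu_has_gpublas, hparams_t) rather than the actual llama.cpp internals:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

struct hparams_t {
    uint32_t n_expert = 0; // 0 means the model is not a MoE model
};

// hypothetical stand-ins for ggml_cpu_has_blas() / ggml_cpu_has_gpublas()
static bool cpu_has_blas()    { return true;  }
static bool cpu_has_gpublas() { return false; }

// With CPU BLAS and a large batch, the matmul runs inside the BLAS call,
// so surplus threads just spin-wait; the heuristic capped them at 4.
static int cap_threads_for_blas(int n_tokens, const hparams_t & hparams, int n_threads) {
    if (n_tokens >= 32 && hparams.n_expert == 0 && cpu_has_blas() && !cpu_has_gpublas()) {
        n_threads = std::min(4, n_threads);
    }
    return n_threads;
}

int main() {
    const hparams_t hparams; // dense (non-MoE) model
    printf("%d\n", cap_threads_for_blas(512, hparams, 16)); // 4: big batch, capped
    printf("%d\n", cap_threads_for_blas(8,   hparams, 16)); // 16: small batch, uncapped
    return 0;
}
```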