Whitespace

This commit is contained in:
Paul Tsochantaris 2024-02-08 18:26:10 +00:00
parent f8dc954e0f
commit d5a6e865f6

View file

@@ -7285,7 +7285,7 @@ static int llama_decode_internal(
// TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
// we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
// with the BLAS calls. need a better solution // with the BLAS calls. need a better solution
// MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
// being processed then Accelerate/BLAS will not be involved, so capping would limit performance. // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
n_threads = std::min(4, n_threads); n_threads = std::min(4, n_threads);