From 7032f4f6349c17a8352f9f93f7d2122f45469e59 Mon Sep 17 00:00:00 2001 From: snadampal <87143774+snadampal@users.noreply.github.com> Date: Fri, 26 Jan 2024 11:17:59 -0600 Subject: [PATCH] ggml : update softmax n_task calculation (#5126) Updated the n_tasks calculation to use the maximum number of threads possible (previously capped at 4). This has improved the prompt eval performance by around 5% for DOT kernels and by around 10% for MMLA kernels on AWS Graviton3. --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index ca98fde8a..ef6fd8caf 100644 --- a/ggml.c +++ b/ggml.c @@ -16597,7 +16597,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_SOFT_MAX: { - n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0])); + n_tasks = MIN(n_threads, ggml_nrows(node->src[0])); } break; case GGML_OP_CONV_TRANSPOSE_1D: {