From 56e45a239e1d5a871009aa162b7ba99c93c40b62 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 1 Feb 2024 20:16:32 +0200 Subject: [PATCH] metal : optimize softmax for C > 32 --- ggml-metal.metal | 16 +++++++++++----- tests/test-backend-ops.cpp | 9 +++++++++ 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/ggml-metal.metal b/ggml-metal.metal index 04c1aaf9c..3d5d762d1 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -2217,29 +2217,35 @@ kernel void kernel_flash_attn_ext_f16( for (int64_t p = tiisg; p < C; p += NW) { const half s = ss[j*T + p]; - smax = simd_max(max(smax, s)); - M[j] = simd_max(max(M[j], s)); + smax = max(smax, s); + M[j] = max(M[j], s); } - const half ms = m == -INFINITY ? 0.0h : exp(m - M[j]); + smax = simd_max(smax); + M[j] = simd_max(M[j]); - S[j] = S[j]*ms; + const half ms = m == -INFINITY ? 0.0h : exp(m - M[j]); // create a QxQ diagonal matrix for rescaling the output if (tiisg == j) { ss[j*T + C + j] = ms; } + // local sum + half ls = 0.0h; + for (int64_t p = tiisg; p < C; p += NW) { const half s = ss[j*T + p]; const half vs = s == -INFINITY ? 0.0h : exp(s - M[j]); - S[j] = S[j] + simd_sum(vs); + ls += vs; // the P matrix from the paper (Q rows, C columns) ss[j*T + p] = vs; } + + S[j] = S[j]*ms + simd_sum(ls); } } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index b1b30b91c..2ab535406 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -572,9 +572,18 @@ struct test_case { // duplicate the op size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1; +#if 1 for (int i = 1; i < n_runs; i++) { gf->nodes[gf->n_nodes++] = out; } +#else + int n_nodes = gf->n_nodes; + for (int i = 1; i < n_runs; i++) { + for (int j = 0; j < n_nodes; j++) { + gf->nodes[gf->n_nodes++] = gf->nodes[j]; + } + } +#endif // calculate memory size_t mem = n_runs * op_size(out);