CUDA: deduplicate FlashAttention code (#7352)

Johannes Gäßler 2024-05-18 12:36:25 +02:00 committed by GitHub
parent cb42c29427
commit 133d99c599
8 changed files with 316 additions and 654 deletions

ggml-cuda/softmax.cu

@@ -1,3 +1,4 @@
+#include "common.cuh"
 #include "softmax.cuh"
 
 template <typename T>
@@ -23,17 +24,7 @@ static __global__ void soft_max_f32(const float * x, const T * mask, float * dst
     const int warp_id = threadIdx.x / WARP_SIZE;
     const int lane_id = threadIdx.x % WARP_SIZE;
 
-    float slope = 1.0f;
-
-    // ALiBi
-    if (max_bias > 0.0f) {
-        const int h = rowx/nrows_y; // head index
-
-        const float base = h < n_head_log2 ? m0 : m1;
-        const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
-
-        slope = powf(base, exph);
-    }
+    const float slope = get_alibi_slope(max_bias, rowx/nrows_y, n_head_log2, m0, m1);
 
     extern __shared__ float data_soft_max_f32[];
     float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
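
The removed branch and the new call site together pin down what the shared helper must compute. A minimal sketch of what the deduplicated helper in common.cuh plausibly looks like, reconstructed from the inline ALiBi code removed above; the qualifiers and parameter types are assumptions inferred from the call site, not quoted from this diff:

    // Sketch of the shared ALiBi slope helper, reconstructed from the
    // removed inline code; the exact definition in common.cuh may differ.
    static __device__ __forceinline__ float get_alibi_slope(
            const float max_bias, const int h, const int n_head_log2, const float m0, const float m1) {
        if (max_bias <= 0.0f) {
            return 1.0f; // no ALiBi bias requested, slope is a no-op
        }

        // Heads below n_head_log2 use base m0 with exponent h+1;
        // the remaining heads use base m1 with odd exponents.
        const float base = h < n_head_log2 ? m0 : m1;
        const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

        return powf(base, exph);
    }

Factoring this out lets soft_max_f32 and the FlashAttention kernels share one definition instead of each carrying its own copy of the branch, which is the deduplication this commit is after.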