CUDA: faster large batch FA without tensor cores (#7314)

Johannes Gäßler 2024-05-17 18:54:52 +02:00 committed by GitHub
parent 82ca83db3c
commit 0fc1e820a9
7 changed files with 823 additions and 15 deletions

@@ -57,7 +57,7 @@ static __global__ void flash_attn_vec_ext_f16(
 
     // ALiBi
     if (max_bias > 0.0f) {
-        const int h = blockIdx.y;
+        const uint32_t h = blockIdx.y;
 
         const float base = h < n_head_log2 ? m0 : m1;
         const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
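For context on the values in this hunk: base and exph together define the per-head ALiBi slope, slope = base^exph. Below is a minimal standalone sketch (not part of the commit) that evaluates the same formula on the host, assuming ggml's usual convention for m0, m1, and n_head_log2; the helper name alibi_slope and the sample head count are illustrative only.

#include <math.h>
#include <stdint.h>
#include <stdio.h>

// Per-head ALiBi slope, mirroring the device-side base/exph computation above.
// Heads below n_head_log2 use base m0; the rest use m1 with odd exponents.
static float alibi_slope(float max_bias, uint32_t h, uint32_t n_head_log2, float m0, float m1) {
    if (max_bias <= 0.0f) {
        return 1.0f; // ALiBi disabled
    }
    const float base = h < n_head_log2 ? m0 : m1;
    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
    return powf(base, exph);
}

int main(void) {
    const uint32_t n_head      = 12;                                             // illustrative head count
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); // largest power of 2 <= n_head
    const float    max_bias    = 8.0f;
    const float    m0 = powf(2.0f, -(max_bias)        / n_head_log2);
    const float    m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    for (uint32_t h = 0; h < n_head; ++h) {
        printf("head %2u: slope = %.6f\n", h, alibi_slope(max_bias, h, n_head_log2, m0, m1));
    }
    return 0;
}

With these inputs the first 8 heads get slopes 1/2, 1/4, ..., 1/256 and the remaining heads fall between them, which is the standard ALiBi scheme.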
@@ -232,11 +232,8 @@ static __global__ void flash_attn_vec_ext_f16(
         dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
     }
 
-    if (parallel_blocks != 1 && tid != 0) {
-#pragma unroll
-        for (int j = 0; j < ncols; ++j) {
-            dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
-        }
+    if (parallel_blocks != 1 && threadIdx.x < ncols) {
+        dst_meta[(ic0 + threadIdx.x)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[threadIdx.x], kqsum[threadIdx.x]);
     }
 #else
     NO_DEVICE_CODE;
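This hunk changes how the (kqmax, kqsum) reduction metadata is written out when parallel_blocks != 1: instead of every participating thread redundantly looping over all ncols columns, the first ncols threads of the block each store exactly one column's float2. Below is a toy kernel (not from the commit; buffer layout, names, and launch shape are illustrative) demonstrating the same one-write-per-thread pattern.

#include <stdio.h>
#include <cuda_runtime.h>

// Toy version of the dst_meta write: thread j (for j < ncols) stores column
// j's (max, sum) pair, so no thread loops and no value is stored twice.
template <int ncols>
__global__ void write_meta(float2 * meta, const float * kqmax, const float * kqsum) {
    if (threadIdx.x < ncols) {
        meta[blockIdx.x*ncols + threadIdx.x] = make_float2(kqmax[threadIdx.x], kqsum[threadIdx.x]);
    }
}

int main(void) {
    constexpr int ncols = 8;
    float2 * meta;
    float  * kqmax;
    float  * kqsum;
    cudaMallocManaged(&meta,  ncols*sizeof(float2));
    cudaMallocManaged(&kqmax, ncols*sizeof(float));
    cudaMallocManaged(&kqsum, ncols*sizeof(float));
    for (int j = 0; j < ncols; ++j) {
        kqmax[j] = (float) j;
        kqsum[j] = (float) 2*j;
    }
    write_meta<ncols><<<1, 32>>>(meta, kqmax, kqsum); // one block of one warp
    cudaDeviceSynchronize();
    for (int j = 0; j < ncols; ++j) {
        printf("meta[%d] = (%g, %g)\n", j, meta[j].x, meta[j].y);
    }
    cudaFree(meta);
    cudaFree(kqmax);
    cudaFree(kqsum);
    return 0;
}

Each float2 in dst_meta is now stored once per block rather than once per participating thread, which is part of what makes the large-batch path cheaper.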