CUDA: use tensor cores for MMQ (#7676)
* CUDA: int8 tensor cores for MMQ (legacy quants) * fix out-of-bounds writes * __builtin_assume -> GGML_CUDA_ASSUME * fix writeback returning too early
This commit is contained in:
parent
af4ae502dd
commit
1f0dabda8d
7 changed files with 550 additions and 55 deletions
|
@ -1,9 +1,9 @@
|
|||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
|
||||
#if FP16_MMA_AVAILABLE
|
||||
#ifdef FP16_MMA_AVAILABLE
|
||||
#include <mma.h>
|
||||
#endif
|
||||
#endif // FP16_MMA_AVAILABLE
|
||||
|
||||
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
||||
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
|
||||
|
@ -45,7 +45,7 @@ static __global__ void flash_attn_ext_f16(
|
|||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
#if FP16_MMA_AVAILABLE
|
||||
#ifdef FP16_MMA_AVAILABLE
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on.
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue