CUDA: refactor and optimize IQ MMVQ (#8215)

* CUDA: refactor and optimize IQ MMVQ

* uint -> uint32_t

* __dp4a -> ggml_cuda_dp4a

* remove MIN_CC_DP4A checks

* change default

* try CI fix
This commit is contained in:
Johannes Gäßler 2024-07-01 20:39:06 +02:00 committed by GitHub
parent dae57a1ebc
commit cb5fad4c6c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 406 additions and 487 deletions

View file

@ -54,12 +54,11 @@ typedef float (*vec_dot_KQ_f32_t)(
template<typename T, int D>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A
const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
GGML_UNUSED(Q_v);
half sum = 0.0f;
T sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
@ -72,7 +71,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
const int v = (get_int_from_uint8(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
const int u = Q_q8[k_KQ_0/WARP_SIZE];
const int sumi = __dp4a(v, u, 0);
const int sumi = ggml_cuda_dp4a(v, u, 0);
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
@ -90,19 +89,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
}
return sum;
#else
GGML_UNUSED(K_c);
GGML_UNUSED(Q_v);
GGML_UNUSED(Q_q8);
GGML_UNUSED(Q_ds_v);
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
template<typename T, int D>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A
const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
GGML_UNUSED(Q_v);
@ -120,7 +111,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
const int v = (get_int_from_uint8_aligned(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
const int u = Q_q8[k_KQ_0/WARP_SIZE];
const int sumi = __dp4a(v, u, 0);
const int sumi = ggml_cuda_dp4a(v, u, 0);
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
@ -142,19 +133,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
}
return sum;
#else
GGML_UNUSED(K_c);
GGML_UNUSED(Q_v);
GGML_UNUSED(Q_q8);
GGML_UNUSED(Q_ds_v);
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
template<typename T, int D>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A
const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
GGML_UNUSED(Q_v);
@ -179,7 +162,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
const int u = Q_q8[k_KQ_0/WARP_SIZE];
const int sumi = __dp4a(v, u, 0);
const int sumi = ggml_cuda_dp4a(v, u, 0);
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
@ -197,19 +180,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
}
return sum;
#else
GGML_UNUSED(K_c);
GGML_UNUSED(Q_v);
GGML_UNUSED(Q_q8);
GGML_UNUSED(Q_ds_v);
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
template<typename T, int D>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A
const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
GGML_UNUSED(Q_v);
@ -234,7 +209,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
const int u = Q_q8[k_KQ_0/WARP_SIZE];
const int sumi = __dp4a(v, u, 0);
const int sumi = ggml_cuda_dp4a(v, u, 0);
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
@ -256,19 +231,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
}
return sum;
#else
GGML_UNUSED(K_c);
GGML_UNUSED(Q_v);
GGML_UNUSED(Q_q8);
GGML_UNUSED(Q_ds_v);
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
template <typename T, int D>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A
const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
GGML_UNUSED(Q_v);
@ -297,13 +264,6 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
}
return sum;
#else
GGML_UNUSED(K_c);
GGML_UNUSED(Q_v);
GGML_UNUSED(Q_q8);
GGML_UNUSED(Q_ds_v);
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
template <typename T, int D>