remove MIN_CC_DP4A checks
This commit is contained in:
parent a92595aa93
commit 78754008df

2 changed files with 9 additions and 150 deletions
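All of the deletions below follow the same compile-time guard: a DP4A-capable code path under #if __CUDA_ARCH__ >= MIN_CC_DP4A, and a stub branch that silences unused-parameter warnings and emits NO_DEVICE_CODE otherwise. A minimal sketch of that pattern, using a made-up helper purely for illustration (not code from this commit):

// Illustrative only: a hypothetical helper showing the guard shape that this
// commit strips out of the real vec_dot_* functions.
static __device__ __forceinline__ float guarded_dot_sketch(const int * v, const int * u, const float d) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A
    int sumi = 0;
#pragma unroll
    for (int i = 0; i < 2; ++i) {
        sumi = __dp4a(v[i], u[i], sumi); // per-byte dot product, accumulated into sumi
    }
    return d * sumi;
#else
    GGML_UNUSED(v);
    GGML_UNUSED(u);
    GGML_UNUSED(d);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}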
@@ -54,7 +54,6 @@ typedef float (*vec_dot_KQ_f32_t)(
template<typename T, int D>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
        const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A

    const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
    GGML_UNUSED(Q_v);

@@ -90,19 +89,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
    }

    return sum;
#else
    GGML_UNUSED(K_c);
    GGML_UNUSED(Q_v);
    GGML_UNUSED(Q_q8);
    GGML_UNUSED(Q_ds_v);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

template<typename T, int D>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
        const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A

    const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
    GGML_UNUSED(Q_v);

@@ -142,19 +133,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
    }

    return sum;
#else
    GGML_UNUSED(K_c);
    GGML_UNUSED(Q_v);
    GGML_UNUSED(Q_q8);
    GGML_UNUSED(Q_ds_v);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

template<typename T, int D>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
        const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A

    const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
    GGML_UNUSED(Q_v);

@@ -197,19 +180,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
    }

    return sum;
#else
    GGML_UNUSED(K_c);
    GGML_UNUSED(Q_v);
    GGML_UNUSED(Q_q8);
    GGML_UNUSED(Q_ds_v);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

template<typename T, int D>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
        const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A

    const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
    GGML_UNUSED(Q_v);

@@ -256,19 +231,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
    }

    return sum;
#else
    GGML_UNUSED(K_c);
    GGML_UNUSED(Q_v);
    GGML_UNUSED(Q_q8);
    GGML_UNUSED(Q_ds_v);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

template <typename T, int D>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
        const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A

    const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
    GGML_UNUSED(Q_v);

@@ -297,13 +264,6 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
    }

    return sum;
#else
    GGML_UNUSED(K_c);
    GGML_UNUSED(Q_v);
    GGML_UNUSED(Q_q8);
    GGML_UNUSED(Q_ds_v);
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

template <typename T, int D>
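Every guard in both files cites the same reason, "lowest compute capability for integer intrinsics": the guarded bodies lean on the __dp4a instruction. A standalone sketch of what __dp4a computes, for readers unfamiliar with it (illustrative, not part of the diff):

#include <cstdint>

// __dp4a(a, b, c) treats a and b as four packed signed 8-bit lanes and returns
// c + a0*b0 + a1*b1 + a2*b2 + a3*b3 in a single instruction (sm_61 and newer).
__device__ int dp4a_example(const int a_packed, const int b_packed, const int acc) {
    return __dp4a(a_packed, b_packed, acc);
}

// The same result spelled out on the int8 lanes, as a reference:
__device__ int dp4a_reference(const int a_packed, const int b_packed, int acc) {
    const int8_t * a = (const int8_t *) &a_packed;
    const int8_t * b = (const int8_t *) &b_packed;
#pragma unroll
    for (int k = 0; k < 4; ++k) {
        acc += a[k]*b[k];
    }
    return acc;
}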
@@ -51,7 +51,6 @@ static __device__ __forceinline__ int get_int_b4(const void * x, const int & i32
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
        const int * v, const int * u, const float & d4, const half2 & ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll

@@ -68,9 +67,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp

    // second part effectively subtracts 8 from each quant value
    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q4_1_Q8_1_MMVQ 2
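A short note on the value returned above, since the comment is terse. Each q4_0 nibble q stores the weight d4*(q - 8), and ds8 is assumed here to hold the q8_1 scale d8 in .x and the sum of that block's dequantized values in .y (an assumption about block_q8_1, not something shown in this diff). Over one thread's slice,

    sum_i d4*(q_i - 8) * d8*u_i = d4 * (d8 * sum_i q_i*u_i - 8 * d8 * sum_i u_i)
                                = d4 * (sumi*ds8f.x - 8 * (this slice's share of ds8f.y))

Each thread covers a vdr/QI4_0 fraction of the block while ds8f.y is the whole-block sum, so scaling the correction by vdr/QI4_0 makes the per-thread corrections add up to exactly 8*ds8f.y once all threads have contributed, which is the same "compensate for multiple threads adding it" idea spelled out for the q4_1 case below.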
@@ -79,7 +75,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
        const int * v, const int * u, const half2 & dm4, const half2 & ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll

@@ -105,9 +100,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp

    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q5_0_Q8_1_MMVQ 2

@@ -116,7 +108,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
        const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll

@@ -140,9 +131,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp

    // second part effectively subtracts 16 from each quant value
    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q5_1_Q8_1_MMVQ 2

@@ -151,7 +139,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
        const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll

@@ -184,10 +171,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp

    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q8_0_Q8_1_MMVQ 2

@@ -196,7 +179,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
template <typename T, int vdr> static __device__ __forceinline__ T vec_dot_q8_0_q8_1_impl(
        const int * v, const int * u, const T & d8_0, const T & d8_1) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll

@@ -206,15 +188,11 @@ template <typename T, int vdr> static __device__ __forceinline__ T vec_dot_q8_0_
    }

    return d8_0*d8_1 * ((T) sumi);
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
        const int * v, const int * u, const half2 & dm8, const half2 & ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll

@@ -236,9 +214,6 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp

    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q2_K_Q8_1_MMVQ 1
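The q8_0 x q8_1 case above is the simplest of these kernels and makes the overall pattern easy to see: both operands store plain int8 quants times a scale, so the block dot product reduces to the integer dot times the two scales. A minimal self-contained sketch in the same spirit (hypothetical names; the real loop body is not shown in this hunk):

// Sketch of a q8_0 x q8_1 style dot product: v and u each point to vdr ints,
// every int packing four int8 quants; d8_0 and d8_1 are the block scales.
template <int vdr>
static __device__ __forceinline__ float vec_dot_q8_sketch(const int * v, const int * u, const float d8_0, const float d8_1) {
    int sumi = 0;
#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        sumi = __dp4a(v[i], u[i], sumi); // accumulate 4 int8 products per call
    }
    // no offsets to correct for, so only the two scales remain
    return d8_0*d8_1 * sumi;
}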
@@ -249,7 +224,6 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
        const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
        const half2 & dm2, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

@@ -271,16 +245,12 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
    const float2 dm2f = __half22float2(dm2);

    return dm2f.x*sumf_d - dm2f.y*sumf_m;
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

// contiguous u/y values
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
        const int * __restrict__ v, const int * __restrict__ u, const half2 * dm2, const float & d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

@@ -303,9 +273,6 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
    }

    return d8*(sumf_d - sumf_m);
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q3_K_Q8_1_MMVQ 1

@@ -316,7 +283,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
        const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
        const int & scale_offset, const float & d3, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf = 0.0f;

#pragma unroll

@@ -343,9 +309,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
    }

    return d3 * sumf;
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

// contiguous u/y values

@@ -353,7 +316,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
        const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
        const float & d3, const float & d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int sumi = 0;

#pragma unroll

@@ -370,9 +332,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
    }

    return d3*d8 * sumi;
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q4_K_Q8_1_MMVQ 2
@@ -383,7 +342,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
        const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
        const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

@@ -402,10 +360,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
    const float2 dm4f = __half22float2(dm4);

    return dm4f.x*sumf_d - dm4f.y*sumf_m;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

// contiguous u/y values

@@ -413,7 +367,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
        const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
        const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

@@ -435,10 +388,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
    const float2 dm4f = __half22float2(dm4);

    return dm4f.x*sumf_d - dm4f.y*sumf_m;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q5_K_Q8_1_MMVQ 2

@@ -449,7 +398,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
        const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
        const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

@@ -475,10 +423,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
    const float2 dm5f = __half22float2(dm5);

    return dm5f.x*sumf_d - dm5f.y*sumf_m;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

// contiguous u/y values

@@ -486,7 +430,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
        const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
        const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

@@ -508,10 +451,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
    const float2 dm4f = __half22float2(dm4);

    return dm4f.x*sumf_d - dm4f.y*sumf_m;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_Q6_K_Q8_1_MMVQ 1
@@ -522,7 +461,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
        const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
        const float & d, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf = 0.0f;

#pragma unroll

@@ -539,9 +477,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
    }

    return d*sumf;
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

// contiguous u/y values

@@ -549,7 +484,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
        const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
        const float & d6, const float * __restrict__ d8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    float sumf_d = 0.0f;

#pragma unroll

@@ -569,10 +503,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
    }

    return d6 * sumf_d;

#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
@@ -841,7 +771,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(

static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics

    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq + kbx;

    const int q2 = get_int_b2(bq2->qs, iqs);

@@ -869,16 +799,13 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
    sumi = (ls*sumi + sumi/2)/4;
    const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
    return d * sumi;
#else
    NO_DEVICE_CODE;
#endif
}

#define VDR_IQ2_XS_Q8_1_MMVQ 2

static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics

    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq + kbx;

    const int2 q2_packed = make_int2(get_int_b2(bq2->qs, iqs + 0), get_int_b2(bq2->qs, iqs + 1));

@@ -910,17 +837,13 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
    const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4;
    const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
    return d * sumi;
#else
    GGML_UNUSED(ksigns64);
    NO_DEVICE_CODE;
#endif
}

#define VDR_IQ2_S_Q8_1_MMVQ 2

static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics

    const block_iq2_s * bq2 = (const block_iq2_s *) vbq + kbx;

    const int qs_packed = get_int_b2(bq2->qs, iqs/2);

@@ -961,17 +884,13 @@ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(

    const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
    return d * sumi;
#else
    GGML_UNUSED(ksigns64);
    NO_DEVICE_CODE;
#endif
}

#define VDR_IQ3_XXS_Q8_1_MMVQ 2

static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics

    const block_iq3_xxs * bq3 = (const block_iq3_xxs *) vbq + kbx;

    const int2 q3_packed = make_int2(get_int_b2(bq3->qs, iqs), get_int_b2(bq3->qs, iqs+1));

@@ -999,9 +918,6 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
    sumi = (ls*sumi + sumi/2)/2;
    const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds);
    return d * sumi;
#else
    NO_DEVICE_CODE;
#endif
}

#define VDR_IQ3_S_Q8_1_MMVQ 2
@@ -1009,7 +925,7 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
// TODO: don't use lookup table for signs
static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics

    const block_iq3_s * bq3 = (const block_iq3_s *) vbq + kbx;

    const int2 qs_packed = make_int2(get_int_b2(bq3->qs, iqs + 0), get_int_b2(bq3->qs, iqs + 1));

@@ -1044,15 +960,12 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(

    const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds);
    return d * sumi;
#else
    NO_DEVICE_CODE;
#endif
}

static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_iq1_s * bq1 = (const block_iq1_s *) vbq + kbx;
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics

    const int qs_packed = get_int_b2(bq1->qs, iqs);
    const uint8_t * qs = (const uint8_t *) &qs_packed;

@@ -1077,14 +990,11 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
    const float delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000);
    const float2 ds = __half22float2(bq8_1[iqs].ds);
    return d1q * (ds.x*sumi + ds.y*delta);
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics

    const block_iq1_m * bq1 = (const block_iq1_m *) vbq + kbx;

    const int qs_packed = get_int_b4(bq1->qs, iqs);

@@ -1124,12 +1034,8 @@ static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
    const int sc0 = 2*((tmp >> 0) & 0x07) + 1;
    const int sc1 = 2*((tmp >> 3) & 0x07) + 1;
    return d * ((sumi[0] + sumf[0]) * sc0 + (sumi[1] + sumf[1]) * sc1);
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4) {
    const int q0_32 = (q4 >> 0) & 0x0F0F0F0F;
    const int8_t * q0_8 = (const int8_t *) &q0_32;

@@ -1143,13 +1049,12 @@ static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4) {

    return make_int2(*((const int *) &val0_8), *((const int *) &val1_8));
}
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A

#define VDR_IQ4_NL_Q8_1_MMVQ 2

static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics

    const block_iq4_nl * bq4 = (const block_iq4_nl *) vbq + kbx;

    const int * q8 = (const int *) bq8_1->qs + iqs;

@@ -1166,16 +1071,13 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(

    const float d = __half2float(bq4->d) * __low2float(bq8_1->ds);
    return d * sumi;
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}

#define VDR_IQ4_XS_Q8_1_MMVQ 4

static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics

    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq + kbx;

    int sumi = 0;

@@ -1196,7 +1098,4 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(

    const float d = __half2float(bq4->d) * __low2float(bq8_1[iqs/4].ds);
    return d * sumi;
#else
    NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
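Given the commit title and the 9 additions against 150 deletions, the net effect is presumably that the guarded branch is all that remains and it compiles for every target; whether that relies on a portable dp4a wrapper or on a raised minimum architecture is not visible in this view. Continuing the hypothetical sketch from the top of this page, the post-commit shape would look roughly like this (inferred, not shown in the diff):

// Illustrative only: the earlier guarded_dot_sketch with the MIN_CC_DP4A check
// removed. The bare __dp4a call assumes every compiled target supports DP4A.
static __device__ __forceinline__ float guarded_dot_sketch(const int * v, const int * u, const float d) {
    int sumi = 0;
#pragma unroll
    for (int i = 0; i < 2; ++i) {
        sumi = __dp4a(v[i], u[i], sumi); // per-byte dot product, accumulated into sumi
    }
    return d * sumi;
}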