rename CC_TURING to CC_VOLTA
This commit is contained in:
parent
62832c57c4
commit
59937e45a3
1 changed files with 45 additions and 44 deletions
89
ggml-cuda.cu
89
ggml-cuda.cu
|
@ -80,9 +80,9 @@
|
|||
#include "ggml.h"
|
||||
|
||||
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
||||
#define CC_TURING 700
|
||||
#define CC_VOLTA 700
|
||||
#define CC_OFFSET_AMD 1000000
|
||||
#define CC_RDNA2 CC_OFFSET_AMD + 1030
|
||||
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
#define __CUDA_ARCH__ 1300
|
||||
|
@ -3553,7 +3553,7 @@ template <bool need_check> static __global__ void
|
|||
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
||||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
|
||||
#elif __CUDA_ARCH__ >= CC_TURING
|
||||
#elif __CUDA_ARCH__ >= CC_VOLTA
|
||||
const int mmq_x = MMQ_X_Q4_0_AMPERE;
|
||||
const int mmq_y = MMQ_Y_Q4_0_AMPERE;
|
||||
const int nwarps = NWARPS_Q4_0_AMPERE;
|
||||
|
@ -3573,7 +3573,7 @@ template <bool need_check> static __global__ void
|
|||
#else
|
||||
(void) vec_dot_q4_0_q8_1_mul_mat;
|
||||
assert(false);
|
||||
#endif // __CUDA_ARCH__ >= CC_TURING
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
#define MMQ_X_Q4_1_RDNA2 64
|
||||
|
@ -3594,9 +3594,9 @@ template <bool need_check> static __global__ void
|
|||
#if defined(RDNA3) || defined(RDNA2)
|
||||
__launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
|
||||
#endif // defined(RDNA3) || defined(RDNA2)
|
||||
#elif __CUDA_ARCH__ < CC_TURING
|
||||
#elif __CUDA_ARCH__ < CC_VOLTA
|
||||
__launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
|
||||
#endif // __CUDA_ARCH__ < CC_TURING
|
||||
#endif // __CUDA_ARCH__ < CC_VOLTA
|
||||
mul_mat_q4_1(
|
||||
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
||||
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
||||
|
@ -3616,7 +3616,7 @@ template <bool need_check> static __global__ void
|
|||
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
|
||||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
|
||||
#elif __CUDA_ARCH__ >= CC_TURING
|
||||
#elif __CUDA_ARCH__ >= CC_VOLTA
|
||||
const int mmq_x = MMQ_X_Q4_1_AMPERE;
|
||||
const int mmq_y = MMQ_Y_Q4_1_AMPERE;
|
||||
const int nwarps = NWARPS_Q4_1_AMPERE;
|
||||
|
@ -3636,7 +3636,7 @@ template <bool need_check> static __global__ void
|
|||
#else
|
||||
(void) vec_dot_q4_1_q8_1_mul_mat;
|
||||
assert(false);
|
||||
#endif // __CUDA_ARCH__ >= CC_TURING
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
#define MMQ_X_Q5_0_RDNA2 64
|
||||
|
@ -3677,7 +3677,7 @@ template <bool need_check> static __global__ void
|
|||
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
||||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
|
||||
#elif __CUDA_ARCH__ >= CC_TURING
|
||||
#elif __CUDA_ARCH__ >= CC_VOLTA
|
||||
const int mmq_x = MMQ_X_Q5_0_AMPERE;
|
||||
const int mmq_y = MMQ_Y_Q5_0_AMPERE;
|
||||
const int nwarps = NWARPS_Q5_0_AMPERE;
|
||||
|
@ -3697,7 +3697,7 @@ template <bool need_check> static __global__ void
|
|||
#else
|
||||
(void) vec_dot_q5_0_q8_1_mul_mat;
|
||||
assert(false);
|
||||
#endif // __CUDA_ARCH__ >= CC_TURING
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
#define MMQ_X_Q5_1_RDNA2 64
|
||||
|
@ -3738,7 +3738,7 @@ mul_mat_q5_1(
|
|||
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
||||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
|
||||
#elif __CUDA_ARCH__ >= CC_TURING
|
||||
#elif __CUDA_ARCH__ >= CC_VOLTA
|
||||
const int mmq_x = MMQ_X_Q5_1_AMPERE;
|
||||
const int mmq_y = MMQ_Y_Q5_1_AMPERE;
|
||||
const int nwarps = NWARPS_Q5_1_AMPERE;
|
||||
|
@ -3758,7 +3758,7 @@ mul_mat_q5_1(
|
|||
#else
|
||||
(void) vec_dot_q5_1_q8_1_mul_mat;
|
||||
assert(false);
|
||||
#endif // __CUDA_ARCH__ >= CC_TURING
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
#define MMQ_X_Q8_0_RDNA2 64
|
||||
|
@ -3799,7 +3799,7 @@ template <bool need_check> static __global__ void
|
|||
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
||||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
|
||||
#elif __CUDA_ARCH__ >= CC_TURING
|
||||
#elif __CUDA_ARCH__ >= CC_VOLTA
|
||||
const int mmq_x = MMQ_X_Q8_0_AMPERE;
|
||||
const int mmq_y = MMQ_Y_Q8_0_AMPERE;
|
||||
const int nwarps = NWARPS_Q8_0_AMPERE;
|
||||
|
@ -3819,7 +3819,7 @@ template <bool need_check> static __global__ void
|
|||
#else
|
||||
(void) vec_dot_q8_0_q8_1_mul_mat;
|
||||
assert(false);
|
||||
#endif // __CUDA_ARCH__ >= CC_TURING
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
#define MMQ_X_Q2_K_RDNA2 64
|
||||
|
@ -3860,7 +3860,7 @@ mul_mat_q2_K(
|
|||
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
|
||||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
|
||||
#elif __CUDA_ARCH__ >= CC_TURING
|
||||
#elif __CUDA_ARCH__ >= CC_VOLTA
|
||||
const int mmq_x = MMQ_X_Q2_K_AMPERE;
|
||||
const int mmq_y = MMQ_Y_Q2_K_AMPERE;
|
||||
const int nwarps = NWARPS_Q2_K_AMPERE;
|
||||
|
@ -3880,7 +3880,7 @@ mul_mat_q2_K(
|
|||
#else
|
||||
(void) vec_dot_q2_K_q8_1_mul_mat;
|
||||
assert(false);
|
||||
#endif // __CUDA_ARCH__ >= CC_TURING
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
#define MMQ_X_Q3_K_RDNA2 128
|
||||
|
@ -3901,9 +3901,9 @@ template <bool need_check> static __global__ void
|
|||
#if defined(RDNA3) || defined(RDNA2)
|
||||
__launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
|
||||
#endif // defined(RDNA3) || defined(RDNA2)
|
||||
#elif __CUDA_ARCH__ < CC_TURING
|
||||
#elif __CUDA_ARCH__ < CC_VOLTA
|
||||
__launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
|
||||
#endif // __CUDA_ARCH__ < CC_TURING
|
||||
#endif // __CUDA_ARCH__ < CC_VOLTA
|
||||
mul_mat_q3_K(
|
||||
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
||||
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
||||
|
@ -3923,7 +3923,7 @@ template <bool need_check> static __global__ void
|
|||
load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
|
||||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
|
||||
#elif __CUDA_ARCH__ >= CC_TURING
|
||||
#elif __CUDA_ARCH__ >= CC_VOLTA
|
||||
const int mmq_x = MMQ_X_Q3_K_AMPERE;
|
||||
const int mmq_y = MMQ_Y_Q3_K_AMPERE;
|
||||
const int nwarps = NWARPS_Q3_K_AMPERE;
|
||||
|
@ -3943,7 +3943,7 @@ template <bool need_check> static __global__ void
|
|||
#else
|
||||
(void) vec_dot_q3_K_q8_1_mul_mat;
|
||||
assert(false);
|
||||
#endif // __CUDA_ARCH__ >= CC_TURING
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
#define MMQ_X_Q4_K_RDNA2 64
|
||||
|
@ -3964,9 +3964,9 @@ template <bool need_check> static __global__ void
|
|||
#if defined(RDNA3) || defined(RDNA2)
|
||||
__launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
|
||||
#endif // defined(RDNA3) || defined(RDNA2)
|
||||
#elif __CUDA_ARCH__ < CC_TURING
|
||||
#elif __CUDA_ARCH__ < CC_VOLTA
|
||||
__launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
|
||||
#endif // __CUDA_ARCH__ < CC_TURING
|
||||
#endif // __CUDA_ARCH__ < CC_VOLTA
|
||||
mul_mat_q4_K(
|
||||
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
||||
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
||||
|
@ -3986,7 +3986,7 @@ template <bool need_check> static __global__ void
|
|||
load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
|
||||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
|
||||
#elif __CUDA_ARCH__ >= CC_TURING
|
||||
#elif __CUDA_ARCH__ >= CC_VOLTA
|
||||
const int mmq_x = MMQ_X_Q4_K_AMPERE;
|
||||
const int mmq_y = MMQ_Y_Q4_K_AMPERE;
|
||||
const int nwarps = NWARPS_Q4_K_AMPERE;
|
||||
|
@ -4006,7 +4006,7 @@ template <bool need_check> static __global__ void
|
|||
#else
|
||||
(void) vec_dot_q4_K_q8_1_mul_mat;
|
||||
assert(false);
|
||||
#endif // __CUDA_ARCH__ >= CC_TURING
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
#define MMQ_X_Q5_K_RDNA2 64
|
||||
|
@ -4047,7 +4047,7 @@ mul_mat_q5_K(
|
|||
load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
|
||||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
|
||||
#elif __CUDA_ARCH__ >= CC_TURING
|
||||
#elif __CUDA_ARCH__ >= CC_VOLTA
|
||||
const int mmq_x = MMQ_X_Q5_K_AMPERE;
|
||||
const int mmq_y = MMQ_Y_Q5_K_AMPERE;
|
||||
const int nwarps = NWARPS_Q5_K_AMPERE;
|
||||
|
@ -4067,7 +4067,7 @@ mul_mat_q5_K(
|
|||
#else
|
||||
(void) vec_dot_q5_K_q8_1_mul_mat;
|
||||
assert(false);
|
||||
#endif // __CUDA_ARCH__ >= CC_TURING
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
#define MMQ_X_Q6_K_RDNA2 64
|
||||
|
@ -4088,9 +4088,9 @@ template <bool need_check> static __global__ void
|
|||
#if defined(RDNA3) || defined(RDNA2)
|
||||
__launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
|
||||
#endif // defined(RDNA3) || defined(RDNA2)
|
||||
#elif __CUDA_ARCH__ < CC_TURING
|
||||
#elif __CUDA_ARCH__ < CC_VOLTA
|
||||
__launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
|
||||
#endif // __CUDA_ARCH__ < CC_TURING
|
||||
#endif // __CUDA_ARCH__ < CC_VOLTA
|
||||
mul_mat_q6_K(
|
||||
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
||||
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
||||
|
@ -4110,7 +4110,7 @@ template <bool need_check> static __global__ void
|
|||
load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
|
||||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
|
||||
#elif __CUDA_ARCH__ >= CC_TURING
|
||||
#elif __CUDA_ARCH__ >= CC_VOLTA
|
||||
const int mmq_x = MMQ_X_Q6_K_AMPERE;
|
||||
const int mmq_y = MMQ_Y_Q6_K_AMPERE;
|
||||
const int nwarps = NWARPS_Q6_K_AMPERE;
|
||||
|
@ -4130,7 +4130,7 @@ template <bool need_check> static __global__ void
|
|||
#else
|
||||
(void) vec_dot_q6_K_q8_1_mul_mat;
|
||||
assert(false);
|
||||
#endif // __CUDA_ARCH__ >= CC_TURING
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
||||
|
@ -4674,6 +4674,7 @@ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cu
|
|||
dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
|
@ -4955,7 +4956,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
|
|||
mmq_x = MMQ_X_Q4_0_RDNA1;
|
||||
mmq_y = MMQ_Y_Q4_0_RDNA1;
|
||||
nwarps = NWARPS_Q4_0_RDNA1;
|
||||
} else if (compute_capability >= CC_TURING) {
|
||||
} else if (compute_capability >= CC_VOLTA) {
|
||||
mmq_x = MMQ_X_Q4_0_AMPERE;
|
||||
mmq_y = MMQ_Y_Q4_0_AMPERE;
|
||||
nwarps = NWARPS_Q4_0_AMPERE;
|
||||
|
@ -5000,7 +5001,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
|
|||
mmq_x = MMQ_X_Q4_1_RDNA1;
|
||||
mmq_y = MMQ_Y_Q4_1_RDNA1;
|
||||
nwarps = NWARPS_Q4_1_RDNA1;
|
||||
} else if (compute_capability >= CC_TURING) {
|
||||
} else if (compute_capability >= CC_VOLTA) {
|
||||
mmq_x = MMQ_X_Q4_1_AMPERE;
|
||||
mmq_y = MMQ_Y_Q4_1_AMPERE;
|
||||
nwarps = NWARPS_Q4_1_AMPERE;
|
||||
|
@ -5045,7 +5046,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
|
|||
mmq_x = MMQ_X_Q5_0_RDNA1;
|
||||
mmq_y = MMQ_Y_Q5_0_RDNA1;
|
||||
nwarps = NWARPS_Q5_0_RDNA1;
|
||||
} else if (compute_capability >= CC_TURING) {
|
||||
} else if (compute_capability >= CC_VOLTA) {
|
||||
mmq_x = MMQ_X_Q5_0_AMPERE;
|
||||
mmq_y = MMQ_Y_Q5_0_AMPERE;
|
||||
nwarps = NWARPS_Q5_0_AMPERE;
|
||||
|
@ -5090,7 +5091,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
|
|||
mmq_x = MMQ_X_Q5_1_RDNA1;
|
||||
mmq_y = MMQ_Y_Q5_1_RDNA1;
|
||||
nwarps = NWARPS_Q5_1_RDNA1;
|
||||
} else if (compute_capability >= CC_TURING) {
|
||||
} else if (compute_capability >= CC_VOLTA) {
|
||||
mmq_x = MMQ_X_Q5_1_AMPERE;
|
||||
mmq_y = MMQ_Y_Q5_1_AMPERE;
|
||||
nwarps = NWARPS_Q5_1_AMPERE;
|
||||
|
@ -5135,7 +5136,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
|
|||
mmq_x = MMQ_X_Q8_0_RDNA1;
|
||||
mmq_y = MMQ_Y_Q8_0_RDNA1;
|
||||
nwarps = NWARPS_Q8_0_RDNA1;
|
||||
} else if (compute_capability >= CC_TURING) {
|
||||
} else if (compute_capability >= CC_VOLTA) {
|
||||
mmq_x = MMQ_X_Q8_0_AMPERE;
|
||||
mmq_y = MMQ_Y_Q8_0_AMPERE;
|
||||
nwarps = NWARPS_Q8_0_AMPERE;
|
||||
|
@ -5180,7 +5181,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
|
|||
mmq_x = MMQ_X_Q2_K_RDNA1;
|
||||
mmq_y = MMQ_Y_Q2_K_RDNA1;
|
||||
nwarps = NWARPS_Q2_K_RDNA1;
|
||||
} else if (compute_capability >= CC_TURING) {
|
||||
} else if (compute_capability >= CC_VOLTA) {
|
||||
mmq_x = MMQ_X_Q2_K_AMPERE;
|
||||
mmq_y = MMQ_Y_Q2_K_AMPERE;
|
||||
nwarps = NWARPS_Q2_K_AMPERE;
|
||||
|
@ -5227,7 +5228,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
|
|||
mmq_x = MMQ_X_Q3_K_RDNA1;
|
||||
mmq_y = MMQ_Y_Q3_K_RDNA1;
|
||||
nwarps = NWARPS_Q3_K_RDNA1;
|
||||
} else if (compute_capability >= CC_TURING) {
|
||||
} else if (compute_capability >= CC_VOLTA) {
|
||||
mmq_x = MMQ_X_Q3_K_AMPERE;
|
||||
mmq_y = MMQ_Y_Q3_K_AMPERE;
|
||||
nwarps = NWARPS_Q3_K_AMPERE;
|
||||
|
@ -5273,7 +5274,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
|
|||
mmq_x = MMQ_X_Q4_K_RDNA1;
|
||||
mmq_y = MMQ_Y_Q4_K_RDNA1;
|
||||
nwarps = NWARPS_Q4_K_RDNA1;
|
||||
} else if (compute_capability >= CC_TURING) {
|
||||
} else if (compute_capability >= CC_VOLTA) {
|
||||
mmq_x = MMQ_X_Q4_K_AMPERE;
|
||||
mmq_y = MMQ_Y_Q4_K_AMPERE;
|
||||
nwarps = NWARPS_Q4_K_AMPERE;
|
||||
|
@ -5318,7 +5319,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
|
|||
mmq_x = MMQ_X_Q5_K_RDNA1;
|
||||
mmq_y = MMQ_Y_Q5_K_RDNA1;
|
||||
nwarps = NWARPS_Q5_K_RDNA1;
|
||||
} else if (compute_capability >= CC_TURING) {
|
||||
} else if (compute_capability >= CC_VOLTA) {
|
||||
mmq_x = MMQ_X_Q5_K_AMPERE;
|
||||
mmq_y = MMQ_Y_Q5_K_AMPERE;
|
||||
nwarps = NWARPS_Q5_K_AMPERE;
|
||||
|
@ -5363,7 +5364,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
|
|||
mmq_x = MMQ_X_Q6_K_RDNA1;
|
||||
mmq_y = MMQ_Y_Q6_K_RDNA1;
|
||||
nwarps = NWARPS_Q6_K_RDNA1;
|
||||
} else if (compute_capability >= CC_TURING) {
|
||||
} else if (compute_capability >= CC_VOLTA) {
|
||||
mmq_x = MMQ_X_Q6_K_AMPERE;
|
||||
mmq_y = MMQ_Y_Q6_K_AMPERE;
|
||||
nwarps = NWARPS_Q6_K_AMPERE;
|
||||
|
@ -5941,7 +5942,7 @@ static int64_t get_row_rounding(ggml_type type) {
|
|||
switch(type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
return max_compute_capability >= CC_TURING ? 128 : 64;
|
||||
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
|
@ -5952,7 +5953,7 @@ static int64_t get_row_rounding(ggml_type type) {
|
|||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
return max_compute_capability >= CC_TURING ? 128 : 64;
|
||||
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return 64;
|
||||
default:
|
||||
|
@ -6117,7 +6118,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|||
|
||||
const int compute_capability = g_compute_capabilities[id];
|
||||
|
||||
if (compute_capability >= CC_TURING && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && ldc == row_diff) {
|
||||
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && ldc == row_diff) {
|
||||
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
|
||||
half * src0_as_f16 = nullptr;
|
||||
size_t src0_as = 0;
|
||||
|
@ -6128,7 +6129,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|||
src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
|
||||
to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
|
||||
}
|
||||
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (half *) src0_dd_i : src0_as_f16;
|
||||
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
|
||||
|
||||
half * src1_as_f16 = nullptr;
|
||||
size_t src1_as = 0;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue