CUDA: use MMQ instead of cuBLAS by default (#8075)
parent d62e4aaa02
commit a818f3028d
8 changed files with 124 additions and 122 deletions
@@ -146,23 +146,6 @@
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 #define CC_RDNA3 (CC_OFFSET_AMD + 1100)
 
-// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
-// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
-// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
-// - 7B quantum model: +100-200 MB
-// - 13B quantum model: +200-400 MB
-//
-//#define GGML_CUDA_FORCE_MMQ
-
-// TODO: improve this to be correct for more hardware
-//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-#if !defined(GGML_CUDA_FORCE_MMQ)
-#define CUDA_USE_TENSOR_CORES
-#endif
-
-#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
-#define MMQ_MAX_BATCH_SIZE 64 // max batch size to use MMQ kernels when tensor cores are available
-
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
 #if defined(_MSC_VER)

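Note: the comment block deleted here documented the old default (cuBLAS) and its VRAM cost for quantized ("quantum") models. As a rough, purely illustrative size check, under the assumption (mine, not the comment's) that the overhead comes from temporary F16 copies of the operands: an F16 element is 2 bytes, so a single 4096 x 4096 scratch matrix already weighs about 32 MiB, which is the right order of magnitude for the +100-200 MB quoted for a 7B model.

// Illustrative arithmetic only; the matrix shapes are made-up examples.
#include <cstdio>

int main() {
    const long long shapes[][2] = { {4096, 4096}, {4096, 11008} };
    for (const auto & s : shapes) {
        const double mib = s[0] * s[1] * 2 / (1024.0 * 1024.0);   // 2 bytes per F16 element
        std::printf("F16 scratch for %lld x %lld: %.1f MiB\n", s[0], s[1], mib);
    }
    return 0;
}
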
@@ -343,15 +326,15 @@ static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int
 #define INT8_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
 
-static bool fast_fp16_available(const int cc) {
+static constexpr bool fast_fp16_available(const int cc) {
     return cc >= CC_PASCAL && cc != 610;
 }
 
-static bool fp16_mma_available(const int cc) {
+static constexpr bool fp16_mma_available(const int cc) {
     return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
 }
 
-static bool int8_mma_available(const int cc) {
+static constexpr bool int8_mma_available(const int cc) {
     return cc < CC_OFFSET_AMD && cc >= CC_TURING;
 }
 
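Note: turning these availability checks into constexpr is what lets the new host-side tile helpers introduced later in this commit (get_mmq_x_max_host / get_mmq_y_host in mmq.cuh) call them in constant expressions. A minimal standalone sketch of that pattern, with assumed CC_* values rather than the constants from common.cuh:

#include <cstdio>

// Assumed illustrative thresholds; the real constants are defined in common.cuh.
constexpr int CC_VOLTA      = 700;
constexpr int CC_TURING     = 750;
constexpr int CC_OFFSET_AMD = 1000000;

static constexpr bool int8_mma_available(const int cc) {
    return cc < CC_OFFSET_AMD && cc >= CC_TURING;
}

// Because the check is constexpr, a host-side tile selector can run at compile time.
static constexpr int mmq_y_sketch(const int cc) {
    return int8_mma_available(cc) || cc >= CC_VOLTA ? 128 : 64;
}

static_assert(mmq_y_sketch(750) == 128, "Turing and newer get the 128-row tile");
static_assert(mmq_y_sketch(610) ==  64, "pre-Volta stays on the 64-row tile");

int main() {
    std::printf("mmq_y for cc=860: %d\n", mmq_y_sketch(860));
    return 0;
}
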
@@ -643,19 +626,6 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
     static constexpr int qi = QI3_S;
 };
 
-static constexpr int get_mmq_x_max_host(int cc) {
-#ifdef CUDA_USE_TENSOR_CORES
-    return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
-#else
-    return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
-#endif // CUDA_USE_TENSOR_CORES
-}
-
-// Round rows to this value for --split-mode row:
-static constexpr int get_mmq_y_host(int cc) {
-    return cc >= CC_VOLTA ? 128 : 64;
-}
-
 //////////////////////
 
 struct ggml_cuda_device_info {

@@ -69,7 +69,13 @@ void ggml_cuda_op_mul_mat_q(
     GGML_UNUSED(src1_ddf_i);
 }
 
-bool ggml_cuda_supports_mmq(enum ggml_type type) {
+bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
+#ifdef GGML_CUDA_FORCE_CUBLAS
+    return false;
+#endif // GGML_CUDA_FORCE_CUBLAS
+
+    bool mmq_supported;
+
     switch (type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
@@ -81,8 +87,32 @@ bool ggml_cuda_supports_mmq(enum ggml_type type) {
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
-            return true;
+            mmq_supported = true;
+            break;
         default:
-            return false;
+            mmq_supported = false;
+            break;
     }
+
+    if (!mmq_supported) {
+        return false;
+    }
+
+    if (int8_mma_available(cc)) {
+        return true;
+    }
+
+    if (cc < MIN_CC_DP4A) {
+        return false;
+    }
+
+#ifdef GGML_CUDA_FORCE_MMQ
+    return true;
+#endif //GGML_CUDA_FORCE_MMQ
+
+    if (cc < CC_OFFSET_AMD) {
+        return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    }
+
+    return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }

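Note: this rewritten function is where the commit title's new default lives. For a quantization type that MMQ supports, MMQ is now chosen unless the build forces cuBLAS: GPUs with int8 tensor cores (Turing and newer) always take MMQ, GPUs without dp4a never do, and everything in between uses MMQ only while the batch size ne11 stays below MMQ_DP4A_MAX_BATCH_SIZE. The host-only sketch below mirrors that runtime cascade; it leaves out the compile-time GGML_CUDA_FORCE_* overrides, and the CC_* / MIN_CC_DP4A values are assumptions for illustration, not taken from the repository's headers.

#include <cstdint>
#include <cstdio>

constexpr int MIN_CC_DP4A   = 610;      // assumed
constexpr int CC_VOLTA      = 700;      // assumed
constexpr int CC_TURING     = 750;      // assumed
constexpr int CC_OFFSET_AMD = 1000000;  // assumed
constexpr int CC_RDNA3      = CC_OFFSET_AMD + 1100;  // matches the define in the first hunk above
constexpr int64_t MMQ_DP4A_MAX_BATCH_SIZE = 64;

constexpr bool int8_mma_available(const int cc) {
    return cc < CC_OFFSET_AMD && cc >= CC_TURING;
}

// Mirrors the runtime tail of ggml_cuda_should_use_mmq for a supported type.
static bool should_use_mmq_sketch(const int cc, const int64_t ne11) {
    if (int8_mma_available(cc)) {
        return true;                 // int8 MMA (Turing+): always prefer MMQ
    }
    if (cc < MIN_CC_DP4A) {
        return false;                // no dp4a instruction: fall back to cuBLAS
    }
    if (cc < CC_OFFSET_AMD) {
        return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
    }
    return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}

int main() {
    std::printf("cc 610, batch 512 -> MMQ? %d\n", should_use_mmq_sketch(610, 512));
    std::printf("cc 700, batch 512 -> MMQ? %d\n", should_use_mmq_sketch(700, 512));
    std::printf("cc 700, batch  16 -> MMQ? %d\n", should_use_mmq_sketch(700,  16));
    return 0;
}
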
@@ -7,6 +7,8 @@
 #include <climits>
 #include <cstdint>
 
+#define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
+
 typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int & kbx0, const int & i_max, const int & stride);
 typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k0);
 typedef void (*mmq_write_back_t)(const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max);

@@ -24,25 +26,42 @@ struct tile_x_sizes {
     int sc;
 };
 
-// get_mmq_x_max_host is in common.cuh so that it can be used to determine the correct way to round for --split-mode row
-
-static constexpr __device__ int get_mmq_x_max_device() {
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    return 64;
+static constexpr int get_mmq_x_max_host(const int cc) {
+    return int8_mma_available(cc) ? 128 :
+#ifdef GGML_CUDA_FORCE_MMQ
+        cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
 #else
-#if __CUDA_ARCH__ >= CC_VOLTA
-#ifdef CUDA_USE_TENSOR_CORES
-    return MMQ_MAX_BATCH_SIZE;
-#else
-    return 128;
-#endif // CUDA_USE_TENSOR_CORES
-#else
-    return 64;
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+        cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64;
+#endif // GGML_CUDA_FORCE_MMQ
 }
 
-// get_mmq_y_host is in common.cuh so that it can be used to determine the correct way to round for --split-mode row
+static constexpr __device__ int get_mmq_x_max_device() {
+#ifdef INT8_MMA_AVAILABLE
+    return 128;
+#else // INT8_MMA_AVAILABLE
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    return 128;
+#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+#if __CUDA_ARCH__ >= CC_VOLTA
+#ifdef GGML_CUDA_FORCE_MMQ
+    return MMQ_DP4A_MAX_BATCH_SIZE;
+#else // GGML_CUDA_FORCE_MMQ
+    return 128;
+#endif // GGML_CUDA_FORCE_MMQ
+#else // __CUDA_ARCH__ >= CC_VOLTA
+
+    return 64;
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#endif // INT8_MMA_AVAILABLE
+}
+
+static constexpr int get_mmq_y_host(const int cc) {
+    return int8_mma_available(cc) || cc >= CC_VOLTA ? 128 : 64;
+}
 
 static constexpr __device__ int get_mmq_y_device() {
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

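Note: the comments removed above explain why these helpers exist in host code at all: when rows are split across GPUs with --split-mode row, the host has to round its row slices to the same tile height the device kernels will use. A small illustrative sketch of that rounding step, with assumed CC_* values (it is not the repository's actual splitting code):

#include <cstdint>
#include <cstdio>

constexpr int CC_VOLTA      = 700;      // assumed
constexpr int CC_TURING     = 750;      // assumed
constexpr int CC_OFFSET_AMD = 1000000;  // assumed

constexpr bool int8_mma_available(const int cc) {
    return cc < CC_OFFSET_AMD && cc >= CC_TURING;
}

// Same shape as the new get_mmq_y_host above.
constexpr int get_mmq_y_host_sketch(const int cc) {
    return int8_mma_available(cc) || cc >= CC_VOLTA ? 128 : 64;
}

// Round a per-GPU row slice up to a whole number of MMQ tiles.
static int64_t round_rows(const int64_t nrows, const int cc) {
    const int64_t granularity = get_mmq_y_host_sketch(cc);
    return ((nrows + granularity - 1) / granularity) * granularity;
}

int main() {
    std::printf("%lld\n", (long long) round_rows(1000, 750));  // 1024: eight 128-row tiles
    return 0;
}
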
@@ -2590,4 +2609,4 @@ void ggml_cuda_op_mul_mat_q(
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, cudaStream_t stream);
 
-bool ggml_cuda_supports_mmq(enum ggml_type type);
+bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11);

@@ -1,5 +1,7 @@
 #include "common.cuh"
 
+#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
+
 void ggml_cuda_op_mul_mat_vec_q(
     ggml_backend_cuda_context & ctx,
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,

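Note: with MMVQ_MAX_BATCH_SIZE now next to the MMVQ code and MMQ_DP4A_MAX_BATCH_SIZE next to the MMQ code, the two defines spell out the batch-size tiers for quantized matrix multiplication: MMVQ for very small batches (up to 8 columns of src1), then MMQ or cuBLAS according to ggml_cuda_should_use_mmq above. The sketch below only illustrates that tiering; the real dispatch lives in the CUDA backend and may differ in detail.

#include <cstdint>
#include <cstdio>

constexpr int64_t MMVQ_MAX_BATCH_SIZE = 8;  // from mmvq.cuh above

// Hypothetical tiering helper, not the repository's code.
static const char * pick_kernel(const int64_t ne11, const bool mmq_preferred) {
    if (ne11 <= MMVQ_MAX_BATCH_SIZE) {
        return "MMVQ (matrix-vector kernels for small batches)";
    }
    return mmq_preferred ? "MMQ" : "cuBLAS (dequantize + GEMM)";
}

int main() {
    std::printf("batch   1: %s\n", pick_kernel(1,   true));
    std::printf("batch 512: %s\n", pick_kernel(512, false));
    return 0;
}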