CUDA: use MMQ instead of cuBLAS by default (#8075)

This commit is contained in:
Johannes Gäßler 2024-06-24 17:43:42 +02:00 committed by GitHub
parent d62e4aaa02
commit a818f3028d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 124 additions and 122 deletions

View file

@ -146,23 +146,6 @@
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
// - 7B quantum model: +100-200 MB
// - 13B quantum model: +200-400 MB
//
//#define GGML_CUDA_FORCE_MMQ
// TODO: improve this to be correct for more hardware
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
#if !defined(GGML_CUDA_FORCE_MMQ)
#define CUDA_USE_TENSOR_CORES
#endif
#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
#define MMQ_MAX_BATCH_SIZE 64 // max batch size to use MMQ kernels when tensor cores are available
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
#if defined(_MSC_VER)
@ -343,15 +326,15 @@ static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int
#define INT8_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
static bool fast_fp16_available(const int cc) {
static constexpr bool fast_fp16_available(const int cc) {
return cc >= CC_PASCAL && cc != 610;
}
static bool fp16_mma_available(const int cc) {
static constexpr bool fp16_mma_available(const int cc) {
return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
}
static bool int8_mma_available(const int cc) {
static constexpr bool int8_mma_available(const int cc) {
return cc < CC_OFFSET_AMD && cc >= CC_TURING;
}
@ -643,19 +626,6 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
static constexpr int qi = QI3_S;
};
static constexpr int get_mmq_x_max_host(int cc) {
#ifdef CUDA_USE_TENSOR_CORES
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
#else
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
#endif // CUDA_USE_TENSOR_CORES
}
// Round rows to this value for --split-mode row:
static constexpr int get_mmq_y_host(int cc) {
return cc >= CC_VOLTA ? 128 : 64;
}
//////////////////////
struct ggml_cuda_device_info {

View file

@ -69,7 +69,13 @@ void ggml_cuda_op_mul_mat_q(
GGML_UNUSED(src1_ddf_i);
}
bool ggml_cuda_supports_mmq(enum ggml_type type) {
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
#ifdef GGML_CUDA_FORCE_CUBLAS
return false;
#endif // GGML_CUDA_FORCE_CUBLAS
bool mmq_supported;
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
@ -81,8 +87,32 @@ bool ggml_cuda_supports_mmq(enum ggml_type type) {
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
return true;
mmq_supported = true;
break;
default:
return false;
mmq_supported = false;
break;
}
if (!mmq_supported) {
return false;
}
if (int8_mma_available(cc)) {
return true;
}
if (cc < MIN_CC_DP4A) {
return false;
}
#ifdef GGML_CUDA_FORCE_MMQ
return true;
#endif //GGML_CUDA_FORCE_MMQ
if (cc < CC_OFFSET_AMD) {
return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}

View file

@ -7,6 +7,8 @@
#include <climits>
#include <cstdint>
#define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int & kbx0, const int & i_max, const int & stride);
typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k0);
typedef void (*mmq_write_back_t)(const float * __restrict__ sum, float * __restrict__ dst, const int & stride, const int & i_max, const int & j_max);
@ -24,25 +26,42 @@ struct tile_x_sizes {
int sc;
};
// get_mmq_x_max_host is in common.cuh so that it can be used to determine the correct way to round for --split-mode row
static constexpr __device__ int get_mmq_x_max_device() {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
return 64;
static constexpr int get_mmq_x_max_host(const int cc) {
return int8_mma_available(cc) ? 128 :
#ifdef GGML_CUDA_FORCE_MMQ
cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
#else
#if __CUDA_ARCH__ >= CC_VOLTA
#ifdef CUDA_USE_TENSOR_CORES
return MMQ_MAX_BATCH_SIZE;
#else
return 128;
#endif // CUDA_USE_TENSOR_CORES
#else
return 64;
#endif // __CUDA_ARCH__ >= CC_VOLTA
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64;
#endif // GGML_CUDA_FORCE_MMQ
}
// get_mmq_y_host is in common.cuh so that it can be used to determine the correct way to round for --split-mode row
static constexpr __device__ int get_mmq_x_max_device() {
#ifdef INT8_MMA_AVAILABLE
return 128;
#else // INT8_MMA_AVAILABLE
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
return 128;
#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if __CUDA_ARCH__ >= CC_VOLTA
#ifdef GGML_CUDA_FORCE_MMQ
return MMQ_DP4A_MAX_BATCH_SIZE;
#else // GGML_CUDA_FORCE_MMQ
return 128;
#endif // GGML_CUDA_FORCE_MMQ
#else // __CUDA_ARCH__ >= CC_VOLTA
return 64;
#endif // __CUDA_ARCH__ >= CC_VOLTA
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#endif // INT8_MMA_AVAILABLE
}
static constexpr int get_mmq_y_host(const int cc) {
return int8_mma_available(cc) || cc >= CC_VOLTA ? 128 : 64;
}
static constexpr __device__ int get_mmq_y_device() {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
@ -2590,4 +2609,4 @@ void ggml_cuda_op_mul_mat_q(
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream);
bool ggml_cuda_supports_mmq(enum ggml_type type);
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11);

View file

@ -1,5 +1,7 @@
#include "common.cuh"
#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
void ggml_cuda_op_mul_mat_vec_q(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,