CUDA: refactor mmq, dmmv, mmvq (#7716)
* CUDA: refactor mmq, dmmv, mmvq * fix out-of-bounds write * struct for qk, qr, qi * fix cmake build * mmq_type_traits
This commit is contained in:
parent
2b3389677a
commit
7d1a378b8f
112 changed files with 1783 additions and 1767 deletions
84
ggml-cuda.cu
84
ggml-cuda.cu
|
@ -633,88 +633,22 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|||
|
||||
// cuda split buffer
|
||||
|
||||
static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
|
||||
int64_t min_compute_capability = INT_MAX;
|
||||
int64_t max_compute_capability = INT_MIN;
|
||||
static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
|
||||
int64_t row_rounding = 0;
|
||||
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
||||
if (tensor_split[id] < (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
|
||||
if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
|
||||
min_compute_capability = ggml_cuda_info().devices[id].cc;
|
||||
}
|
||||
if (max_compute_capability < ggml_cuda_info().devices[id].cc) {
|
||||
max_compute_capability = ggml_cuda_info().devices[id].cc;
|
||||
}
|
||||
if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
switch(type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
return 1;
|
||||
case GGML_TYPE_Q2_K:
|
||||
return max_compute_capability >= CC_RDNA2 ? 128 : 32;
|
||||
case GGML_TYPE_Q3_K:
|
||||
return min_compute_capability < CC_RDNA2 ? 128 : 64;
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ1_M:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
|
||||
}
|
||||
#else
|
||||
switch(type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
return 64;
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
return 1;
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ1_M:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return 64;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
return row_rounding;
|
||||
}
|
||||
|
||||
static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
|
||||
const int64_t nrows = ggml_nrows(tensor);
|
||||
const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
|
||||
const int64_t rounding = get_row_rounding(tensor_split);
|
||||
|
||||
*row_low = id == 0 ? 0 : nrows*tensor_split[id];
|
||||
*row_low -= *row_low % rounding;
|
||||
|
@ -1499,7 +1433,7 @@ static void ggml_cuda_op_mul_mat(
|
|||
// for multi GPU, get the row boundaries from tensor split
|
||||
// and round to mul_mat_q tile sizes
|
||||
if (split) {
|
||||
const int64_t rounding = get_row_rounding(src0->type, tensor_split);
|
||||
const int64_t rounding = get_row_rounding(tensor_split);
|
||||
|
||||
if (id != 0) {
|
||||
dev[id].row_low = ne01*tensor_split[id];
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue