CUDA: support for mat. mul. with ne03 != ne13 (#11656)
parent 1ec208083c
commit fa62da9b2d
2 changed files with 81 additions and 60 deletions
@@ -1366,8 +1366,6 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne13 = src1->ne[3];
     const int64_t nrows1 = ggml_nrows(src1);
 
-    GGML_ASSERT(ne03 == ne13);
-
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
 
@@ -1381,9 +1379,11 @@ static void ggml_cuda_op_mul_mat(
 
     GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
 
-    GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
 
     const int64_t i02_divisor = ne12 / ne02;
+    const int64_t i03_divisor = ne13 / ne03;
 
     const size_t src0_ts = ggml_type_size(src0->type);
     const size_t src0_bs = ggml_blck_size(src0->type);
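The relaxed asserts above generalize the existing dim-2 broadcast to dim 3: each batch dimension of src0 now only has to divide the corresponding dimension of src1, and each src0 matrix is reused divisor-many times. A minimal standalone sketch of the index mapping (illustrative, not part of the patch; the function name is made up):

    #include <assert.h>
    #include <stdint.h>

    // Sketch: with ne12 % ne02 == 0 and ne13 % ne03 == 0, src1 batch (i12, i13)
    // reads the src0 matrix (i12/i02_divisor, i13/i03_divisor); each src0 matrix
    // is therefore reused i02_divisor*i03_divisor times.
    static int64_t src0_matrix_index(int64_t i12, int64_t i13,
                                     int64_t ne02, int64_t ne03,
                                     int64_t ne12, int64_t ne13) {
        assert(ne12 % ne02 == 0 && ne13 % ne03 == 0);
        const int64_t i02 = i12 / (ne12 / ne02);
        const int64_t i03 = i13 / (ne13 / ne03);
        return i03*ne02 + i02; // flattened matrix index, matching the pointer math below
    }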
@@ -1399,6 +1399,7 @@ static void ggml_cuda_op_mul_mat(
     GGML_ASSERT(!(split && ne02 > 1));
     GGML_ASSERT(!(split && ne03 > 1));
     GGML_ASSERT(!(split && ne02 < ne12));
+    GGML_ASSERT(!(split && ne03 < ne13));
 
     ggml_tensor_extra_gpu * src0_extra = split ? (ggml_tensor_extra_gpu *) src0->extra : nullptr;
 
@@ -1562,7 +1563,8 @@ static void ggml_cuda_op_mul_mat(
                 }
 
                 // for split tensors the data begins at i0 == i0_offset_low
-                char  * src0_dd_i  = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
+                const size_t nbytes_src0_matrix = ne01*ne00*src0_ts / src0_bs;
+                char  * src0_dd_i  = dev[id].src0_dd + ((i03/i03_divisor)*ne02 + (i02/i02_divisor)) * nbytes_src0_matrix;
                 float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
                 char  * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset;
                 float * dst_dd_i   = dev[id].dst_dd   + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);
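The rewritten offset works because the per-device src0 buffer stores the 2D matrices back to back: nbytes_src0_matrix is the byte size of one ne00 x ne01 matrix (ggml_type_size() returns bytes per block and ggml_blck_size() elements per block, so the division handles quantized types), and ((i03/i03_divisor)*ne02 + (i02/i02_divisor)) is the flattened index of the matrix to read. A standalone equivalent of the size computation (sketch, not backend code):

    #include <stddef.h>
    #include <stdint.h>

    // Byte size of one ne00 x ne01 matrix for a possibly block-quantized type,
    // given bytes per block (type_size) and elements per block (block_size).
    static size_t nbytes_matrix(int64_t ne00, int64_t ne01,
                                size_t type_size, int64_t block_size) {
        return (size_t)(ne01*ne00)*type_size / (size_t)block_size;
    }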
@@ -1606,8 +1608,9 @@ static void ggml_cuda_op_mul_mat(
                     CUDA_CHECK(cudaGetLastError());
                 }
 
-                if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
+                if (src1_col_0 == 0 && !src0_is_contiguous && i03 % i03_divisor == 0 && i02 % i02_divisor == 0) {
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
+                        src0_dd_i, src0, i03/i03_divisor, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
                 }
 
                 // do the computation
@@ -1882,7 +1885,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (!split && use_mul_mat_vec && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+    if (!split && use_mul_mat_vec && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
         ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
@@ -2216,12 +2219,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             ggml_cuda_op_rms_norm_back(ctx, dst);
             break;
         case GGML_OP_MUL_MAT:
-            if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-                GGML_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
-                return false;
-            } else {
-                ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
-            }
+            ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
             break;
         case GGML_OP_MUL_MAT_ID:
             ggml_cuda_mul_mat_id(ctx, dst);
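With broadcasting handled inside ggml_cuda_mul_mat, the CPU-fallback branch is no longer needed. For illustration, a graph along these lines (a sketch using the public ggml API; shapes are made up and ctx is an already-initialized ggml_context) previously hit the fallback and should now run on the CUDA backend:

    // src0: 64x16 matrices with batch dims 2 x 1; src1: 64x8 with batch dims 2 x 4.
    // ne03 = 1 != ne13 = 4, so src0's dim 3 is broadcast across src1's.
    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 16, 2, 1);
    struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64,  8, 2, 4);
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b); // result is 16 x 8 x 2 x 4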
@@ -2998,9 +2996,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
                     return false;
                 }
-                if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
-                    return false;
-                }
 #ifdef GGML_USE_MUSA
                 if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
                     !ggml_is_transposed(a) && !ggml_is_transposed(b)) {