CUDA: revise q8_1 data layout for mul_mat_q (#7824)

This commit is contained in:
Johannes Gäßler 2024-06-09 09:42:25 +02:00 committed by GitHub
parent 2decf57bc6
commit 42b53d192f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 282 additions and 151 deletions

View file

@ -11,6 +11,7 @@ void ggml_cuda_op_mul_mat_q(
const int64_t nb01 = src0->nb[1];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
GGML_ASSERT(ne10 % QK8_1 == 0);
const int64_t ne0 = dst->ne[0];
@ -25,7 +26,7 @@ void ggml_cuda_op_mul_mat_q(
// nrows_dst == nrows of the matrix that the kernel writes into
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, nrows_dst};
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
switch (src0->type) {
case GGML_TYPE_Q4_0: