cuda : fix binary broadcast ops (bin_bcast) with non-contiguous src0

This commit is contained in:
slaren 2024-04-17 19:02:52 +02:00
parent bf56fdecb3
commit d68c935c8d
2 changed files with 47 additions and 17 deletions

View file

@ -22,6 +22,7 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
int ne0, int ne1, int ne2, int ne3, int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13, int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3, /*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13) { /*int s10,*/ int s11, int s12, int s13) {
const int i0s = blockDim.x*blockIdx.x + threadIdx.x; const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
const int i1 = (blockDim.y*blockIdx.y + threadIdx.y); const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
@ -36,9 +37,9 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
const int i12 = i2 % ne12; const int i12 = i2 % ne12;
const int i13 = i3 % ne13; const int i13 = i3 % ne13;
const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
const size_t i_dst = i_src0; const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
const src0_t * src0_row = src0 + i_src0; const src0_t * src0_row = src0 + i_src0;
const src1_t * src1_row = src1 + i_src1; const src1_t * src1_row = src1 + i_src1;
@ -55,6 +56,7 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
int ne0, int ne1, int ne2, int ne3, int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13, int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3, /*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13) { /*int s10,*/ int s11, int s12, int s13) {
const int i = blockDim.x*blockIdx.x + threadIdx.x; const int i = blockDim.x*blockIdx.x + threadIdx.x;
@ -72,9 +74,9 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
const int i12 = i2 % ne12; const int i12 = i2 % ne12;
const int i13 = i3 % ne13; const int i13 = i3 % ne13;
const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
const size_t i_dst = i_src0; const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
const src0_t * src0_row = src0 + i_src0; const src0_t * src0_row = src0 + i_src0;
const src1_t * src1_row = src1 + i_src1; const src1_t * src1_row = src1 + i_src1;
@ -118,17 +120,20 @@ struct bin_bcast_cuda {
cnb[3] *= cne[3]; cnb[3] *= cne[3];
}; };
for (int i = 0; i < 4; i++) { if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
if (nr[i] != 1) { for (int i = 0; i < 4; i++) {
break; if (nr[i] != 1) {
} break;
if (i > 0) { }
collapse_nb(cnb0, cne0); if (i > 0) {
collapse_nb(cnb1, cne1); collapse_nb(cnb0, cne0);
collapse(cne0); collapse_nb(cnb1, cne1);
collapse(cne1); collapse(cne0);
collapse(cne1);
}
} }
} }
{ {
int64_t ne0 = cne0[0]; int64_t ne0 = cne0[0];
int64_t ne1 = cne0[1]; int64_t ne1 = cne0[1];
@ -160,7 +165,28 @@ struct bin_bcast_cuda {
size_t s12 = nb12 / sizeof(src1_t); size_t s12 = nb12 / sizeof(src1_t);
size_t s13 = nb13 / sizeof(src1_t); size_t s13 = nb13 / sizeof(src1_t);
size_t s00 = nb00 / sizeof(src0_t);
size_t s01 = nb01 / sizeof(src0_t);
size_t s02 = nb02 / sizeof(src0_t);
size_t s03 = nb03 / sizeof(src0_t);
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
GGML_ASSERT(s0 == 1); GGML_ASSERT(s0 == 1);
GGML_ASSERT(s00 == 1);
GGML_ASSERT(s10 == 1); GGML_ASSERT(s10 == 1);
const int block_size = 128; const int block_size = 128;
@ -179,13 +205,14 @@ struct bin_bcast_cuda {
); );
if (block_nums.z > 65535) { if (block_nums.z > 65535) {
// this is the maximum number of blocks in z direction, fallback to 1D grid kernel // this is the maximum number of blocks in z dimension, fallback to 1D grid kernel
int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size; int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>( k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
src0_dd, src1_dd, dst_dd, src0_dd, src1_dd, dst_dd,
ne0, ne1, ne2, ne3, ne0, ne1, ne2, ne3,
ne10, ne11, ne12, ne13, ne10, ne11, ne12, ne13,
/* s0, */ s1, s2, s3, /* s0, */ s1, s2, s3,
/* s00, */ s01, s02, s03,
/* s10, */ s11, s12, s13); /* s10, */ s11, s12, s13);
} else { } else {
k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>( k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
@ -193,6 +220,7 @@ struct bin_bcast_cuda {
ne0, ne1, ne2, ne3, ne0, ne1, ne2, ne3,
ne10, ne11, ne12, ne13, ne10, ne11, ne12, ne13,
/* s0, */ s1, s2, s3, /* s0, */ s1, s2, s3,
/* s00, */ s01, s02, s03,
/* s10, */ s11, s12, s13); /* s10, */ s11, s12, s13);
} }
} }

View file

@ -6171,9 +6171,6 @@ static struct ggml_tensor * llm_build_moe_ffn(
ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens, ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
experts->nb[2], i*experts->nb[1]); experts->nb[2], i*experts->nb[1]);
// FIXME: non-contiguous add broken in cuda
cur_expert = ggml_cont(ctx, cur_expert);
if (i == 0) { if (i == 0) {
moe_out = cur_expert; moe_out = cur_expert;
} else { } else {
@ -6181,6 +6178,11 @@ static struct ggml_tensor * llm_build_moe_ffn(
} }
} }
if (n_expert_used == 1) {
// avoid returning a non-contiguous tensor
moe_out = ggml_cont(ctx, moe_out);
}
return moe_out; return moe_out;
} }