This commit is contained in:
Milot Mirdita 2025-01-16 05:23:38 +00:00 committed by GitHub
commit 9db9b205b6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -118,13 +118,28 @@ static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * sr
const size_t s12 = nb12 / ggml_element_size(src1); const size_t s12 = nb12 / ggml_element_size(src1);
//const size_t s13 = nb13 / ggml_element_size(src1); //const size_t s13 = nb13 / ggml_element_size(src1);
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>( // CUDA grids typically cannot exceed 65535 in .y dimension
src0_dd, src1_dd, dst_dd, static const int64_t MAX_GRID_Y = 65535;
for (int64_t startY = 0; startY < ne10; startY += MAX_GRID_Y) {
int64_t chunkSizeY = std::min<int64_t>(MAX_GRID_Y, ne10 - startY);
// launch chunkSizeY blocks in the Y-dimension instead of the entire ne10
dim3 block_nums(block_num_x, chunkSizeY, ne11 * ne12);
// pointer offset: since the kernel calculates src1[i10*s10 + ...] and dst[i10*s1 + ...],
// we shift src1_dd/dst_dd by (startY * stride) so that when i10=0 in the kernel,
// it corresponds to row "startY" in the overall space.
const int32_t * src1_dd_offset = src1_dd + startY * s10;
float * dst_dd_offset = dst_dd + startY * s1;
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
src0_dd, src1_dd_offset, dst_dd_offset,
ne00, /*ne01, ne02, ne03,*/ ne00, /*ne01, ne02, ne03,*/
/*ne10, ne11,*/ ne12, /*ne13,*/ /*ne10, ne11,*/ ne12, /*ne13,*/
/* s0,*/ s1, s2, s3, /* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03, /* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/); s10, s11, s12/*, s13*/);
}
GGML_UNUSED(dst); GGML_UNUSED(dst);
} }