Bugfix: Handle the case where the number of matrix rows per thread (matrix rows / thread count) is not a multiple of TILESIZE_X
This commit is contained in:
parent
a33cbbe03b
commit
42ad59fe41
1 changed files with 11 additions and 3 deletions
14
ggml.c
14
ggml.c
|
@ -6941,15 +6941,23 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
#endif
|
||||
|
||||
//void *p = (void *) src0->data;
|
||||
assert((ir1-ir0) % EXPERIMENT_TILESIZE_X == 0);
|
||||
|
||||
int x_stride = EXPERIMENT_TILESIZE_X;
|
||||
|
||||
// if the second matrix is too small, we cannot use the tiled code
|
||||
if (ne11 < EXPERIMENT_TILESIZE_Y) {
|
||||
x_stride = 1;
|
||||
}
|
||||
|
||||
|
||||
for (int ir = ir0; ir < ir1; ir+=x_stride) {
|
||||
// check if we can advance with x_stride = EXPERIMENT_TILESIZE_X
|
||||
//printf("ir0=%i -> ir1 - ir=%i\n", ir0, ir1-ir);
|
||||
if ((ir1-ir) < EXPERIMENT_TILESIZE_X) {
|
||||
// we do not have enough rows left - we need to go step by step
|
||||
//printf("ir0=%i - switching to stride 1\n", ir0, ir1-ir);
|
||||
x_stride = 1;
|
||||
}
|
||||
|
||||
// src0 indices
|
||||
const int i03 = ir/(ne02*ne01);
|
||||
const int i02 = (ir - i03*ne02*ne01)/ne01;
|
||||
|
@ -6988,7 +6996,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
|
||||
assert(ne00 % 32 == 0);
|
||||
|
||||
if (ne11 < EXPERIMENT_TILESIZE_Y) {
|
||||
if ((x_stride != EXPERIMENT_TILESIZE_X) || (ne11 < EXPERIMENT_TILESIZE_Y)) {
|
||||
//printf("using legacy tile size implementation\n");
|
||||
// existing tiled implementation
|
||||
for (int64_t ic = 0; ic < ne11; ++ic) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue