diff --git a/ggml.c b/ggml.c index afaa4f065..d09b698f8 100644 --- a/ggml.c +++ b/ggml.c @@ -6941,15 +6941,23 @@ static void ggml_compute_forward_mul_mat_q_f32( #endif //void *p = (void *) src0->data; - assert((ir1-ir0) % EXPERIMENT_TILESIZE_X == 0); int x_stride = EXPERIMENT_TILESIZE_X; + + // if the second matrix is too small, we cannot use the tiled code if (ne11 < EXPERIMENT_TILESIZE_Y) { x_stride = 1; } - for (int ir = ir0; ir < ir1; ir+=x_stride) { + // check if we can advance with x_stride = EXPERIMENT_TILESIZE_X + //printf("ir0=%i -> ir1 - ir=%i\n", ir0, ir1-ir); + if ((ir1-ir) < EXPERIMENT_TILESIZE_X) { + // we do not have enough rows left - we need to go step by step + //printf("ir0=%i - switching to stride 1\n", ir0, ir1-ir); + x_stride = 1; + } + // src0 indices const int i03 = ir/(ne02*ne01); const int i02 = (ir - i03*ne02*ne01)/ne01; @@ -6988,7 +6996,7 @@ static void ggml_compute_forward_mul_mat_q_f32( assert(ne00 % 32 == 0); - if (ne11 < EXPERIMENT_TILESIZE_Y) { + if ((x_stride != EXPERIMENT_TILESIZE_X) || (ne11 < EXPERIMENT_TILESIZE_Y)) { //printf("using legacy tile size implementation\n"); // existing implementation tiled implementation for (int64_t ic = 0; ic < ne11; ++ic) {