Another faster f16 x f32 matrix multiply kernel

2023-09-08 16:34:07 +02:00 · 2023-09-08 16:34:07 +02:00 · 9f353f0536
commit 9f353f0536
parent c9e38057f0
1 changed file with 1 addition and 0 deletions
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -682,6 +682,7 @@ kernel void kernel_mul_mat_f16_f32_l4(
    device const half4 * x4 = (device const half4 *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);

    for (int r1 = 0; r1 < nrows; ++r1) {
+
        device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12);

        float sumf = 0;