diff --git a/ggml-metal.metal b/ggml-metal.metal index 43814ed09..981bb41af 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -267,6 +267,8 @@ kernel void kernel_mul_mat_q4_0_f32( uint2 tptg[[threads_per_threadgroup]]) { const int nb = ne00/QK4_0; + const int8_t m8 = 8; + const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; @@ -276,33 +278,54 @@ kernel void kernel_mul_mat_q4_0_f32( const uint nth = tptg.x*tptg.y; const uint ith = tptg.y*tpitg.x + tpitg.y; + const int first = 4 * tpitg.y; + sum[ith] = 0.0f; + float sumf = 0; + //float sumf1 = 0; + //float sumf2 = 0; + for (int i = tpitg.x; i < nb; i += tptg.x) { - device const uchar4 * x0p = (device const uchar4 *) (x + i)->qs; - device const float4 * y0p = (device const float4 *) (y + i*QK4_0); + device const uchar * x0p = (device const uchar *) (x + i)->qs; + device const float * y0p = (device const float *) (y + i*QK4_0); const float d = (float)((x + i)->d); - const uchar4 x0v = *(x0p + tpitg.y); - const float4 y0v = *(y0p + tpitg.y + 0); - const float4 y1v = *(y0p + tpitg.y + 4); + device const uchar * x0v = x0p + first; + device const float * y0v = y0p + first; + device const float * y1v = y0p + first + 16; - float acc = 0.0f; + //float3 acc = {0.0f, 0.0f, 0.f}; + float2 acc = {0.0f, 0.0f}; for (int j = 0; j < 4; ++j) { - const int x0 = x0v[j] & 0x0F; - const int x1 = x0v[j] >> 4; - const float y0 = y0v[j]; - const float y1 = y1v[j]; + //acc[0] += y0v[j] * (x0v[j] & 0xF); + //acc[1] += y1v[j] * (x0v[j] >> 4); + //acc[2] += y0v[j] + y1v[j]; - acc += (x0 - 8)*y0 + (x1 - 8)*y1; + acc[0] += y0v[j] * ((int8_t)(x0v[j] & 0xF) - m8); + acc[1] += y1v[j] * ((int8_t)(x0v[j] >> 4) - m8); + + //const int x0 = x0v[j] & 0x0F; + //const int x1 = x0v[j] >> 4; + + //const float y0 = y0v[j]; + //const float y1 = y1v[j]; + + //acc += (x0 - 8)*y0 + (x1 - 8)*y1; } - sum[ith] += acc*d; + //sum[ith] += acc*d; + sumf += d * (acc[0] + acc[1]); + //sumf1 += d * (acc[0] + acc[1]); + //sumf2 += d * acc[2]; } + sum[ith] = sumf; + //sum[ith] = sumf1 - 8.f*sumf2; + // // Accumulate the sum from all threads in the threadgroup // This version is slightly faster than the commented out one below,