metal : 8% faster q4_0
Avoid copying into local uchar4 and float4.
This commit is contained in:
parent
72ff5282bf
commit
090710e485
1 changed file with 35 additions and 12 deletions
|
@@ -267,6 +267,8 @@ kernel void kernel_mul_mat_q4_0_f32(
|
|||
uint2 tptg[[threads_per_threadgroup]]) {
|
||||
const int nb = ne00/QK4_0;
|
||||
|
||||
const int8_t m8 = 8;
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
|
||||
|
@@ -276,33 +278,54 @@ kernel void kernel_mul_mat_q4_0_f32(
|
|||
const uint nth = tptg.x*tptg.y;
|
||||
const uint ith = tptg.y*tpitg.x + tpitg.y;
|
||||
|
||||
const int first = 4 * tpitg.y;
|
||||
|
||||
sum[ith] = 0.0f;
|
||||
|
||||
float sumf = 0;
|
||||
//float sumf1 = 0;
|
||||
//float sumf2 = 0;
|
||||
|
||||
for (int i = tpitg.x; i < nb; i += tptg.x) {
|
||||
device const uchar4 * x0p = (device const uchar4 *) (x + i)->qs;
|
||||
device const float4 * y0p = (device const float4 *) (y + i*QK4_0);
|
||||
device const uchar * x0p = (device const uchar *) (x + i)->qs;
|
||||
device const float * y0p = (device const float *) (y + i*QK4_0);
|
||||
|
||||
const float d = (float)((x + i)->d);
|
||||
|
||||
const uchar4 x0v = *(x0p + tpitg.y);
|
||||
const float4 y0v = *(y0p + tpitg.y + 0);
|
||||
const float4 y1v = *(y0p + tpitg.y + 4);
|
||||
device const uchar * x0v = x0p + first;
|
||||
device const float * y0v = y0p + first;
|
||||
device const float * y1v = y0p + first + 16;
|
||||
|
||||
float acc = 0.0f;
|
||||
//float3 acc = {0.0f, 0.0f, 0.f};
|
||||
float2 acc = {0.0f, 0.0f};
|
||||
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
const int x0 = x0v[j] & 0x0F;
|
||||
const int x1 = x0v[j] >> 4;
|
||||
|
||||
const float y0 = y0v[j];
|
||||
const float y1 = y1v[j];
|
||||
//acc[0] += y0v[j] * (x0v[j] & 0xF);
|
||||
//acc[1] += y1v[j] * (x0v[j] >> 4);
|
||||
//acc[2] += y0v[j] + y1v[j];
|
||||
|
||||
acc += (x0 - 8)*y0 + (x1 - 8)*y1;
|
||||
acc[0] += y0v[j] * ((int8_t)(x0v[j] & 0xF) - m8);
|
||||
acc[1] += y1v[j] * ((int8_t)(x0v[j] >> 4) - m8);
|
||||
|
||||
//const int x0 = x0v[j] & 0x0F;
|
||||
//const int x1 = x0v[j] >> 4;
|
||||
|
||||
//const float y0 = y0v[j];
|
||||
//const float y1 = y1v[j];
|
||||
|
||||
//acc += (x0 - 8)*y0 + (x1 - 8)*y1;
|
||||
}
|
||||
|
||||
sum[ith] += acc*d;
|
||||
//sum[ith] += acc*d;
|
||||
sumf += d * (acc[0] + acc[1]);
|
||||
//sumf1 += d * (acc[0] + acc[1]);
|
||||
//sumf2 += d * acc[2];
|
||||
}
|
||||
|
||||
sum[ith] = sumf;
|
||||
//sum[ith] = sumf1 - 8.f*sumf2;
|
||||
|
||||
//
|
||||
// Accumulate the sum from all threads in the threadgroup
|
||||
// This version is slightly faster than the commented out one below,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue