iq3_s: make it work on metal for QK_K = 64
Pleasant surprise: the code was already independent of the super-block size, so all it took was deleting some QK_K == 256 guards.
This commit is contained in:
parent
e6e61e3158
commit
cbd950b220
1 changed file with 0 additions and 7 deletions
|
@ -4489,7 +4489,6 @@ void kernel_mul_mv_iq3_s_f32_impl(
|
||||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if QK_K == 256
|
|
||||||
const int ix = tiisg;
|
const int ix = tiisg;
|
||||||
|
|
||||||
device const float * y4 = y + 32 * ix;
|
device const float * y4 = y + 32 * ix;
|
||||||
|
@ -4535,12 +4534,6 @@ void kernel_mul_mv_iq3_s_f32_impl(
|
||||||
|
|
||||||
y4 += 32 * 32;
|
y4 += 32 * 32;
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
(void) x;
|
|
||||||
(void) y;
|
|
||||||
(void) yl;
|
|
||||||
(void) nb32;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for (int row = 0; row < N_DST; ++row) {
|
for (int row = 0; row < N_DST; ++row) {
|
||||||
all_sum = simd_sum(sumf[row]);
|
all_sum = simd_sum(sumf[row]);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue