metal : minor fixup in FA kernel
ggml-ci
This commit is contained in:
parent
42cadc74bd
commit
40e717263e
1 changed file with 3 additions and 3 deletions
|
@@ -2776,11 +2776,11 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
|||
const short iv3 = iq3 / rv3;
|
||||
|
||||
// load the queries from shared memory into local memory
|
||||
float4 mq[D4];
|
||||
float4 mq[D4/NW];
|
||||
|
||||
for (short ii = 0; ii < D4; ii += NW) {
|
||||
short i = ii + tiisg;
|
||||
mq[i] = (float4) sq4[i];
|
||||
mq[i/NW] = (float4) sq4[i];
|
||||
}
|
||||
|
||||
// pointer to the mask
|
||||
|
@@ -2812,7 +2812,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
|||
mk[2] = (float4) pk4[i + 2*(nb11/8)];
|
||||
mk[3] = (float4) pk4[i + 3*(nb11/8)];
|
||||
|
||||
mqk += (float4) (mq[i] * mk);
|
||||
mqk += (float4) (mq[i/NW] * mk);
|
||||
}
|
||||
|
||||
// reduce the results from the threads in the simdgroup
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue