Look at the right final memory location.

This commit is contained in:
Julia Longtin 2024-05-11 11:27:52 +00:00
parent 47ca67a062
commit 1b7ca0b413

View file

@ -100,8 +100,8 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x
"vmovaps\t\t64(%%r12),\t%%zmm4\n\t"
"vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add
// No compare; we must be at three.
"vmovaps\t\t64(%%r10),\t%%zmm5\n\t" // Load two vectors.
"vmovaps\t\t64(%%r12),\t%%zmm6\n\t"
"vmovaps\t\t128(%%r10),\t%%zmm5\n\t" // Load two vectors.
"vmovaps\t\t128(%%r12),\t%%zmm6\n\t"
"vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add
"2:\n\t" // Label for loop end
"vmovnraps\t\t%%zmm0,\t(%[RES])\n\t" // Save our results.