Use a better memory store instruction (vmovnraps instead of vmovaps).

2024-03-23 20:49:11 +00:00 · 2024-03-23 20:49:11 +00:00 · ed639a6cf9
commit ed639a6cf9
parent 5c010f761f
1 changed files with 1 additions and 4 deletions
--- a/ggml-phi-knc.c
+++ b/ggml-phi-knc.c
@ -6,9 +6,6 @@
 // For memcpy.
 #include <string.h>

-// No, we have an SIMD unit.
-// #define GGML_SIMD
-
 // This SIMD unit can work with 32 float32s at once.
 #define GGML_F32_STEP 32
 // We can fit 16 of these float32s in a single vector register.
@ -27,7 +24,7 @@ inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target)

  __asm__ __volatile__ (
                        "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t"        // use an upscaling operator to clear our value.
-                        "vmovaps\t\t%%zmm8,\t%[RES]\n\t"
+                        "vmovnraps\t\t%%zmm8,\t%[RES]\n\t"
                       : [RES]  "+m"  (*target)
                       : [Z]    "m"   (zero)
                       : "zmm8");