Broadcast a single uint8 instead of four of them.

This commit is contained in:
Julia Longtin 2024-05-10 14:19:27 +00:00
parent 664a6025a1
commit 6e0258abac

View file

@@ -21,14 +21,14 @@ void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restri
 inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target)
 {
-    uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0};
+    uint8_t zero = 0;
     __asm__ __volatile__ (
-        "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value.
+        "vbroadcastss\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value.
         "vmovnraps\t\t%%zmm8,\t%[RES]\n\t"
         : [RES] "+m" (*target)
         : [Z] "m" (zero)
-        : "zmm8");
+        : "zmm8", "memory");
 }