Expand mask and align memory.

This commit is contained in:
Julia Longtin 2024-03-23 20:48:43 +00:00
parent 7a00422fa3
commit 5c010f761f

View file

@ -30,7 +30,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target)
{ {
uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0};
uint32_t mask=0x0000000F; uint32_t mask=0x000000FF;
__asm__ __volatile__ ( __asm__ __volatile__ (
"vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value.
@ -39,7 +39,7 @@ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target)
: [RES] "+m" (*target) : [RES] "+m" (*target)
: [Z] "m" (zero), : [Z] "m" (zero),
[M] "r" (mask) [M] "r" (mask)
: "r9", "zmm8", "k1"); : "zmm8", "k1", memory);
} }
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
@ -62,11 +62,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
int8_t aux8[QK_K]; int8_t aux8[QK_K];
int16_t aux16[8]; int16_t aux16[8];
float32x8_t sums; float32x8_t sums __attribute__((aligned(64)));
int32_t aux32[8]; int32_t aux32[8];
//memset(sums, 0, 8*sizeof(float));
GGML_F32x8_VEC_ZERO(&sums); GGML_F32x8_VEC_ZERO(&sums);
float sumf = 0; float sumf = 0;