diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index eebd12d89..418fa772d 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -15,19 +15,18 @@ // For block_q5_K and block_q8_K. only given the second time. #include "ggml-common.h" - // This SIMD unit can work with 32 float32s at once. #define GGML_F32_STEP 32 // We can fit 16 of these float32s in a single vector register. #define GGML_F32_EPR 16 -typedef float float32x8_t __attribute__((vector_size (64))); -typedef float float32x16_t __attribute__((vector_size (128))); -typedef int8_t int8x16_t __attribute__((vector_size (32))); -typedef int16_t int16x8_t __attribute__((vector_size (32))); -typedef int16_t int16x16_t __attribute__((vector_size (64))); -typedef int32_t int32x8_t __attribute__((vector_size (64))); -typedef int32_t int32x16_t __attribute__((vector_size (128))); +typedef float float32x8_t __attribute__((vector_size (32))); +typedef float float32x16_t __attribute__((vector_size (64))); +typedef int8_t int8x16_t __attribute__((vector_size (16))); +typedef int16_t int16x8_t __attribute__((vector_size (16))); +typedef int16_t int16x16_t __attribute__((vector_size (32))); +typedef int32_t int32x8_t __attribute__((vector_size (32))); +typedef int32_t int32x16_t __attribute__((vector_size (64))); /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); @@ -145,10 +144,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * scales = (const uint8_t*)&utmp[0]; const uint8_t * mins = (const uint8_t*)&utmp[2]; - float32x16_t sums __attribute__((aligned(128))); - int8x16_t aux8[QK_K/16] __attribute__((aligned(32))); - int16x16_t aux16[QK_K/16] __attribute__((aligned(64))); - int32x16_t aux32 __attribute__((aligned(128))); + float32x16_t sums __attribute__((aligned(64))); + int8x16_t aux8[QK_K/16] __attribute__((aligned(16))); + int16x16_t aux16[QK_K/16] __attribute__((aligned(32))); + int32x16_t aux32 __attribute__((aligned(64))); GGML_F32x16_VEC_ZERO(&sums);