separate filling aux16 from consuming aux16 by making it an array of vectors.
This commit is contained in:
parent
e579af1e95
commit
2a47e5f05f
1 changed files with 11 additions and 7 deletions
|
@ -147,7 +147,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
||||||
|
|
||||||
float32x16_t sums __attribute__((aligned(128)));
|
float32x16_t sums __attribute__((aligned(128)));
|
||||||
int8x16_t aux8[QK_K/16] __attribute__((aligned(32)));
|
int8x16_t aux8[QK_K/16] __attribute__((aligned(32)));
|
||||||
int16x16_t aux16 __attribute__((aligned(64)));
|
int16x16_t aux16[QK_K/16] __attribute__((aligned(64)));
|
||||||
int32x16_t aux32 __attribute__((aligned(128)));
|
int32x16_t aux32 __attribute__((aligned(128)));
|
||||||
|
|
||||||
GGML_F32x16_VEC_ZERO(&sums);
|
GGML_F32x16_VEC_ZERO(&sums);
|
||||||
|
@ -188,15 +188,19 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
||||||
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
||||||
int is = 0;
|
int is = 0;
|
||||||
for (int j = 0; j < QK_K/32; ++j) {
|
for (int j = 0; j < QK_K/32; ++j) {
|
||||||
int32_t scale = scales[is++];
|
for (int l = 0; l < 16; ++l) ((int16_t *)&aux16[j*2])[l] = q8[l] * a[l];
|
||||||
for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l];
|
|
||||||
GGML_I16x16_S_FMA_I32x16 (&aux16, scale, &aux32);
|
|
||||||
q8 += 16; a += 16;
|
q8 += 16; a += 16;
|
||||||
for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l];
|
for (int l = 0; l < 16; ++l) ((int16_t *)&aux16[(j*2)+1])[l] = q8[l] * a[l];
|
||||||
// FIXME: while comparing FMA output to the original output, the original had an error. hunt it down.
|
|
||||||
GGML_I16x16_S_FMA_I32x16 (&aux16, scale, &aux32);
|
|
||||||
q8 += 16; a += 16;
|
q8 += 16; a += 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FIXME: while comparing FMA output to the original output, the original had an error. hunt it down.
|
||||||
|
for (int j = 0; j < QK_K/32; ++j) {
|
||||||
|
int32_t scale = scales[is++];
|
||||||
|
GGML_I16x16_S_FMA_I32x16 (&aux16[j*2], scale, &aux32);
|
||||||
|
GGML_I16x16_S_FMA_I32x16 (&aux16[(j*2)+1], scale, &aux32);
|
||||||
|
}
|
||||||
|
|
||||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||||
for (int l = 0; l < 16; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l];
|
for (int l = 0; l < 16; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l];
|
||||||
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue