From c796dd4c9009eb5410fbe3372744c4c4bbd47f24 Mon Sep 17 00:00:00 2001 From: syx Date: Tue, 12 Dec 2023 15:03:10 +0800 Subject: [PATCH] Add scalar for-loop fallback for ggml_axpy_q4_0_q8_0 when AVX2 is unavailable --- ggml-quants.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/ggml-quants.c b/ggml-quants.c index 608e8e986..87fc92f13 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -2430,7 +2430,7 @@ void ggml_axpy_q4_0_q8_0(const int n, const void * restrict vx, const void * res assert(nb % 2 == 0); const block_q4_0 * restrict x = vx; - +#if defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); @@ -2491,7 +2491,21 @@ void ggml_axpy_q4_0_q8_0(const int n, const void * restrict vx, const void * res _mm256_storeu_ps((__m256*)(vz + i*128+96), by); } +#else + float *res = (float *)vz; + float scale_fp32 = GGML_FP16_TO_FP32(scale); + for (int i = 0; i < nb; i++) { + float result_scale = GGML_FP16_TO_FP32(x[i].d) * scale_fp32; + int offset = i * QK4_0; + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0x0F) - 8; + const int v1 = (x[i].qs[j] >> 4) - 8; + res[offset + j] = res[offset + j] + ((float)(v0 * (int)alpha) * result_scale); + res[offset + j + qk/2] = res[offset + j + qk/2] + ((float)(v1 * (int)alpha) * result_scale); + } + } +#endif }