faster with madd

2024-11-02 17:12:23 -04:00 · 2024-11-02 17:12:23 -04:00 · 7de0bdc2db
commit 7de0bdc2db
parent 629befc729
1 changed files with 2 additions and 2 deletions
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -4232,9 +4232,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
        __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1);
-        p_1 = _mm_add_epi32(_mm_cvtepi16_epi32(_mm_bsrli_si128(p_1, 8)), _mm_cvtepi16_epi32(p_1));
+        p_1 = _mm_madd_epi16(p_1, _mm_set1_epi16(1));
        __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1);
-        p_2 = _mm_add_epi32(_mm_cvtepi16_epi32(_mm_bsrli_si128(p_2, 8)), _mm_cvtepi16_epi32(p_2));
+        p_2 = _mm_madd_epi16(p_2, _mm_set1_epi16(1));

        const __m256 deltas = _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d)),
                              _mm_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)));