From 1335c78639d32f93851ad707b19f8263e7786373 Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Sat, 2 Nov 2024 16:36:56 -0400
Subject: [PATCH] ggml-quants: 256-bit version of the q4_0 x q8_0 delta computation (also slow)

---
 ggml/src/ggml-quants.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 652aeaa9f..686e16d09 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -4237,9 +4237,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         p_2 = _mm_add_epi32(_mm_cvtepi16_epi32(_mm_bsrli_si128(p_2, 8)), _mm_cvtepi16_epi32(p_2));
 
 		// TODO: if f16c
-		const __m128 del = _mm_cvtph_ps(_mm_set_epi16(0, 0, 0 , 0, x[ib + 1].d, y[ib + 1].d, x[ib].d, y[ib].d));
-		const __m128 del_mul = _mm_mul_ps(del, _mm_permute_ps(del, 0xB1)); // x.d*y.d
-		const __m256 deltas = _mm256_set_m128(_mm_permute_ps(del_mul, 0xFF), _mm_permute_ps(del_mul, 0x00));
+		const __m256 del = _mm256_cvtph_ps(_mm_set_epi16(0 , 0, x[ib + 1].d, y[ib + 1].d, 0, 0, x[ib].d, y[ib].d));
+		const __m256 del_mul = _mm256_mul_ps(del, _mm256_permute_ps(del, 0x01)); // x.d*y.d
+		const __m256 deltas = _mm256_permute_ps(del_mul, 0x00);
 
 		// TODO: may be room to optimize here?
         //const __m256 deltas = _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d)),