From 7379dd2dbabecd369d46168195d1ae10c969ba63 Mon Sep 17 00:00:00 2001
From: 3ooabkhxtn <3ooabkhxtn@local>
Date: Fri, 12 May 2023 09:20:48 +0000
Subject: [PATCH] ggml: add prefetch to ggml_vec_dot_q4_0_q8_0

llama_print_timings:        load time =  3021.72 ms
llama_print_timings:      sample time =   128.90 ms /   128 runs   (    1.01 ms per token)
llama_print_timings: prompt eval time =  2826.35 ms /     8 tokens (  353.29 ms per token)
llama_print_timings:        eval time = 53198.13 ms /   127 runs   (  418.88 ms per token)
llama_print_timings:       total time = 56380.69 ms
---
 ggml.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/ggml.c b/ggml.c
index f9545e673..880d574c6 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2171,7 +2171,10 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     __m128 acc_3 = _mm_setzero_ps();
 
     {
-        // Compute combined scale for the block
+        _mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 0 and 1
         const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[0].d ), _mm_set1_ps( y[0].d ) );
 
         const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
@@ -2186,7 +2189,10 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         bx_1 = _mm_sub_epi8(bx_1, off);
         const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
 
-        // Compute combined scale for the block
+        _mm_prefetch(&x[2] + sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[2] + sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 2 and 3
         const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[1].d ), _mm_set1_ps( y[1].d ) );
 
         const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);
@@ -2216,7 +2222,10 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 
     // Main loop
     for (int i = 2; i < nb; i+=2) {
-        // Compute combined scale for the block
+        _mm_prefetch(&x[i + 1] + sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[i + 1] + sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 0 and 1
         const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[i].d ), _mm_set1_ps( y[i].d ) );
 
         const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);
@@ -2231,6 +2240,9 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         bx_1 = _mm_sub_epi8(bx_1, off);
         const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
 
+        _mm_prefetch(&x[i + 2] + sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[i + 2] + sizeof(block_q8_0), _MM_HINT_T0);
+
         // Compute combined scale for the block 2 and 3
         const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[i + 1].d ), _mm_set1_ps( y[i + 1].d ) );