- Added prefetch
llama_print_timings:        load time =  3021.72 ms
llama_print_timings:      sample time =   128.90 ms /   128 runs   (    1.01 ms per token)
llama_print_timings: prompt eval time =  2826.35 ms /     8 tokens (  353.29 ms per token)
llama_print_timings:        eval time = 53198.13 ms /   127 runs   (  418.88 ms per token)
llama_print_timings:       total time = 56380.69 ms
This commit is contained in:
parent 78bbb3cdfe
commit 7379dd2dba
1 changed file with 15 additions and 3 deletions
ggml.c
@@ -2171,7 +2171,10 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     __m128 acc_3 = _mm_setzero_ps();

     {
-        // Compute combined scale for the block
+        _mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 0 and 1
         const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[0].d ), _mm_set1_ps( y[0].d ) );

         const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
@@ -2186,7 +2189,10 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         bx_1 = _mm_sub_epi8(bx_1, off);
         const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);

-        // Compute combined scale for the block
+        _mm_prefetch(&x[2] + sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[2] + sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 2 and 3
         const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[1].d ), _mm_set1_ps( y[1].d ) );

         const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);
@@ -2216,7 +2222,10 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *

     // Main loop
     for (int i = 2; i < nb; i+=2) {
-        // Compute combined scale for the block
+        _mm_prefetch(&x[i + 1] + sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[i + 1] + sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 0 and 1
         const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[i].d ), _mm_set1_ps( y[i].d ) );

         const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);
@@ -2231,6 +2240,9 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         bx_1 = _mm_sub_epi8(bx_1, off);
         const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);

+        _mm_prefetch(&x[i + 2] + sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch(&y[i + 2] + sizeof(block_q8_0), _MM_HINT_T0);
+
         // Compute combined scale for the block 2 and 3
         const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[i + 1].d ), _mm_set1_ps( y[i + 1].d ) );
