- Improved prefetching

llama_print_timings:        load time =  2899.92 ms
llama_print_timings:      sample time =   127.62 ms /   128 runs   (    1.00 ms per token)
llama_print_timings: prompt eval time =  2705.68 ms /     8 tokens (  338.21 ms per token)
llama_print_timings:        eval time = 52500.58 ms /   127 runs   (  413.39 ms per token)
llama_print_timings:       total time = 55559.90 ms
This commit is contained in:
3ooabkhxtn 2023-05-12 10:17:13 +00:00
parent 8699fd0d43
commit ca54314a2f

16
ggml.c
View file

@@ -2155,8 +2155,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
// First round without accumulation
{
_mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0);
_mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);
_mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0);
_mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0);
// Compute combined scale for the block 0 and 1
const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[0].d ), _mm_set1_ps( y[0].d ) );
@@ -2173,8 +2173,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
bx_1 = _mm_sub_epi8(bx_1, off);
const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
_mm_prefetch(&x[2] + sizeof(block_q4_0), _MM_HINT_T0);
_mm_prefetch(&y[2] + sizeof(block_q8_0), _MM_HINT_T0);
_mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0);
_mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);
// Compute combined scale for the block 2 and 3
const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[1].d ), _mm_set1_ps( y[1].d ) );
@@ -2206,8 +2206,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
// Main loop
for (int i = 2; i < nb; i+=2) {
_mm_prefetch(&x[i + 1] + sizeof(block_q4_0), _MM_HINT_T0);
_mm_prefetch(&y[i + 1] + sizeof(block_q8_0), _MM_HINT_T0);
_mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
_mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
// Compute combined scale for the block 0 and 1
const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[i].d ), _mm_set1_ps( y[i].d ) );
@@ -2224,8 +2224,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
bx_1 = _mm_sub_epi8(bx_1, off);
const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
_mm_prefetch(&x[i + 2] + sizeof(block_q4_0), _MM_HINT_T0);
_mm_prefetch(&y[i + 2] + sizeof(block_q8_0), _MM_HINT_T0);
_mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
_mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
// Compute combined scale for the block 2 and 3
const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[i + 1].d ), _mm_set1_ps( y[i + 1].d ) );