- Split multiplication and addition to make it easier for the compiler to optimise
- Accumulate two acc instead of one llama_print_timings: load time = 3137.95 ms llama_print_timings: sample time = 132.54 ms / 128 runs ( 1.04 ms per token) llama_print_timings: prompt eval time = 2943.22 ms / 8 tokens ( 367.90 ms per token) llama_print_timings: eval time = 59539.50 ms / 127 runs ( 468.81 ms per token) llama_print_timings: total time = 62843.23 ms
This commit is contained in:
parent
524d6c9447
commit
607b9c7373
1 changed files with 19 additions and 5 deletions
24
ggml.c
24
ggml.c
|
@ -492,6 +492,15 @@ static inline float hsum_float_4(const __m128 x) {
|
|||
|
||||
return _mm_cvtss_f32(res);
|
||||
}
|
||||
|
||||
// horizontally add 2x4 floats
|
||||
static inline float hsum_float_2x4(const __m128 x, const __m128 y) {
|
||||
__m128 res =_mm_hadd_ps(x, y);
|
||||
res =_mm_hadd_ps(res, res);
|
||||
res =_mm_hadd_ps(res, res);
|
||||
|
||||
return _mm_cvtss_f32(res);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if __AVX__ || __AVX2__ || __AVX512F__
|
||||
|
@ -2141,7 +2150,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|||
*s = hsum_float_8(acc);
|
||||
#elif defined(__SSE3__)
|
||||
// Initialize accumulator with zeros
|
||||
__m128 acc = _mm_setzero_ps();
|
||||
__m128 acc_0 = _mm_setzero_ps();
|
||||
__m128 acc_1 = _mm_setzero_ps();
|
||||
|
||||
// Main loop
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
@ -2167,12 +2177,16 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|||
__m128 p0 = _mm_cvtepi32_ps(i32_0);
|
||||
__m128 p1 = _mm_cvtepi32_ps(i32_1);
|
||||
|
||||
// Apply the scale, and accumulate
|
||||
acc = _mm_add_ps(_mm_mul_ps( d, p0 ), acc);
|
||||
acc = _mm_add_ps(_mm_mul_ps( d, p1 ), acc);
|
||||
// Apply the scale
|
||||
__m128 p0_d = _mm_mul_ps( d, p0 );
|
||||
__m128 p1_d = _mm_mul_ps( d, p1 );
|
||||
|
||||
// Accumulate
|
||||
acc_0 = _mm_add_ps(p0_d, acc_0);
|
||||
acc_1 = _mm_add_ps(p1_d, acc_1);
|
||||
}
|
||||
|
||||
*s = hsum_float_4(acc);
|
||||
*s = hsum_float_2x4(acc_0, acc_1);
|
||||
#else
|
||||
// scalar
|
||||
float sumf = 0.0;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue