faster with madd
This commit is contained in:
parent
629befc729
commit
7de0bdc2db
1 changed file with 2 additions and 2 deletions
|
@@ -4232,9 +4232,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
|
||||
const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
|
||||
__m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1);
|
||||
p_1 = _mm_add_epi32(_mm_cvtepi16_epi32(_mm_bsrli_si128(p_1, 8)), _mm_cvtepi16_epi32(p_1));
|
||||
p_1 = _mm_madd_epi16(p_1, _mm_set1_epi16(1));
|
||||
__m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1);
|
||||
p_2 = _mm_add_epi32(_mm_cvtepi16_epi32(_mm_bsrli_si128(p_2, 8)), _mm_cvtepi16_epi32(p_2));
|
||||
p_2 = _mm_madd_epi16(p_2, _mm_set1_epi16(1));
|
||||
|
||||
const __m256 deltas = _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d)),
|
||||
_mm_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)));
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue