When running inference on a CPU that supports F16C, it is better to convert FP16 values with the F16C conversion intrinsics than with the lookup table.

This commit is contained in:
Kunnis 2024-04-12 20:46:01 -05:00
parent ab9a3240a9
commit ab2fae200c

View file

@ -106,6 +106,9 @@ typedef uint16_t ggml_fp16_internal_t;
#ifdef _MSC_VER
// MSVC branch: scalar half <-> float conversion assembled from vector intrinsics.
// FP16 -> FP32: load the 16-bit pattern into the low lane of a 128-bit
// register, widen it with _mm_cvtph_ps, then read back the low float.
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
// FP32 -> FP16: put the float in the low lane, narrow it with _mm_cvtps_ph
// (rounding control 0, i.e. round-to-nearest-even per the intrinsics guide),
// then extract the low 16 bits.
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
// When F16C is available, testing showed the hardware conversion to be much
// faster than the lookup tables, so the general-purpose conversion macros
// are mapped straight onto the COMPUTE variants above.
// NOTE(review): the guard that establishes F16C availability is outside this
// view -- confirm these definitions are only reached when F16C is enabled.
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#else
// Non-MSVC branch: use the scalar half <-> float helper intrinsics directly.
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
// The second argument (0) is the rounding control handed to _cvtss_sh --
// presumably round-to-nearest-even; confirm against the intrinsics guide.
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)