When doing inference on a CPU that has F16C available, it's faster to convert between FP16 and FP32 with the F16C/AVX intrinsics directly than to go through the lookup table.

2024-04-12 20:46:01 -05:00 · 2024-04-12 20:46:01 -05:00 · ab2fae200c
commit ab2fae200c
parent ab9a3240a9
1 changed file with 3 additions and 0 deletions
--- a/ggml-impl.h
+++ b/ggml-impl.h
@@ -106,6 +106,9 @@ typedef uint16_t ggml_fp16_internal_t;
 #ifdef _MSC_VER
 #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
 #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+// If F16C is available, testing shows the conversion intrinsics are much faster than the lookup tables, so map the generic macros straight to them.
+#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #else
 #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)