When doing inference on a CPU that has F16C available, it is faster to convert between FP16 and FP32 with the hardware (AVX/F16C) instructions than with the lookup table.
This commit is contained in:
parent
ab9a3240a9
commit
ab2fae200c
1 changed files with 3 additions and 0 deletions
|
@ -106,6 +106,9 @@ typedef uint16_t ggml_fp16_internal_t;
|
|||
#ifdef _MSC_VER
|
||||
// FP16 -> FP32 (MSVC branch): place the 16-bit pattern x in the low lane of an SSE integer register, convert the packed halves to floats with _mm_cvtph_ps, and return the lowest float.
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
||||
// FP32 -> FP16 (MSVC branch): load x into the low float lane, convert to packed halves with _mm_cvtps_ph, and extract the lowest 16-bit lane.
// The 0 passed to _mm_cvtps_ph is the rounding-control immediate (round-to-nearest in Intel's encoding; confirm before changing); the 0 passed to _mm_extract_epi16 is the lane index.
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
||||
// If F16C is available, testing shows the intrinsics are much faster than the lookup tables.
|
||||
// Generic FP16 -> FP32 conversion: expands directly to the intrinsic-based compute macro.
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
||||
// Generic FP32 -> FP16 conversion: expands directly to the intrinsic-based compute macro.
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
#else
|
||||
// FP16 -> FP32 (non-MSVC branch): single scalar F16C intrinsic taking the 16-bit half pattern and returning a float.
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
||||
// FP32 -> FP16 (non-MSVC branch): scalar F16C intrinsic; the second argument (0) is the rounding-control immediate (round-to-nearest in Intel's encoding; confirm before changing).
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue