From 0aa6488a6739328101257f29a097a877eb719d2c Mon Sep 17 00:00:00 2001 From: Shupei Fan Date: Wed, 27 Nov 2024 21:52:12 +0800 Subject: [PATCH] ggml-cpu: add __ARM_FEATURE_DOTPROD guard --- ggml/include/ggml-cpu.h | 1 + ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 16 ++++++++-------- ggml/src/ggml-cpu/ggml-cpu.c | 17 ++++++++++++++++- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index a5358d047..e14ea9ea5 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -91,6 +91,7 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_neon (void); GGML_BACKEND_API int ggml_cpu_has_arm_fma (void); GGML_BACKEND_API int ggml_cpu_has_fp16_va (void); + GGML_BACKEND_API int ggml_cpu_has_dotprod (void); GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void); GGML_BACKEND_API int ggml_cpu_has_sve (void); GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c index 77ea24e73..ced378879 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c @@ -530,7 +530,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) - if (ggml_cpu_has_neon()) { + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -1017,8 +1017,8 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void UNUSED(ncols_interleaved); UNUSED(blocklen); -#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) - if (ggml_cpu_has_neon()) { +#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl); const block_q8_0 * a_ptr = (const block_q8_0 *) vy; float * res_ptr = s; @@ -1115,7 +1115,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(blocklen); #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) - if (ggml_cpu_has_neon()) { + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -3504,8 +3504,8 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void UNUSED(ncols_interleaved); UNUSED(blocklen); -#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) - if (ggml_cpu_has_neon()) { +#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl); for (int y = 0; y < nr / 4; y++) { @@ -3834,11 +3834,11 @@ enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * c if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { return GGML_TYPE_Q4_0_4_8; } - if (ggml_cpu_has_neon()) { + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { return GGML_TYPE_Q4_0_4_4; } } else if (cur->type == GGML_TYPE_IQ4_NL) { - if (ggml_cpu_has_neon()) { + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { return GGML_TYPE_IQ4_NL_4_4; } } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index a7acafba1..fea867440 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -109,10 +109,11 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; #if defined(__ARM_ARCH) struct ggml_arm_arch_features_type { int has_neon; + int has_dotprod; int has_i8mm; int has_sve; int sve_cnt; -} ggml_arm_arch_features = {-1, -1, -1, 0}; +} ggml_arm_arch_features = {-1, -1, -1, -1, 0}; #endif @@ -2448,6 +2449,7 @@ static void ggml_init_arm_arch_features(void) { uint32_t hwcap2 = getauxval(AT_HWCAP2); ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD); + ggml_arm_arch_features.has_dotprod = !!(hwcap && HWCAP_ASIMDDP); ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM); ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE); @@ -2462,6 +2464,11 @@ static void ggml_init_arm_arch_features(void) { } ggml_arm_arch_features.has_neon = oldp; + if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) { + oldp = 0; + } + ggml_arm_arch_features.has_dotprod = oldp; + if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) { oldp = 0; } @@ -13890,6 +13897,14 @@ int ggml_cpu_has_neon(void) { #endif } +int ggml_cpu_has_dotprod(void) { +#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD) + return ggml_arm_arch_features.has_dotprod; +#else + return 0; +#endif +} + int ggml_cpu_has_sve(void) { #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE) return ggml_arm_arch_features.has_sve;