ggml-cpu: add __ARM_FEATURE_DOTPROD guard

2024-11-27 21:52:12 +08:00 · 2024-11-27 21:52:12 +08:00 · 0aa6488a67
commit 0aa6488a67
parent f56013dcb5
3 changed files with 25 additions and 9 deletions
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@ -91,6 +91,7 @@ extern "C" {
    GGML_BACKEND_API int ggml_cpu_has_neon       (void);
    GGML_BACKEND_API int ggml_cpu_has_arm_fma    (void);
    GGML_BACKEND_API int ggml_cpu_has_fp16_va    (void);
    GGML_BACKEND_API int ggml_cpu_has_dotprod    (void);  // always 0 unless built with __ARM_ARCH and __ARM_FEATURE_DOTPROD defined
    GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
    GGML_BACKEND_API int ggml_cpu_has_sve        (void);
    GGML_BACKEND_API int ggml_cpu_get_sve_cnt    (void);  // sve vector length in bytes
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@ -530,7 +530,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    if (ggml_cpu_has_neon()) {
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
        const void * b_ptr = vx;
        const void * a_ptr = vy;
        float * res_ptr = s;
@ -1017,8 +1017,8 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
-    if (ggml_cpu_has_neon()) {
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
        const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
        float * res_ptr = s;
@ -1115,7 +1115,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    if (ggml_cpu_has_neon()) {
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
        const void * b_ptr = vx;
        const void * a_ptr = vy;
        float * res_ptr = s;
@ -3504,8 +3504,8 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
-    if (ggml_cpu_has_neon()) {
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
        const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
        for (int y = 0; y < nr / 4; y++) {
@ -3834,11 +3834,11 @@ enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * c
        if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
            return GGML_TYPE_Q4_0_4_8;
        }
-        if (ggml_cpu_has_neon()) {
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
            return GGML_TYPE_Q4_0_4_4;
        }
    } else if (cur->type == GGML_TYPE_IQ4_NL) {
-        if (ggml_cpu_has_neon()) {
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
            return GGML_TYPE_IQ4_NL_4_4;
        }
    }
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@ -109,10 +109,11 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
 #if defined(__ARM_ARCH)
 // ARM CPU feature record with a single global instance, ggml_arm_arch_features.
 // The updated initializer sets every has_* field to -1 and sve_cnt to 0; the
 // extra -1 accounts for the has_dotprod field. The has_* fields are assigned
 // elsewhere in this file from getauxval() capability bits or sysctlbyname().
 struct ggml_arm_arch_features_type {
    int has_neon;     // NEON / Advanced SIMD flag
    int has_dotprod;  // dot-product flag; read back by ggml_cpu_has_dotprod()
    int has_i8mm;     // int8 matrix-multiply flag
    int has_sve;      // SVE flag
    int sve_cnt;      // presumably the SVE vector length in bytes — TODO confirm
-} ggml_arm_arch_features = {-1, -1, -1, 0};
+} ggml_arm_arch_features = {-1, -1, -1, -1, 0};
 #endif
@ -2448,6 +2449,7 @@ static void ggml_init_arm_arch_features(void) {
    uint32_t hwcap2 = getauxval(AT_HWCAP2);
    // HWCAP_* / HWCAP2_* are bit masks: test each one with bitwise AND and
    // normalize the result to 0/1 with !!. A logical && here would report the
    // feature as present whenever the capability word is non-zero.
    ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
    ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
    ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
    ggml_arm_arch_features.has_sve  = !!(hwcap & HWCAP_SVE);
@ -2462,6 +2464,11 @@ static void ggml_init_arm_arch_features(void) {
    }
    ggml_arm_arch_features.has_neon = oldp;
    if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
        oldp = 0;
    }
    ggml_arm_arch_features.has_dotprod = oldp;
    if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
        oldp = 0;
    }
@ -13890,6 +13897,14 @@ int ggml_cpu_has_neon(void) {
 #endif
 }
 // Reports whether the ARM dot-product feature may be used.
 // Returns ggml_arm_arch_features.has_dotprod when the build defines both
 // __ARM_ARCH and __ARM_FEATURE_DOTPROD; otherwise always returns 0.
 int ggml_cpu_has_dotprod(void) {
    int available = 0;
 #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
    // Only builds with compile-time dotprod support read the stored flag.
    available = ggml_arm_arch_features.has_dotprod;
 #endif
    return available;
 }
 int ggml_cpu_has_sve(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
    return ggml_arm_arch_features.has_sve;