ggml-cpu: add __ARM_FEATURE_DOTPROD guard

This commit is contained in:
Shupei Fan 2024-11-27 21:52:12 +08:00
parent f56013dcb5
commit 0aa6488a67
3 changed files with 25 additions and 9 deletions

View file

@ -91,6 +91,7 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_neon (void); GGML_BACKEND_API int ggml_cpu_has_neon (void);
GGML_BACKEND_API int ggml_cpu_has_arm_fma (void); GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
GGML_BACKEND_API int ggml_cpu_has_fp16_va (void); GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
GGML_BACKEND_API int ggml_cpu_has_dotprod (void);
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void); GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
GGML_BACKEND_API int ggml_cpu_has_sve (void); GGML_BACKEND_API int ggml_cpu_has_sve (void);
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes

View file

@ -530,7 +530,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(blocklen); UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
if (ggml_cpu_has_neon()) { if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
const void * b_ptr = vx; const void * b_ptr = vx;
const void * a_ptr = vy; const void * a_ptr = vy;
float * res_ptr = s; float * res_ptr = s;
@ -1017,8 +1017,8 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(ncols_interleaved); UNUSED(ncols_interleaved);
UNUSED(blocklen); UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
if (ggml_cpu_has_neon()) { if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl); const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
const block_q8_0 * a_ptr = (const block_q8_0 *) vy; const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
float * res_ptr = s; float * res_ptr = s;
@ -1115,7 +1115,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(blocklen); UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
if (ggml_cpu_has_neon()) { if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
const void * b_ptr = vx; const void * b_ptr = vx;
const void * a_ptr = vy; const void * a_ptr = vy;
float * res_ptr = s; float * res_ptr = s;
@ -3504,8 +3504,8 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(ncols_interleaved); UNUSED(ncols_interleaved);
UNUSED(blocklen); UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
if (ggml_cpu_has_neon()) { if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl); const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
for (int y = 0; y < nr / 4; y++) { for (int y = 0; y < nr / 4; y++) {
@ -3834,11 +3834,11 @@ enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * c
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
return GGML_TYPE_Q4_0_4_8; return GGML_TYPE_Q4_0_4_8;
} }
if (ggml_cpu_has_neon()) { if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
return GGML_TYPE_Q4_0_4_4; return GGML_TYPE_Q4_0_4_4;
} }
} else if (cur->type == GGML_TYPE_IQ4_NL) { } else if (cur->type == GGML_TYPE_IQ4_NL) {
if (ggml_cpu_has_neon()) { if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
return GGML_TYPE_IQ4_NL_4_4; return GGML_TYPE_IQ4_NL_4_4;
} }
} }

View file

@ -109,10 +109,11 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
#if defined(__ARM_ARCH) #if defined(__ARM_ARCH)
struct ggml_arm_arch_features_type { struct ggml_arm_arch_features_type {
int has_neon; int has_neon;
int has_dotprod;
int has_i8mm; int has_i8mm;
int has_sve; int has_sve;
int sve_cnt; int sve_cnt;
} ggml_arm_arch_features = {-1, -1, -1, 0}; } ggml_arm_arch_features = {-1, -1, -1, -1, 0};
#endif #endif
@ -2448,6 +2449,7 @@ static void ggml_init_arm_arch_features(void) {
uint32_t hwcap2 = getauxval(AT_HWCAP2); uint32_t hwcap2 = getauxval(AT_HWCAP2);
ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD); ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
ggml_arm_arch_features.has_dotprod = !!(hwcap && HWCAP_ASIMDDP);
ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM); ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE); ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
@ -2462,6 +2464,11 @@ static void ggml_init_arm_arch_features(void) {
} }
ggml_arm_arch_features.has_neon = oldp; ggml_arm_arch_features.has_neon = oldp;
if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
oldp = 0;
}
ggml_arm_arch_features.has_dotprod = oldp;
if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) { if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
oldp = 0; oldp = 0;
} }
@ -13890,6 +13897,14 @@ int ggml_cpu_has_neon(void) {
#endif #endif
} }
int ggml_cpu_has_dotprod(void) {
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
return ggml_arm_arch_features.has_dotprod;
#else
return 0;
#endif
}
int ggml_cpu_has_sve(void) { int ggml_cpu_has_sve(void) {
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE) #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
return ggml_arm_arch_features.has_sve; return ggml_arm_arch_features.has_sve;