ggml : add run-time detection of neon, i8mm and sve (#9331)
* ggml: Added run-time detection of neon, i8mm and sve Adds run-time detection of the Arm instructions set features neon, i8mm and sve for Linux and Apple build targets. * ggml: Extend feature detection to include non aarch64 Arm arch * ggml: Move definition of ggml_arm_arch_features to the global data section
This commit is contained in:
		
							parent
							
								
									89f9944981
								
							
						
					
					
						commit
						6a0f779484
					
				
					 5 changed files with 93 additions and 32 deletions
				
			
		|  | @ -598,15 +598,6 @@ size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_ | |||
|     return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8); | ||||
| } | ||||
| 
 | ||||
| // Return the number of byte lanes in the SVE vector if SVE is supported; otherwise, returns 0 if SVE is not supported.
 | ||||
| static int sve_lane_count(void) { | ||||
| #if defined(__ARM_FEATURE_SVE) | ||||
|     return ggml_sve_cnt_b; | ||||
| #else | ||||
|     return 0; | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { | ||||
|     const int qk = QK8_0; | ||||
|     const int nb = n / qk; | ||||
|  | @ -843,7 +834,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * | |||
| 
 | ||||
| #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) | ||||
| #if defined(__ARM_FEATURE_SVE) | ||||
|     if (ggml_cpu_has_sve() && sve_lane_count() == QK8_0) { | ||||
|     if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) { | ||||
|         const void * b_ptr = vx; | ||||
|         const void * a_ptr = vy; | ||||
|         float * res_ptr = s; | ||||
|  | @ -2020,7 +2011,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * | |||
| 
 | ||||
| #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) | ||||
| #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) | ||||
|     if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && sve_lane_count() == QK8_0) { | ||||
|     if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) { | ||||
|         const void * b_ptr = vx; | ||||
|         const void * a_ptr = vy; | ||||
|         float * res_ptr = s; | ||||
|  |  | |||
|  | @ -4013,7 +4013,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r | |||
|     svfloat32_t sumv0 = svdup_n_f32(0.0f); | ||||
|     svfloat32_t sumv1 = svdup_n_f32(0.0f); | ||||
| 
 | ||||
|     const int vector_length = ggml_sve_cnt_b*8; | ||||
|     const int vector_length = ggml_cpu_get_sve_cnt()*8; | ||||
| 
 | ||||
|     // VLA Implementation using switch case
 | ||||
|     switch (vector_length) { | ||||
|  | @ -5597,7 +5597,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r | |||
|     svfloat32_t sumv0 = svdup_n_f32(0.0f); | ||||
|     svfloat32_t sumv1 = svdup_n_f32(0.0f); | ||||
| 
 | ||||
|     const int vector_length = ggml_sve_cnt_b*8; | ||||
|     const int vector_length = ggml_cpu_get_sve_cnt()*8; | ||||
| 
 | ||||
|     //VLA Implemenation for SVE
 | ||||
|     switch (vector_length) { | ||||
|  |  | |||
|  | @ -142,10 +142,6 @@ void iq2xs_free_impl(enum ggml_type type); | |||
| void iq3xs_init_impl(int grid_size); | ||||
| void iq3xs_free_impl(int grid_size); | ||||
| 
 | ||||
| #if defined(__ARM_FEATURE_SVE) | ||||
| extern int ggml_sve_cnt_b; | ||||
| #endif | ||||
| 
 | ||||
| #ifdef __cplusplus | ||||
| } | ||||
| #endif | ||||
|  |  | |||
							
								
								
									
										101
									
								
								ggml/src/ggml.c
									
										
									
									
									
								
							
							
						
						
									
										101
									
								
								ggml/src/ggml.c
									
										
									
									
									
								
							|  | @ -39,9 +39,6 @@ | |||
| #include <unistd.h> | ||||
| #endif | ||||
| 
 | ||||
| #if defined(__ARM_FEATURE_SVE) | ||||
| int ggml_sve_cnt_b = 0; | ||||
| #endif | ||||
| #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) | ||||
| #undef GGML_USE_LLAMAFILE | ||||
| #endif | ||||
|  | @ -455,6 +452,15 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; | |||
| // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 | ||||
| float ggml_table_f32_f16[1 << 16]; | ||||
| 
 | ||||
| #if defined(__ARM_ARCH) | ||||
| struct ggml_arm_arch_features_type { | ||||
|     int has_neon; | ||||
|     int has_i8mm; | ||||
|     int has_sve; | ||||
|     int sve_cnt; | ||||
| } ggml_arm_arch_features = {-1, -1, -1, 0}; | ||||
| #endif | ||||
| 
 | ||||
| GGML_CALL const char * ggml_status_to_string(enum ggml_status status) { | ||||
|     switch (status) { | ||||
|         case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)"; | ||||
|  | @ -3673,6 +3679,66 @@ static inline int ggml_up(int n, int m) { | |||
| 
 | ||||
| ////////////////////////////////////////////////////////////////////////////////
 | ||||
| 
 | ||||
| #if defined(__ARM_ARCH) | ||||
| 
 | ||||
| #if defined(__linux__) && defined(__aarch64__) | ||||
| #include <sys/auxv.h> | ||||
| #elif defined(__APPLE__) | ||||
| #include <sys/sysctl.h> | ||||
| #endif | ||||
| 
 | ||||
| static void ggml_init_arm_arch_features(void) { | ||||
| #if defined(__linux__) && defined(__aarch64__) | ||||
|     uint32_t hwcap = getauxval(AT_HWCAP); | ||||
|     uint32_t hwcap2 = getauxval(AT_HWCAP2); | ||||
| 
 | ||||
|     ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD); | ||||
|     ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM); | ||||
|     ggml_arm_arch_features.has_sve  = !!(hwcap & HWCAP_SVE); | ||||
| 
 | ||||
| #if defined(__ARM_FEATURE_SVE) | ||||
|     ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL); | ||||
| #endif | ||||
| #elif defined(__APPLE__) | ||||
|     int oldp = 0; | ||||
|     size_t size = sizeof(oldp); | ||||
|     if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) { | ||||
|         oldp = 0; | ||||
|     } | ||||
|     ggml_arm_arch_features.has_neon = oldp; | ||||
| 
 | ||||
|     if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) { | ||||
|         oldp = 0; | ||||
|     } | ||||
|     ggml_arm_arch_features.has_i8mm = oldp; | ||||
| 
 | ||||
|     ggml_arm_arch_features.has_sve = 0; | ||||
|     ggml_arm_arch_features.sve_cnt = 0; | ||||
| #else | ||||
| // Run-time CPU feature detection not implemented for this platform, fallback to compile time
 | ||||
| #if defined(__ARM_NEON) | ||||
|     ggml_arm_arch_features.has_neon = 1; | ||||
| #else | ||||
|     ggml_arm_arch_features.has_neon = 0; | ||||
| #endif | ||||
| 
 | ||||
| #if defined(__ARM_FEATURE_MATMUL_INT8) | ||||
|     ggml_arm_arch_features.has_i8mm = 1; | ||||
| #else | ||||
|     ggml_arm_arch_features.has_i8mm = 0; | ||||
| #endif | ||||
| 
 | ||||
| #if defined(__ARM_FEATURE_SVE) | ||||
|     ggml_arm_arch_features.has_sve = 1; | ||||
|     ggml_arm_arch_features.sve_cnt = 16; | ||||
| #else | ||||
|     ggml_arm_arch_features.has_sve = 0; | ||||
|     ggml_arm_arch_features.sve_cnt = 0; | ||||
| #endif | ||||
| #endif | ||||
| } | ||||
| #endif | ||||
| 
 | ||||
| struct ggml_context * ggml_init(struct ggml_init_params params) { | ||||
|     // make this function thread safe
 | ||||
|     ggml_critical_section_start(); | ||||
|  | @ -3723,6 +3789,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { | |||
|             GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); | ||||
|         } | ||||
| 
 | ||||
| #if defined(__ARM_ARCH) | ||||
|         ggml_init_arm_arch_features(); | ||||
| #endif | ||||
| 
 | ||||
|         is_first_call = false; | ||||
|     } | ||||
| 
 | ||||
|  | @ -3771,12 +3841,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { | |||
| 
 | ||||
|     GGML_ASSERT_ALIGNED(ctx->mem_buffer); | ||||
| 
 | ||||
| #if defined(__ARM_FEATURE_SVE) | ||||
|     if (!ggml_sve_cnt_b) { | ||||
|         ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL); | ||||
|     } | ||||
| #endif | ||||
| 
 | ||||
|     GGML_PRINT_DEBUG("%s: context initialized\n", __func__); | ||||
| 
 | ||||
|     ggml_critical_section_end(); | ||||
|  | @ -23578,16 +23642,16 @@ int ggml_cpu_has_fma(void) { | |||
| } | ||||
| 
 | ||||
| int ggml_cpu_has_neon(void) { | ||||
| #if defined(__ARM_NEON) | ||||
|     return 1; | ||||
| #if defined(__ARM_ARCH) | ||||
|     return ggml_arm_arch_features.has_neon; | ||||
| #else | ||||
|     return 0; | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| int ggml_cpu_has_sve(void) { | ||||
| #if defined(__ARM_FEATURE_SVE) | ||||
|     return 1; | ||||
| #if defined(__ARM_ARCH) | ||||
|     return ggml_arm_arch_features.has_sve; | ||||
| #else | ||||
|     return 0; | ||||
| #endif | ||||
|  | @ -23734,11 +23798,18 @@ int ggml_cpu_has_vsx(void) { | |||
| } | ||||
| 
 | ||||
| int ggml_cpu_has_matmul_int8(void) { | ||||
| #if defined(__ARM_FEATURE_MATMUL_INT8) | ||||
|     return 1; | ||||
| #if defined(__ARM_ARCH) | ||||
|     return ggml_arm_arch_features.has_i8mm; | ||||
| #else | ||||
|     return 0; | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| int ggml_cpu_get_sve_cnt(void) { | ||||
| #if defined(__ARM_ARCH) | ||||
|     return ggml_arm_arch_features.sve_cnt; | ||||
| #else | ||||
|     return 0; | ||||
| #endif | ||||
| } | ||||
| ////////////////////////////////////////////////////////////////////////////////
 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue