From 9edd10737bdb5df3f84edcf5923dc6f3f18a86e9 Mon Sep 17 00:00:00 2001 From: Charles Xu Date: Fri, 7 Feb 2025 16:09:58 +0100 Subject: [PATCH] updates for review comments --- ggml/src/ggml-cpu/CMakeLists.txt | 23 +++-- .../ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp | 12 ++- .../ggml-kleidiai/kleidiai_kernels.cpp | 89 +++++++++++++++++++ 3 files changed, 114 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 6d8fce504..10c7eb847 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -111,14 +111,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name) function(check_arm_feature tag code) set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+${tag}") - check_cxx_source_runs( - "${code}" - GGML_MACHINE_SUPPORTS_${tag} - ) + check_cxx_source_runs("${code}" GGML_MACHINE_SUPPORTS_${tag}) if (GGML_MACHINE_SUPPORTS_${tag}) set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+${tag}" PARENT_SCOPE) - elseif(NOT tag STREQUAL "sme") - set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE) + else() + set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+no${tag}") + check_cxx_source_compiles("${code}" GGML_MACHINE_SUPPORTS_no${tag}) + if (GGML_MACHINE_SUPPORTS_no${tag}) + set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE) + endif() endif() set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) endfunction() @@ -370,9 +371,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/) - string(FIND "${ARCH_FLAGS}" "+dotprod" DOTPROD_ENABLED) - string(FIND "${ARCH_FLAGS}" "+i8mm" I8MM_ENABLED) - string(FIND "${ARCH_FLAGS}" "+sme" SME_ENABLED) + set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}") + if (NOT ARCH_FLAGS_TEMP) + string(REGEX MATCH "-march=[^ ]+" ARCH_FLAGS_TEMP "${CMAKE_C_FLAGS}") + endif() + string(FIND "${ARCH_FLAGS_TEMP}" "+dotprod" DOTPROD_ENABLED) + string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED) + string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED) set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS}) diff --git a/ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp b/ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp index 77fe8e86b..8b689880e 100644 --- a/ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp +++ b/ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp @@ -23,6 +23,7 @@ #include "ggml-cpu.h" #include "ggml-impl.h" #include "ggml-backend-impl.h" +#include "ggml-threading.h" #include "kleidiai_kernels.h" @@ -35,6 +36,8 @@ struct ggml_kleidiai_context { } static ctx = { NULL }; static void init_kleidiai_context(void) { + + ggml_critical_section_start(); static bool initialized = false; if (!initialized) { @@ -55,6 +58,12 @@ static void init_kleidiai_context(void) { } ctx.kernels = ggml_kleidiai_select_kernels(features); } + ggml_critical_section_end(); +} + +static inline int ggml_ne(const ggml_tensor * tensor, int dim) { + GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS); + return tensor->ne[dim]; } namespace ggml::cpu::kleidiai { @@ -237,7 +246,8 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type { if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { return false; } - if (op->src[1]->type == GGML_TYPE_F32) { + if (op->src[1]->type == GGML_TYPE_F32 && + ggml_ne(op->src[1], 2) == 1 && ggml_ne(op->src[1], 3) == 1) { return true; } } diff --git a/ggml/src/ggml-cpu/ggml-kleidiai/kleidiai_kernels.cpp b/ggml/src/ggml-cpu/ggml-kleidiai/kleidiai_kernels.cpp index fbb44cf17..4d100c9a9 100644 --- a/ggml/src/ggml-cpu/ggml-kleidiai/kleidiai_kernels.cpp +++ b/ggml/src/ggml-cpu/ggml-kleidiai/kleidiai_kernels.cpp @@ -63,6 +63,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .required_cpu = */ CPU_FEATURE_SME, }, #endif +#if defined(__APPLE__) #if defined(__ARM_FEATURE_DOTPROD) { /* DOTPROD GEMM */ @@ -149,6 +150,94 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM, }, #endif +#else +#if defined(__ARM_FEATURE_MATMUL_INT8) + { + /* i8mm GEMM */ + /* .kern_info = */ { + /* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm, + /* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm, + /* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm, + /* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm, + /* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm, + /* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm, + /* .get_lhs_offset = */ kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm, + /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm, + /* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm, + /* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm, + /* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm, + }, + /* i8mm GEMV */ + /* .kern_info = */ { + /* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + /* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + /* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + /* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + /* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + /* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + /* .get_lhs_offset = */ kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + /* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + /* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + /* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod, + }, + /* .lhs_info = */ { + /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32, + /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32, + /* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32, + /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32, + }, + /* .rhs_info = */ { + /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + }, + /* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM, + }, +#endif +#if defined(__ARM_FEATURE_DOTPROD) + { + /* DOTPROD GEMM */ + /* .kern_info = */ { + /* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod, + /* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod, + /* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod, + /* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod, + /* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod, + /* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod, + /* .get_lhs_offset = */ kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod, + /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod, + /* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod, + /* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod, + /* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod, + }, + /* DOTPROD GEMV */ + /* .kern_info = */ { + /* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod, + /* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod, + /* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod, + /* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod, + /* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod, + /* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod, + /* .get_lhs_offset = */ kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod, + /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod, + /* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod, + /* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod, + /* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod, + }, + /* .lhs_info = */ { + /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32, + /* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32, + /* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32, + /* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32, + }, + /* .rhs_info = */ { + /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + }, + /* .required_cpu = */ CPU_FEATURE_DOTPROD, + }, +#endif +#endif }; ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature features) {