From f4eb1b38546bca8385c243fce1b93515d5ca707a Mon Sep 17 00:00:00 2001 From: Charles Xu Date: Tue, 4 Feb 2025 13:38:27 +0100 Subject: [PATCH] Add support for multithread LHS conversion --- ggml/src/ggml-cpu/CMakeLists.txt | 12 +++---- .../ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp | 33 +++++++++++-------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index bba18303b..4b0ca0daa 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -117,7 +117,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ) if (GGML_MACHINE_SUPPORTS_${tag}) set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+${tag}" PARENT_SCOPE) - else() + elseif(NOT tag STREQUAL "sme") set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE) endif() set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) @@ -325,9 +325,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # Fetch KleidiAI sources: include(FetchContent) - set(KLEIDIAI_COMMIT_TAG "v1.2.0") + set(KLEIDIAI_COMMIT_TAG "v1.3.0") set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz") - set(KLEIDIAI_ARCHIVE_MD5 "6634fefce7357ecfee9eace2068bc68b") + set(KLEIDIAI_ARCHIVE_MD5 "060bd2dc64642b091f461cc8dd7426d9") if (POLICY CMP0135) cmake_policy(SET CMP0135 NEW) @@ -370,9 +370,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/) - string(FIND ${ARCH_FLAGS} "+dotprod" DOTPROD_ENABLED) - string(FIND ${ARCH_FLAGS} "+i8mm" I8MM_ENABLED) - string(FIND ${ARCH_FLAGS} "+sme" SME_ENABLED) + string(FIND "${ARCH_FLAGS}" "+dotprod" DOTPROD_ENABLED) + string(FIND "${ARCH_FLAGS}" "+i8mm" I8MM_ENABLED) + string(FIND "${ARCH_FLAGS}" "+sme" SME_ENABLED) set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS}) diff --git a/ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp 
b/ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp index 32eadbf49..77fe8e86b 100644 --- a/ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp +++ b/ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp @@ -114,30 +114,37 @@ class tensor_traits : public ggml::cpu::tensor_traits { size_t sr = kernel->get_sr(); size_t bl = k_q4_0_block_size; - const size_t lhs_packed_offset = lhs_info->get_packed_offset(0, k, bl, mr, kr, sr); + // Calculate number of rows to be processed per thread + const size_t num_m_per_thread = kai_roundup(m, nth) / nth; + const size_t m_start = ith * num_m_per_thread; + size_t m_to_process = num_m_per_thread; + if ((m_start + m_to_process) > m) { + m_to_process = m - m_start; + } - if (ith == 0) { + if(m_start < m) { // Transform LHS - const size_t src_stride = src1->nb[1]; - const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(0, dst->src[1]->nb[1])); - void * dst_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset); + const size_t src_stride = src1->nb[1]; + const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(0, dst->src[1]->nb[1])); + const size_t lhs_packed_offset = lhs_info->get_packed_offset(m_start, k, bl, mr, kr, sr); + void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset); - lhs_info->pack_func(m, k, bl, mr, kr, sr, 0, src_ptr, src_stride, dst_ptr); + lhs_info->pack_func(m_to_process, k, bl, mr, kr, sr, m_start, src_ptr, src_stride, lhs_packed_ptr); } ggml_barrier(params->threadpool); - // Perform the operation - const size_t dst_stride = dst->nb[1]; + // Perform the operation + const size_t dst_stride = dst->nb[1]; + const size_t lhs_packed_offset = lhs_info->get_packed_offset(0, k, k_q4_0_block_size, mr, kr, sr); const size_t rhs_packed_offset = kernel->get_rhs_packed_offset(n_start, k, k_q4_0_block_size); const size_t dst_offset = kernel->get_dst_offset(0, n_start, dst_stride); - - const void * lhs_ptr = static_cast<const void *>(lhs_packed + lhs_packed_offset); - const void * rhs_ptr = 
static_cast<const void *>(rhs_packed + rhs_packed_offset); - float *dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset); + const void * rhs_ptr = static_cast<const void *>(rhs_packed + rhs_packed_offset); + const void* lhs_ptr = (const void*)((const char *)lhs_packed + lhs_packed_offset); + float *dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset); kernel->run_kernel(m, n_to_process, k, k_q4_0_block_size, lhs_ptr, rhs_ptr, dst_ptr, - dst_stride, sizeof(float), -FLT_MAX, FLT_MAX); + dst_stride, sizeof(float), -FLT_MAX, FLT_MAX); return true; } return false;