diff --git a/Makefile b/Makefile
index 9a541f21d..b11efd961 100644
--- a/Makefile
+++ b/Makefile
@@ -874,9 +874,9 @@ ggml/src/ggml-cuda/%.o: \
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 endif # GGML_HIPBLAS
 
-ifdef GGML_CPU_AARCH64
-	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
-	MK_CFLAGS   += -DGGML_USE_CPU_AARCH64
+ifdef GGML_RUNTIME_REPACK
+	MK_CPPFLAGS += -DGGML_USE_RUNTIME_REPACK
+	MK_CFLAGS   += -DGGML_USE_RUNTIME_REPACK
 endif
 
 ifdef GGML_METAL
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 33422425d..6732f1880 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -92,7 +92,7 @@ else()
 endif()
 
 option(GGML_CPU_HBM     "ggml: use memkind for CPU HBM" OFF)
-option(GGML_CPU_AARCH64 "ggml: use runtime weight quantization to enable optimized GEMM/GEMV kernels for AARCH64 cpu" OFF)
+option(GGML_RUNTIME_REPACK "ggml: use runtime weight quantization to enable optimized GEMM/GEMV kernels for AARCH64 cpu" OFF)
 
 option(GGML_AVX         "ggml: enable AVX" ${INS_ENB})
 option(GGML_AVX2        "ggml: enable AVX2" ${INS_ENB})
diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index 39b081dae..50d9cfd59 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -145,7 +145,7 @@ extern "C" {
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
 #endif
 
-#ifdef GGML_USE_CPU_AARCH64
+#ifdef GGML_USE_RUNTIME_REPACK
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
 #endif
 
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 122573201..b7aa6de40 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -880,10 +880,10 @@ if (GGML_CPU_HBM)
 
     target_link_libraries(ggml PUBLIC memkind)
 endif()
 
-if (GGML_CPU_AARCH64)
+if (GGML_RUNTIME_REPACK)
     message(STATUS "Using runtime weight quantization to enable optimized GEMM/GEMV kernels for AARCH64 cpu")
-    add_compile_definitions(GGML_USE_CPU_AARCH64)
+    add_compile_definitions(GGML_USE_RUNTIME_REPACK)
 endif()
 
 if (GGML_CANN)
diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index 801cf2bdc..78ba8a0a4 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -3477,10 +3477,9 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     }
 }
 
-#ifdef GGML_USE_CPU_AARCH64
-static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * data, size_t data_size) {
+#ifdef GGML_USE_RUNTIME_REPACK
+static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
-    GGML_ASSERT(t->ne[0] % 8 == 0);
     GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
 
     block_q4_0x4 *dst = (block_q4_0x4 *)t->data;
@@ -3492,9 +3491,12 @@ static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_bloc
 
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
 
+    if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
     for (int b = 0; b < nrow; b += nrows_interleaved) {
-        for (int64_t x = 0; x < nblocks; x++)
-        {
+        for (int64_t x = 0; x < nblocks; x++) {
             for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
@@ -3502,13 +3504,13 @@ static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_bloc
         }
         src += nrows_interleaved * nblocks;
     }
+    return 0;
 
     GGML_UNUSED(data_size);
 }
 
-static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * data, size_t data_size) {
+static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
-    GGML_ASSERT(t->ne[0] % 8 == 0);
     GGML_ASSERT(interleave_block == 8);
 
     block_q4_0x8 *dst = (block_q4_0x8*)t->data;
@@ -3520,6 +3522,10 @@ static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block
 
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
 
+    if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
     for (int b = 0; b < nrow; b += nrows_interleaved) {
         for (int64_t x = 0; x < nblocks; x++) {
             for (int i = 0; i < nrows_interleaved; i++ ) {
@@ -3529,6 +3535,7 @@ static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block
         }
         src += nrows_interleaved * nblocks;
     }
+    return 0;
 
     GGML_UNUSED(data_size);
 }
@@ -3536,22 +3543,18 @@ static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block
 // Prepare for optimized kernels if applicable
 int ggml_prepare_optimal_kernel(struct ggml_tensor * cur, const void * data, size_t data_size) {
     GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
-    int ret = -1;
 #if defined(__ARM_ARCH)
     if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
-        repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
-        ret = 0;
+        return repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
     }
     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
-        ret = 0;
+        return repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
     }
     else if (ggml_cpu_has_neon()) {
-        repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
-        ret = 0;
+        return repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
     }
 #endif
-    return ret;
+    return -1;
 
     GGML_UNUSED(cur);
     GGML_UNUSED(data);
diff --git a/ggml/src/ggml-aarch64.h b/ggml/src/ggml-aarch64.h
index 0353c6be4..74eddb060 100644
--- a/ggml/src/ggml-aarch64.h
+++ b/ggml/src/ggml-aarch64.h
@@ -33,7 +33,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
-#ifdef GGML_USE_CPU_AARCH64
+#ifdef GGML_USE_RUNTIME_REPACK
 int ggml_prepare_optimal_kernel(struct ggml_tensor * cur, const void * data, size_t data_size);
 enum ggml_type ggml_get_optimal_type(const struct ggml_tensor * cur);
 #endif
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index b21a92a76..5f8cb543c 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -2239,7 +2239,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
 }
 #endif
 
-#ifdef GGML_USE_CPU_AARCH64
+#ifdef GGML_USE_RUNTIME_REPACK
 
 // buffer type AARCH64
 
@@ -2316,7 +2316,7 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backen
         bufts[index++] = ggml_backend_cpu_hbm_buffer_type();
 #endif
 
-#ifdef GGML_USE_CPU_AARCH64
+#ifdef GGML_USE_RUNTIME_REPACK
     if (ggml_cpu_has_neon() || ggml_cpu_has_matmul_int8() || ggml_cpu_has_sve()) {
         bufts[index++] = ggml_backend_cpu_aarch64_buffer_type();
     }
@@ -2635,7 +2635,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_b
 }
 
 static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
-#ifdef GGML_USE_CPU_AARCH64
+#ifdef GGML_USE_RUNTIME_REPACK
     const struct ggml_tensor *tensor = op->src[0];
     if (tensor && tensor->buffer && (strcmp(tensor->buffer->buft->iface.get_name(tensor->buffer->buft),"CPU_AARCH64") == 0)) {
         if (op->op == GGML_OP_MUL_MAT && tensor->type == GGML_TYPE_Q4_0) {
diff --git a/ggml/src/ggml-cpu.c b/ggml/src/ggml-cpu.c
index b62fd3413..40ce0e5c4 100644
--- a/ggml/src/ggml-cpu.c
+++ b/ggml/src/ggml-cpu.c
@@ -7427,7 +7427,7 @@ static void ggml_compute_forward_mul_mat(
 
     enum ggml_type type = src0->type;
 
-#ifdef GGML_USE_CPU_AARCH64
+#ifdef GGML_USE_RUNTIME_REPACK
     if (strcmp(src0->buffer->buft->iface.get_name(src0->buffer->buft),"CPU_AARCH64") == 0) {
         type = ggml_get_optimal_type(src0);
     }
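/*
 * A minimal caller-side sketch, not part of the patch above: the repack
 * helpers now return 0 on success and -1 when the tensor does not fit the
 * interleaved layout (row count not a multiple of nrows_interleaved, or
 * ne[0] % 8 != 0) instead of asserting, and ggml_prepare_optimal_kernel()
 * propagates that result. The loader function below is hypothetical and only
 * illustrates how a caller could fall back to the plain Q4_0 layout.
 */
#include <string.h>

#include "ggml.h"
#include "ggml-aarch64.h"

static void load_q4_0_tensor(struct ggml_tensor * cur, const void * data, size_t data_size) {
#ifdef GGML_USE_RUNTIME_REPACK
    // Try to repack into the interleaved block layout used by the optimized
    // AARCH64 GEMM/GEMV kernels; a non-zero return means "not applicable",
    // not a fatal error.
    if (ggml_prepare_optimal_kernel(cur, data, data_size) == 0) {
        return; // weights repacked in place
    }
#endif
    // Fallback: keep the standard Q4_0 block layout.
    memcpy(cur->data, data, data_size);
}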
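/*
 * Build-flag note (an assumption inferred from the Makefile and CMake hunks
 * above, not verified command output): after the rename, the feature would be
 * enabled with `make GGML_RUNTIME_REPACK=1` or
 * `cmake -B build -DGGML_RUNTIME_REPACK=ON`, either of which defines
 * GGML_USE_RUNTIME_REPACK for the compilation units shown in this diff.
 */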