add check for tensor dimensions

Charles Xu 2024-11-08 17:01:51 +01:00
parent 5947d72c84
commit 871036d236
8 changed files with 30 additions and 27 deletions
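The repack helpers changed below now return int instead of asserting on the tensor shape: -1 is returned when the row count is not a multiple of the interleave factor or ne[0] is not a multiple of 8, and ggml_prepare_optimal_kernel forwards that result. As a minimal sketch of how a caller might consume the new return value (the call site is not part of this diff, so load_q4_0_weights and the memcpy fallback are hypothetical):

// Hypothetical sketch, not code from this commit: try the runtime repack and
// fall back to the plain Q4_0 layout when the shape is rejected.
#include <string.h>
#include "ggml.h"

// Prototype as declared in this change (see the header diff below).
int ggml_prepare_optimal_kernel(struct ggml_tensor * cur, const void * data, size_t data_size);

static void load_q4_0_weights(struct ggml_tensor * t, const void * data, size_t size) {
    if (ggml_prepare_optimal_kernel(t, data, size) != 0) {
        // -1: shape not suitable for the interleaved AARCH64 layout,
        // so keep the original row-major Q4_0 data.
        memcpy(t->data, data, size);
    }
}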

View file

@@ -874,9 +874,9 @@ ggml/src/ggml-cuda/%.o: \
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 endif # GGML_HIPBLAS
 
-ifdef GGML_CPU_AARCH64
-	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
-	MK_CFLAGS   += -DGGML_USE_CPU_AARCH64
+ifdef GGML_RUNTIME_REPACK
+	MK_CPPFLAGS += -DGGML_USE_RUNTIME_REPACK
+	MK_CFLAGS   += -DGGML_USE_RUNTIME_REPACK
 endif
 
 ifdef GGML_METAL

View file

@@ -92,7 +92,7 @@ else()
 endif()
 
 option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
-option(GGML_CPU_AARCH64 "ggml: use runtime weight quantization to enable optimized GEMM/GEMV kernels for AARCH64 cpu" OFF)
+option(GGML_RUNTIME_REPACK "ggml: use runtime weight quantization to enable optimized GEMM/GEMV kernels for AARCH64 cpu" OFF)
 
 option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
 option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})

View file

@@ -145,7 +145,7 @@ extern "C" {
 GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
 #endif
 
-#ifdef GGML_USE_CPU_AARCH64
+#ifdef GGML_USE_RUNTIME_REPACK
 GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
 #endif

View file

@@ -880,10 +880,10 @@ if (GGML_CPU_HBM)
     target_link_libraries(ggml PUBLIC memkind)
 endif()
 
-if (GGML_CPU_AARCH64)
+if (GGML_RUNTIME_REPACK)
     message(STATUS "Using runtime weight quantization to enable optimized GEMM/GEMV kernels for AARCH64 cpu")
-    add_compile_definitions(GGML_USE_CPU_AARCH64)
+    add_compile_definitions(GGML_USE_RUNTIME_REPACK)
 endif()
 
 if (GGML_CANN)

View file

@@ -3477,10 +3477,9 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     }
 }
 
-#ifdef GGML_USE_CPU_AARCH64
-static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * data, size_t data_size) {
+#ifdef GGML_USE_RUNTIME_REPACK
+static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
-    GGML_ASSERT(t->ne[0] % 8 == 0);
     GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
 
     block_q4_0x4 *dst = (block_q4_0x4 *)t->data;
@@ -3492,9 +3491,12 @@ static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_bloc
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
 
+    if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
     for (int b = 0; b < nrow; b += nrows_interleaved) {
-        for (int64_t x = 0; x < nblocks; x++)
-        {
+        for (int64_t x = 0; x < nblocks; x++) {
             for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
@@ -3502,13 +3504,13 @@ static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_bloc
         }
         src += nrows_interleaved * nblocks;
     }
+    return 0;
 
     GGML_UNUSED(data_size);
 }
 
-static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * data, size_t data_size) {
+static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
-    GGML_ASSERT(t->ne[0] % 8 == 0);
     GGML_ASSERT(interleave_block == 8);
 
     block_q4_0x8 *dst = (block_q4_0x8*)t->data;
@@ -3520,6 +3522,10 @@ static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
 
+    if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
     for (int b = 0; b < nrow; b += nrows_interleaved) {
         for (int64_t x = 0; x < nblocks; x++) {
             for (int i = 0; i < nrows_interleaved; i++ ) {
@@ -3529,6 +3535,7 @@ static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block
         }
         src += nrows_interleaved * nblocks;
     }
+    return 0;
 
     GGML_UNUSED(data_size);
 }
@@ -3536,22 +3543,18 @@ static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block
 // Prepare for optimized kernels if applicable
 int ggml_prepare_optimal_kernel(struct ggml_tensor * cur, const void * data, size_t data_size) {
     GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
-    int ret = -1;
 #if defined(__ARM_ARCH)
     if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
-        repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
-        ret = 0;
+        return repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
     }
     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
-        ret = 0;
+        return repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
     }
     else if (ggml_cpu_has_neon()) {
-        repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
-        ret = 0;
+        return repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
     }
 #endif
-    return ret;
+    return -1;
 
     GGML_UNUSED(cur);
     GGML_UNUSED(data);
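For reference, the shape gate added above reduces to a single predicate on the tensor's row length and row count. A standalone sketch (the helper name and the example shapes are illustrative, not from the commit):

// Mirrors the new check: nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0,
// where nrows_interleaved is 4 or 8 depending on the repack variant chosen.
#include <stdint.h>
#include <stdio.h>

static int repack_shape_ok(int64_t ne0, int64_t nrow, int nrows_interleaved) {
    return (nrow % nrows_interleaved == 0) && (ne0 % 8 == 0);
}

int main(void) {
    printf("%d\n", repack_shape_ok(4096, 4096, 4)); // 1: repack proceeds
    printf("%d\n", repack_shape_ok(4100, 4096, 8)); // 0: ne[0] not a multiple of 8 -> -1
    printf("%d\n", repack_shape_ok(4096, 4097, 4)); // 0: rows not a multiple of 4 -> -1
    return 0;
}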

View file

@@ -33,7 +33,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
-#ifdef GGML_USE_CPU_AARCH64
+#ifdef GGML_USE_RUNTIME_REPACK
 int ggml_prepare_optimal_kernel(struct ggml_tensor * cur, const void * data, size_t data_size);
 enum ggml_type ggml_get_optimal_type(const struct ggml_tensor * cur);
 #endif

View file

@@ -2239,7 +2239,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
 }
 #endif
 
-#ifdef GGML_USE_CPU_AARCH64
+#ifdef GGML_USE_RUNTIME_REPACK
 
 // buffer type AARCH64
@@ -2316,7 +2316,7 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backen
         bufts[index++] = ggml_backend_cpu_hbm_buffer_type();
 #endif
 
-#ifdef GGML_USE_CPU_AARCH64
+#ifdef GGML_USE_RUNTIME_REPACK
         if (ggml_cpu_has_neon() || ggml_cpu_has_matmul_int8() || ggml_cpu_has_sve()) {
             bufts[index++] = ggml_backend_cpu_aarch64_buffer_type();
         }
@@ -2635,7 +2635,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_b
 }
 
 static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
-#ifdef GGML_USE_CPU_AARCH64
+#ifdef GGML_USE_RUNTIME_REPACK
     const struct ggml_tensor *tensor = op->src[0];
     if (tensor && tensor->buffer && (strcmp(tensor->buffer->buft->iface.get_name(tensor->buffer->buft),"CPU_AARCH64") == 0)) {
         if (op->op == GGML_OP_MUL_MAT && tensor->type == GGML_TYPE_Q4_0) {

View file

@@ -7427,7 +7427,7 @@ static void ggml_compute_forward_mul_mat(
 
     enum ggml_type type = src0->type;
 
-#ifdef GGML_USE_CPU_AARCH64
+#ifdef GGML_USE_RUNTIME_REPACK
     if (strcmp(src0->buffer->buft->iface.get_name(src0->buffer->buft),"CPU_AARCH64") == 0) {
         type = ggml_get_optimal_type(src0);
     }