From 95322e93bfd951ada822c9a7931ae58ac1dd8c96 Mon Sep 17 00:00:00 2001
From: Djip007 <3705339+Djip007@users.noreply.github.com>
Date: Tue, 3 Dec 2024 00:09:44 +0100
Subject: [PATCH] clang-format

---
 ggml/src/ggml-cpu/amx/amx.cpp          | 133 ++++----
 ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp | 448 +++++++++++++------------
 ggml/src/ggml-cpu/ggml-cpu-aarch64.h   |   3 +-
 ggml/src/ggml-cpu/ggml-cpu-hbm.cpp     |  26 +-
 ggml/src/ggml-cpu/ggml-cpu-hbm.h       |   3 +-
 ggml/src/ggml-cpu/ggml-cpu-traits.cpp  |  14 +-
 ggml/src/ggml-cpu/ggml-cpu-traits.h    |  38 +--
 7 files changed, 334 insertions(+), 331 deletions(-)

diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp
index 3d8d91f75..a57ec2496 100644
--- a/ggml/src/ggml-cpu/amx/amx.cpp
+++ b/ggml/src/ggml-cpu/amx/amx.cpp
@@ -20,27 +20,26 @@
 // AMX type_trais
 namespace ggml::cpu::amx {
-    class tensor_traits : public ggml::cpu::tensor_traits {
+class tensor_traits : public ggml::cpu::tensor_traits {
+    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
+        size = ggml_backend_amx_desired_wsize(op);
+        return true;
+    }
 
-        bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
-            size = ggml_backend_amx_desired_wsize(op);
+    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
+        if (op->op == GGML_OP_MUL_MAT) {
+            ggml_backend_amx_mul_mat(params, op);
             return true;
         }
-
-        bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
-            if (op->op == GGML_OP_MUL_MAT) {
-                ggml_backend_amx_mul_mat(params, op);
-                return true;
-            }
-            return false;
-        }
-    };
-
-    static ggml::cpu::tensor_traits* get_tensor_traits(ggml_backend_buffer_t , struct ggml_tensor *) {
-        static tensor_traits traits;
-        return &traits;
+        return false;
     }
+};
+
+static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
+    static tensor_traits traits;
+    return &traits;
 }
+}  // namespace ggml::cpu::amx
 
 // AMX buffer interface
 static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -48,26 +47,28 @@ static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 }
 
 static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return (void *)(buffer->context);
+    return (void *) (buffer->context);
 }
 
 static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    tensor->extra = (void *)ggml::cpu::amx::get_tensor_traits(buffer, tensor);
+    tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);
 
     GGML_UNUSED(buffer);
 }
 
-static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
-    memset((char *)tensor->data + offset, value, size);
+static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                                  uint8_t value, size_t offset, size_t size) {
+    memset((char *) tensor->data + offset, value, size);
 
     GGML_UNUSED(buffer);
 }
 
-static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                               const void * data, size_t offset, size_t size) {
     if (qtype_has_amx_kernels(tensor->type)) {
         ggml_backend_amx_convert_weight(tensor, data, offset, size);
     } else {
-        memcpy((char *)tensor->data + offset, data, size);
+        memcpy((char *) tensor->data + offset, data, size);
     }
 
     GGML_UNUSED(buffer);
@@ -136,49 +137,42 @@ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_typ
 }
 
 namespace ggml::cpu::amx {
-    class extra_buffer_type : ggml::cpu::extra_buffer_type {
-        bool supports_op(ggml_backend_dev_t , const struct ggml_tensor * op) override {
-            // handle only 2d gemm for now
-            auto is_contiguous_2d = [](const struct ggml_tensor * t) {
-                return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
-            };
+class extra_buffer_type : ggml::cpu::extra_buffer_type {
+    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
+        // handle only 2d gemm for now
+        auto is_contiguous_2d = [](const struct ggml_tensor * t) {
+            return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
+        };
 
-            if ( op->op == GGML_OP_MUL_MAT &&
-                is_contiguous_2d(op->src[0]) && // src0 must be contiguous
-                is_contiguous_2d(op->src[1]) && // src1 must be contiguous
-                op->src[0]->buffer &&
-                op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
-                op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
-                (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))
-                )
-            {
-                // src1 must be host buffer
-                if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
-                    return false;
-                }
-                // src1 must be float32
-                if (op->src[1]->type == GGML_TYPE_F32) {
-                    return true;
-                }
+        if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) &&  // src0 must be contiguous
+            is_contiguous_2d(op->src[1]) &&                               // src1 must be contiguous
+            op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
+            op->ne[0] % (TILE_N * 2) == 0 &&  // out_features is 32x
+            (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
+            // src1 must be host buffer
+            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
+                return false;
             }
-            return false;
+            // src1 must be float32
+            if (op->src[1]->type == GGML_TYPE_F32) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
+        if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
+            op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) {
+            return (ggml::cpu::tensor_traits *) op->src[0]->extra;
         }
-        ggml::cpu::tensor_traits* get_tensor_traits(const struct ggml_tensor * op) override {
-            if ( op->op == GGML_OP_MUL_MAT &&
-                op->src[0]->buffer &&
-                op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()
-                )
-            {
-                return (ggml::cpu::tensor_traits*) op->src[0]->extra;
-            }
+        return nullptr;
+    }
+};
+}  // namespace ggml::cpu::amx
 
-            return nullptr;
-        }
-    };
-}
-
-static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
+static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     return ggml_backend_amx_get_alloc_size(tensor);
 
     GGML_UNUSED(buft);
@@ -200,25 +194,26 @@ static bool ggml_amx_init() {
     return true;
 #endif
 }
+
 ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
     static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
         /* .iface = */ {
-        /* .get_name = */ ggml_backend_amx_buffer_type_get_name,
-        /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
-        /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
-        /* .get_max_size = */ NULL, // defaults to SIZE_MAX
-        /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
-        /* .is_host = */ nullptr,
-        },
+            /* .get_name         = */ ggml_backend_amx_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment,
+            /* .get_max_size     = */ nullptr,  // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size,
+            /* .is_host          = */ nullptr,
+            },
         /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
         /* .context = */ new ggml::cpu::amx::extra_buffer_type(),
     };
 
     if (!ggml_amx_init()) {
-        return NULL;
+        return nullptr;
     }
     return &ggml_backend_buffer_type_amx;
 }
 
-#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+#endif  // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
index 00d6c57f4..d34a90d27 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
@@ -20,31 +20,36 @@
 // TODO: move to include file?
 template <int K> constexpr int QK_0() {
-    if constexpr (K==4) return QK4_0;
-    if constexpr (K==8) return QK8_0;
+    if constexpr (K == 4) {
+        return QK4_0;
+    }
+    if constexpr (K == 8) {
+        return QK8_0;
+    }
     return -1;
 }
 
-template <int K, int N>
-struct block {
-    ggml_half d[N]; // deltas for N qK_0 blocks
-    int8_t qs[(QK_0<K>() * N * K) / 8]; // quants for N qK_0 blocks
+template <int K, int N> struct block {
+    ggml_half d[N];                         // deltas for N qK_0 blocks
+    int8_t    qs[(QK_0<K>() * N * K) / 8];  // quants for N qK_0 blocks
 };
-// control size
-static_assert(sizeof(block<4,4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
-static_assert(sizeof(block<4,8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
-static_assert(sizeof(block<8,4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
-static_assert(sizeof(block<8,8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");
-using block_q4_0x4 = block<4,4>;
-using block_q4_0x8 = block<4,8>;
-using block_q8_0x4 = block<8,4>;
-using block_q8_0x8 = block<8,8>;
+
+// control size
+static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
+static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
+static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
+static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");
+
+using block_q4_0x4 = block<4, 4>;
+using block_q4_0x8 = block<4, 8>;
+using block_q8_0x4 = block<8, 4>;
+using block_q8_0x8 = block<8, 8>;
 
 struct block_iq4_nlx4 {
-    ggml_half d[4]; // deltas for 4 iq4_nl blocks
-    uint8_t qs[QK4_NL * 2];// nibbles / quants for 4 iq4_nl blocks
+    ggml_half d[4];            // deltas for 4 iq4_nl blocks
+    uint8_t   qs[QK4_NL * 2];  // nibbles / quants for 4 iq4_nl blocks
 };
+
 static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
 
 #if defined(__GNUC__)
@@ -3799,161 +3804,172 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
 }
 
 namespace ggml::cpu::aarch64 {
-    // repack
-    template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> int repack(struct ggml_tensor *, const void *, size_t);
-
-    // TODO: generalise.
-    template<> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
-        return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
-    }
-    template<> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
-        return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
-    }
-    template<> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
-        return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
-    }
-    template<> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
-        return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
-    }
-    template<> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
-        return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
-    }
-
-    // gemv
-    template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> void gemv(int, float *, size_t, const void *, const void *, int, int);
-
-    template<> void gemv<block_q4_0, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-        ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
-    }
-    template<> void gemv<block_q4_0, 8, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-        ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
-    }
-    template<> void gemv<block_q4_0, 8, 8>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-        ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
-    }
-    template<> void gemv<block_iq4_nl, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-        ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
-    }
-
-    // gemm
-    template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> void gemm(int, float *, size_t, const void *, const void *, int, int);
-
-    template<> void gemm<block_q4_0, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-        ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
-    }
-    template<> void gemm<block_q4_0, 8, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-        ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
-    }
-    template<> void gemm<block_q4_0, 8, 8>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-        ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
-    }
-    template<> void gemm<block_iq4_nl, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-        ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
-    }
-
-    class tensor_traits_base : public ggml::cpu::tensor_traits {
-    public:
-        virtual int repack (struct ggml_tensor *t, const void * data, size_t data_size) = 0;
-    };
-
-    template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
-    class tensor_traits : public tensor_traits_base {
-
-        bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
-            // not realy a GGML_TYPE_Q8_0 but same size.
-            size = ggml_row_size(GGML_TYPE_Q8_0, ggml_nelements(op->src[1]));
-            return true;
-        }
-
-        bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
-            const struct ggml_tensor * src0 = op->src[0];
-            const struct ggml_tensor * src1 = op->src[1];
-            struct ggml_tensor * dst = op;
-
-            GGML_TENSOR_BINARY_OP_LOCALS
-
-            const int ith = params->ith;
-            const int nth = params->nth;
-
-            GGML_ASSERT(ne0 == ne01);
-            GGML_ASSERT(ne1 == ne11);
-            GGML_ASSERT(ne2 == ne12);
-            GGML_ASSERT(ne3 == ne13);
-
-            // dst cannot be transposed or permuted
-            GGML_ASSERT(nb0 == sizeof(float));
-            GGML_ASSERT(nb0 <= nb1);
-            GGML_ASSERT(nb1 <= nb2);
-            GGML_ASSERT(nb2 <= nb3);
-
-            GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-            GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
-            // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);
-
-            char * wdata = static_cast<char *>(params->wdata);
-            const size_t nbw1 = ggml_row_size(GGML_TYPE_Q8_0, ne10);
-
-            assert(params->wsize >= nbw1*ne11);
-
-            ggml_from_float_t const from_float = ggml_get_type_traits_cpu(GGML_TYPE_Q8_0)->from_float;
-
-            int64_t i11_processed = 0;
-            for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
-                quantize_mat_q8_0((float *)((char *) src1->data + i11*nb11),
-                                  (void *) (wdata + i11*nbw1),
-                                  4, ne10, INTER_SIZE);
-            }
-            i11_processed = ne11 - ne11 % 4;
-            for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
-                from_float((float *)((char *) src1->data + i11*nb11),
-                           (void *) (wdata + i11*nbw1),
-                           ne10);
-            }
-
-            ggml_barrier(params->threadpool);
-
-            const void * src1_wdata = params->wdata;
-            const size_t src1_col_stride = ggml_row_size(GGML_TYPE_Q8_0, ne10);
-            int64_t src0_start = (ith * ne01) / nth;
-            int64_t src0_end = ((ith + 1) * ne01) / nth;
-            src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS): src0_start;
-            src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS): src0_end;
-            if (src0_start >= src0_end) return true;
-
-            // If there are more than three rows in src1, use gemm; otherwise, use gemv.
-            if (ne11 > 3) {
-                gemm<BLOC_TYPE, INTER_SIZE, NB_COLS>(ne00, (float *)((char *) dst->data) + src0_start, ne01,
-                     (const char *) src0->data + src0_start * nb01,
-                     (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
-            }
-            for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
-                gemv<BLOC_TYPE, INTER_SIZE, NB_COLS>(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01,
-                     (const char *) src0->data + src0_start * nb01,
-                     (const char *) src1_wdata + (src1_col_stride * iter), 1,
-                     src0_end - src0_start);
-            }
-
-            return true;
-        }
-
-        int repack (struct ggml_tensor *t, const void * data, size_t data_size) override {
-            return ggml::cpu::aarch64::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
-        }
-
-    };
-
-    // instance for Q4
-    static const tensor_traits<block_q4_0, 4, 4> q4_0_4x4_q8_0;
-    static const tensor_traits<block_q4_0, 8, 4> q4_0_4x8_q8_0;
-    static const tensor_traits<block_q4_0, 8, 8> q4_0_8x8_q8_0;
-
-    // instance for IQ4
-    static const tensor_traits<block_iq4_nl, 4, 4> iq4_nl_4x4_q8_0;
+// repack
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
+int repack(struct ggml_tensor *, const void *, size_t);
+// TODO: generalise.
+template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
 }
+template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
+}
+
+template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
+}
+
+template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
+}
+
+template <> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
+}
+
+// gemv
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
+void gemv(int, float *, size_t, const void *, const void *, int, int);
+
+template <> void gemv<block_q4_0, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemv<block_q4_0, 8, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemv<block_q4_0, 8, 8>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <>
+void gemv<block_iq4_nl, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+// gemm
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
+void gemm(int, float *, size_t, const void *, const void *, int, int);
+
+template <> void gemm<block_q4_0, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemm<block_q4_0, 8, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemm<block_q4_0, 8, 8>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <>
+void gemm<block_iq4_nl, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+class tensor_traits_base : public ggml::cpu::tensor_traits {
+  public:
+    virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
+};
+
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> class tensor_traits : public tensor_traits_base {
+    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
+        // not realy a GGML_TYPE_Q8_0 but same size.
+        size = ggml_row_size(GGML_TYPE_Q8_0, ggml_nelements(op->src[1]));
+        return true;
+    }
+
+    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
+        const struct ggml_tensor * src0 = op->src[0];
+        const struct ggml_tensor * src1 = op->src[1];
+        struct ggml_tensor *       dst  = op;
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        const int ith = params->ith;
+        const int nth = params->nth;
+
+        GGML_ASSERT(ne0 == ne01);
+        GGML_ASSERT(ne1 == ne11);
+        GGML_ASSERT(ne2 == ne12);
+        GGML_ASSERT(ne3 == ne13);
+
+        // dst cannot be transposed or permuted
+        GGML_ASSERT(nb0 == sizeof(float));
+        GGML_ASSERT(nb0 <= nb1);
+        GGML_ASSERT(nb1 <= nb2);
+        GGML_ASSERT(nb2 <= nb3);
+
+        GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+        GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
+        // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);
+
+        char *       wdata = static_cast<char *>(params->wdata);
+        const size_t nbw1  = ggml_row_size(GGML_TYPE_Q8_0, ne10);
+
+        assert(params->wsize >= nbw1 * ne11);
+
+        const ggml_from_float_t from_float = ggml_get_type_traits_cpu(GGML_TYPE_Q8_0)->from_float;
+
+        int64_t i11_processed = 0;
+        for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
+            quantize_mat_q8_0((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10,
+                              INTER_SIZE);
+        }
+        i11_processed = ne11 - ne11 % 4;
+        for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
+            from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
+        }
+
+        ggml_barrier(params->threadpool);
+
+        const void * src1_wdata      = params->wdata;
+        const size_t src1_col_stride = ggml_row_size(GGML_TYPE_Q8_0, ne10);
+        int64_t      src0_start      = (ith * ne01) / nth;
+        int64_t      src0_end        = ((ith + 1) * ne01) / nth;
+        src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
+        src0_end   = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
+        if (src0_start >= src0_end) {
+            return true;
+        }
+
+        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
+        if (ne11 > 3) {
+            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS>(ne00, (float *) ((char *) dst->data) + src0_start, ne01,
+                                                 (const char *) src0->data + src0_start * nb01,
+                                                 (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+        }
+        for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
+            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS>(ne00, (float *) ((char *) dst->data + (iter * nb1)) + src0_start,
+                                                 ne01, (const char *) src0->data + src0_start * nb01,
+                                                 (const char *) src1_wdata + (src1_col_stride * iter), 1,
+                                                 src0_end - src0_start);
+        }
+
+        return true;
+    }
+
+    int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
+        return ggml::cpu::aarch64::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
+    }
+};
+
+// instance for Q4
+static const tensor_traits<block_q4_0, 4, 4> q4_0_4x4_q8_0;
+static const tensor_traits<block_q4_0, 8, 4> q4_0_4x8_q8_0;
+static const tensor_traits<block_q4_0, 8, 8> q4_0_8x8_q8_0;
+
+// instance for IQ4
+static const tensor_traits<block_iq4_nl, 4, 4> iq4_nl_4x4_q8_0;
+
+}  // namespace ggml::cpu::aarch64
+
+static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
     if (cur->type == GGML_TYPE_Q4_0) {
         // TODO: enable for AVX2 - currently disabled due to bad gemv performance
         if (/* ggml_cpu_has_avx2() || */ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
@@ -3975,17 +3991,18 @@ static const ggml::cpu::tensor_traits* ggml_aarch64_get_optimal_repack_type(cons
 }
 
 static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    tensor->extra = (void *)const_cast<ggml::cpu::tensor_traits *>(ggml_aarch64_get_optimal_repack_type(tensor));
+    tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_aarch64_get_optimal_repack_type(tensor));
 
     GGML_UNUSED(buffer);
 }
 
-static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                                       const void * data, size_t offset, size_t size) {
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));
 
-    auto tensor_traits = (ggml::cpu::aarch64::tensor_traits_base*)tensor->extra;
-    auto OK = tensor_traits->repack(tensor, data, size);
+    auto tensor_traits = (ggml::cpu::aarch64::tensor_traits_base *) tensor->extra;
+    auto OK            = tensor_traits->repack(tensor, data, size);
 
     GGML_ASSERT(OK == 0);
     GGML_UNUSED(buffer);
@@ -4004,9 +4021,9 @@ static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(g
         return nullptr;
     }
 
-    buffer->buft = buft;
+    buffer->buft              = buft;
     buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor;
-    buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor;
+    buffer->iface.set_tensor  = ggml_backend_cpu_aarch64_buffer_set_tensor;
 
     return buffer;
 }
@@ -4017,57 +4034,48 @@ static size_t ggml_backend_cpu_aarch64_buffer_type_get_alignment(ggml_backend_bu
 }
 
 namespace ggml::cpu::aarch64 {
-    class extra_buffer_type : ggml::cpu::extra_buffer_type {
-
-        bool supports_op(ggml_backend_dev_t , const struct ggml_tensor * op) override {
-            if ( op->op == GGML_OP_MUL_MAT &&
-                op->src[0]->buffer &&
-                (ggml_n_dims(op->src[0]) == 2) &&
-                op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() &&
-                ggml_aarch64_get_optimal_repack_type(op->src[0])
-                )
-            {
-                if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
-                    return false;
-                }
-                if (op->src[1]->type == GGML_TYPE_F32) {
-                    return true;
-                }
-                //if (op->src[1]->type == GGML_TYPE_Q8_0) {
-                //    return true;
-                //}
-                // may be possible if Q8_0 packed...
+class extra_buffer_type : ggml::cpu::extra_buffer_type {
+    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
+        if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer && (ggml_n_dims(op->src[0]) == 2) &&
+            op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() &&
+            ggml_aarch64_get_optimal_repack_type(op->src[0])) {
+            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
+                return false;
             }
-            return false;
+            if (op->src[1]->type == GGML_TYPE_F32) {
+                return true;
+            }
+            //if (op->src[1]->type == GGML_TYPE_Q8_0) {
+            //    return true;
+            //}
+            // may be possible if Q8_0 packed...
+        }
+        return false;
+    }
+
+    ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
+        if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
+            op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()) {
+            return (ggml::cpu::tensor_traits *) op->src[0]->extra;
         }
-        ggml::cpu::tensor_traits* get_tensor_traits(const struct ggml_tensor * op) override {
-            if ( op->op == GGML_OP_MUL_MAT &&
-                op->src[0]->buffer &&
-                op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()
-                )
-            {
-                return (ggml::cpu::tensor_traits*) op->src[0]->extra;
-            }
-
-            return nullptr;
-        }
-
-    };
-}
+        return nullptr;
+    }
+};
+}  // namespace ggml::cpu::aarch64
 
 ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
     static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = {
-        /* .iface = */ {
-        /* .get_name = */ ggml_backend_cpu_aarch64_buffer_type_get_name,
-        /* .alloc_buffer = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer,
-        /* .get_alignment = */ ggml_backend_cpu_aarch64_buffer_type_get_alignment,
-        /* .get_max_size = */ nullptr, // defaults to SIZE_MAX
-        /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
-        /* .is_host = */ nullptr,
-        },
-        /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
-        /* .context = */ new ggml::cpu::aarch64::extra_buffer_type(),
+        /* .iface    = */ {
+            /* .get_name         = */ ggml_backend_cpu_aarch64_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_aarch64_buffer_type_get_alignment,
+            /* .get_max_size     = */ nullptr,  // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ nullptr,  // defaults to ggml_nbytes
+            /* .is_host          = */ nullptr,
+            },
+        /* .device   = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+        /* .context  = */ new ggml::cpu::aarch64::extra_buffer_type(),
     };
 
     return &ggml_backend_cpu_buffer_type_aarch64;
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
index a1f89c091..6e84c826b 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
@@ -1,9 +1,8 @@
 #pragma once
 
-#include "ggml.h"
 #include "ggml-cpu-traits.h"
+#include "ggml.h"
 
 // GGML internal header
 
 ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
-
diff --git a/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp b/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp
index c079c3302..fa8dea2af 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp
@@ -21,17 +21,18 @@ static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer
     hbw_free(buffer->context);
 }
 
-static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+                                                                           size_t size) {
     void * ptr;
-    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+    int    result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
     if (result != 0) {
         GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
         return NULL;
     }
 
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+    buffer->buft              = buft;
+    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
 
     return buffer;
 }
@@ -39,17 +40,16 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
 ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
     static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
         /* .iface = */ {
-        /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
-        /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
-        /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
-        /* .get_max_size = */ NULL, // defaults to SIZE_MAX
-        /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-        /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
-        },
-        /* .context = */ NULL,
+            /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
+            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ nullptr,  // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ nullptr,  // defaults to ggml_nbytes
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+            },
+        /* .context = */ nullptr,
     };
 
     return &ggml_backend_cpu_buffer_type_hbm;
 }
 
 #endif
-
diff --git a/ggml/src/ggml-cpu/ggml-cpu-hbm.h b/ggml/src/ggml-cpu/ggml-cpu-hbm.h
index f31ece668..09a1f09d7 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-hbm.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-hbm.h
@@ -1,9 +1,8 @@
 #pragma once
 
-#include "ggml.h"
 #include "ggml-backend.h"
+#include "ggml.h"
 
 // GGML CPU internal header
 
 ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-
diff --git a/ggml/src/ggml-cpu/ggml-cpu-traits.cpp b/ggml/src/ggml-cpu/ggml-cpu-traits.cpp
index ffb915031..62a0712da 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-traits.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu-traits.cpp
@@ -1,16 +1,18 @@
 #include "ggml-cpu-traits.h"
-#include "ggml-backend.h"
+
 #include "ggml-backend-impl.h"
+#include "ggml-backend.h"
 
 namespace ggml::cpu {
-    tensor_traits::~tensor_traits() {}
-    extra_buffer_type::~extra_buffer_type() {}
-}
+tensor_traits::~tensor_traits() {}
+
+extra_buffer_type::~extra_buffer_type() {}
+}  // namespace ggml::cpu
 
 bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
     for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
         if (extra && extra->context) {
-            auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
+            auto buf_extra     = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
             if (tensor_traits && tensor_traits->compute_forward(params, op)) {
                 return true;
@@ -23,7 +25,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
 bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
     for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
         if (extra && extra->context) {
-            auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
+            auto buf_extra     = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
             if (tensor_traits && tensor_traits->work_size(n_threads, op, *size)) {
                 return true;
diff --git a/ggml/src/ggml-cpu/ggml-cpu-traits.h b/ggml/src/ggml-cpu/ggml-cpu-traits.h
index 6e06af74f..3b8cfac43 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-traits.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-traits.h
@@ -1,10 +1,10 @@
 #pragma once
 
-#include "ggml.h"
 #include "ggml-backend-impl.h"
 #include "ggml-cpu-impl.h"
+#include "ggml.h"
 
-#ifdef __cplusplus
-#include <vector>
+#ifdef __cplusplus
+#    include <vector>
 extern "C" {
 #endif
 
@@ -12,27 +12,27 @@ extern "C" {
 bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op);
 bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size);
 
-#ifdef __cplusplus
+#ifdef __cplusplus
 }
 
 namespace ggml::cpu {
-    // register in tensor->extra
-    class tensor_traits {
-    public:
-        ~tensor_traits();
-        virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0;
-        virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
-    };
+// register in tensor->extra
+class tensor_traits {
+  public:
+    ~tensor_traits();
+    virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0;
+    virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
+};
 
-    class extra_buffer_type {
-    public:
-        ~extra_buffer_type();
-        virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
-        virtual tensor_traits* get_tensor_traits(const struct ggml_tensor * op) = 0;
-    };
-} // namespace ggml::cpu
+class extra_buffer_type {
+  public:
+    ~extra_buffer_type();
+    virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
+    virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0;
+};
+}  // namespace ggml::cpu
 
 // implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type();
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
 
 #endif