clang-format

commit 95322e93bf
parent 98ea414f81
Author: Djip007
Date:   2024-12-03 00:09:44 +01:00

7 changed files with 334 additions and 331 deletions
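The hunks below are mechanical: long declarations are re-wrapped to the project's column limit, pointer and cast spacing is normalised, single-statement if bodies gain braces, namespace-closing braces gain a trailing comment, includes are sorted, and the remaining NULL literals in the touched lines become nullptr. As a quick orientation, here is a minimal, hypothetical C++ sketch (not taken from the repository; all names are invented) of the layout conventions the reformatted code converges on:

// Hypothetical sketch only -- names are invented, it just illustrates the layout
// rules visible in the hunks below.
#include <cstddef>
#include <cstdint>
#include <cstring>

namespace demo::style {

struct buffer {
    void * data;  // pointer declarators are spelled "type * name"
};

// long parameter lists are wrapped, with the continuation aligned under the '('
static void buffer_memset(buffer & buf, std::uint8_t value, std::size_t offset,
                          std::size_t size) {
    if (size == 0) {
        return;  // single-statement bodies keep their braces
    }
    std::memset((char *) buf.data + offset, value, size);  // space after a C-style cast
}

// the brace that closes a namespace carries a "// namespace ..." comment:
}  // namespace demo::style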


@@ -20,8 +20,7 @@
 // AMX type_trais
 namespace ggml::cpu::amx {
 class tensor_traits : public ggml::cpu::tensor_traits {
     bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
         size = ggml_backend_amx_desired_wsize(op);
         return true;
@@ -34,13 +33,13 @@ namespace ggml::cpu::amx {
         }
         return false;
     }
 };

-static ggml::cpu::tensor_traits* get_tensor_traits(ggml_backend_buffer_t , struct ggml_tensor *) {
+static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
     static tensor_traits traits;
     return &traits;
 }
-}
+}  // namespace ggml::cpu::amx

 // AMX buffer interface
 static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -48,26 +47,28 @@ static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 }

 static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return (void *)(buffer->context);
+    return (void *) (buffer->context);
 }

 static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    tensor->extra = (void *)ggml::cpu::amx::get_tensor_traits(buffer, tensor);
+    tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);
     GGML_UNUSED(buffer);
 }

-static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
-    memset((char *)tensor->data + offset, value, size);
+static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                                  uint8_t value, size_t offset, size_t size) {
+    memset((char *) tensor->data + offset, value, size);
     GGML_UNUSED(buffer);
 }

-static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                               const void * data, size_t offset, size_t size) {
     if (qtype_has_amx_kernels(tensor->type)) {
         ggml_backend_amx_convert_weight(tensor, data, offset, size);
     } else {
-        memcpy((char *)tensor->data + offset, data, size);
+        memcpy((char *) tensor->data + offset, data, size);
     }
     GGML_UNUSED(buffer);
@@ -136,22 +137,18 @@ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_typ
 }

 namespace ggml::cpu::amx {
 class extra_buffer_type : ggml::cpu::extra_buffer_type {
-    bool supports_op(ggml_backend_dev_t , const struct ggml_tensor * op) override {
+    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
         // handle only 2d gemm for now
         auto is_contiguous_2d = [](const struct ggml_tensor * t) {
             return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
         };

-        if ( op->op == GGML_OP_MUL_MAT &&
-            is_contiguous_2d(op->src[0]) &&  // src0 must be contiguous
+        if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) &&  // src0 must be contiguous
             is_contiguous_2d(op->src[1]) &&  // src1 must be contiguous
-            op->src[0]->buffer &&
-            op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
+            op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
             op->ne[0] % (TILE_N * 2) == 0 &&  // out_features is 32x
-            (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))
-            )
-        {
+            (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
             // src1 must be host buffer
             if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                 return false;
@@ -164,21 +161,18 @@ namespace ggml::cpu::amx {
         return false;
     }

-    ggml::cpu::tensor_traits* get_tensor_traits(const struct ggml_tensor * op) override {
-        if ( op->op == GGML_OP_MUL_MAT &&
-            op->src[0]->buffer &&
-            op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()
-            )
-        {
-            return (ggml::cpu::tensor_traits*) op->src[0]->extra;
+    ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
+        if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
+            op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) {
+            return (ggml::cpu::tensor_traits *) op->src[0]->extra;
         }
         return nullptr;
     }
 };
-}
+}  // namespace ggml::cpu::amx

-static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
+static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     return ggml_backend_amx_get_alloc_size(tensor);

     GGML_UNUSED(buft);
@@ -200,13 +194,14 @@ static bool ggml_amx_init() {
     return true;
 #endif
 }

 ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
     static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
         /* .iface = */ {
             /* .get_name = */ ggml_backend_amx_buffer_type_get_name,
             /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
             /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
-            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+            /* .get_max_size = */ nullptr, // defaults to SIZE_MAX
             /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
             /* .is_host = */ nullptr,
         },
@@ -215,7 +210,7 @@ ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
     };

     if (!ggml_amx_init()) {
-        return NULL;
+        return nullptr;
     }
     return &ggml_backend_buffer_type_amx;


@@ -20,31 +20,36 @@
 // TODO: move to include file?
 template <int K> constexpr int QK_0() {
-    if constexpr (K==4) return QK4_0;
-    if constexpr (K==8) return QK8_0;
+    if constexpr (K == 4) {
+        return QK4_0;
+    }
+    if constexpr (K == 8) {
+        return QK8_0;
+    }
     return -1;
 }

-template<int K, int N>
-struct block {
+template <int K, int N> struct block {
     ggml_half d[N];                        // deltas for N qK_0 blocks
     int8_t qs[(QK_0<K>() * N * K) / 8];    // quants for N qK_0 blocks
 };

 // control size
-static_assert(sizeof(block<4,4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
-static_assert(sizeof(block<4,8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
-static_assert(sizeof(block<8,4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
-static_assert(sizeof(block<8,8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");
+static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
+static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
+static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
+static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");

-using block_q4_0x4 = block<4,4>;
-using block_q4_0x8 = block<4,8>;
-using block_q8_0x4 = block<8,4>;
-using block_q8_0x8 = block<8,8>;
+using block_q4_0x4 = block<4, 4>;
+using block_q4_0x8 = block<4, 8>;
+using block_q8_0x4 = block<8, 4>;
+using block_q8_0x8 = block<8, 8>;

 struct block_iq4_nlx4 {
     ggml_half d[4];            // deltas for 4 iq4_nl blocks
-    uint8_t qs[QK4_NL * 2];// nibbles / quants for 4 iq4_nl blocks
+    uint8_t qs[QK4_NL * 2];    // nibbles / quants for 4 iq4_nl blocks
 };

 static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");

 #if defined(__GNUC__)
@@ -3799,66 +3804,79 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
 }

 namespace ggml::cpu::aarch64 {
 // repack
-template<typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> int repack(struct ggml_tensor *, const void *, size_t);
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
+int repack(struct ggml_tensor *, const void *, size_t);

 // TODO: generalise.
-template<> int repack<block_q4_0,4,4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
     return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
 }

-template<> int repack<block_q4_0,8,4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
     return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
 }

-template<> int repack<block_q4_0,8,8>(struct ggml_tensor * t, const void * data, size_t data_size) {
+template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
     return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
 }

-template<> int repack<block_iq4_nl,4,4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
     return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
 }

-template<> int repack<block_iq4_nl,8,4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+template <> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
     return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
 }

 // gemv
-template<typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> void gemv(int, float *, size_t, const void *, const void *, int, int);
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
+void gemv(int, float *, size_t, const void *, const void *, int, int);

-template<> void gemv<block_q4_0,4,4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+template <> void gemv<block_q4_0, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }

-template<> void gemv<block_q4_0,8,4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+template <> void gemv<block_q4_0, 8, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }

-template<> void gemv<block_q4_0,8,8>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+template <> void gemv<block_q4_0, 8, 8>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }

-template<> void gemv<block_iq4_nl,4,4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+template <>
+void gemv<block_iq4_nl, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }

 // gemm
-template<typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> void gemm(int, float *, size_t, const void *, const void *, int, int);
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
+void gemm(int, float *, size_t, const void *, const void *, int, int);

-template<> void gemm<block_q4_0,4,4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+template <> void gemm<block_q4_0, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }

-template<> void gemm<block_q4_0,8,4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+template <> void gemm<block_q4_0, 8, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }

-template<> void gemm<block_q4_0,8,8>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+template <> void gemm<block_q4_0, 8, 8>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }

-template<> void gemm<block_iq4_nl,4,4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+template <>
+void gemm<block_iq4_nl, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }

 class tensor_traits_base : public ggml::cpu::tensor_traits {
   public:
-    virtual int repack (struct ggml_tensor *t, const void * data, size_t data_size) = 0;
+    virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
 };

-template<typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
-class tensor_traits : public tensor_traits_base {
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> class tensor_traits : public tensor_traits_base {
     bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
         // not realy a GGML_TYPE_Q8_0 but same size.
         size = ggml_row_size(GGML_TYPE_Q8_0, ggml_nelements(op->src[1]));
@@ -3891,24 +3909,21 @@ namespace ggml::cpu::aarch64 {
         GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
         // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);

-        char * wdata = static_cast<char*> (params->wdata);
+        char * wdata = static_cast<char *>(params->wdata);
         const size_t nbw1 = ggml_row_size(GGML_TYPE_Q8_0, ne10);

-        assert(params->wsize >= nbw1*ne11);
+        assert(params->wsize >= nbw1 * ne11);

-        ggml_from_float_t const from_float = ggml_get_type_traits_cpu(GGML_TYPE_Q8_0)->from_float;
+        const ggml_from_float_t from_float = ggml_get_type_traits_cpu(GGML_TYPE_Q8_0)->from_float;

         int64_t i11_processed = 0;
         for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
-            quantize_mat_q8_0((float *)((char *) src1->data + i11*nb11),
-                              (void *) (wdata + i11*nbw1),
-                              4, ne10, INTER_SIZE);
+            quantize_mat_q8_0((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10,
+                              INTER_SIZE);
         }
         i11_processed = ne11 - ne11 % 4;
         for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
-            from_float((float *)((char *) src1->data + i11*nb11),
-                       (void *) (wdata + i11*nbw1),
-                       ne10);
+            from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
         }

         ggml_barrier(params->threadpool);
@@ -3917,18 +3932,20 @@ namespace ggml::cpu::aarch64 {
         const size_t src1_col_stride = ggml_row_size(GGML_TYPE_Q8_0, ne10);

         int64_t src0_start = (ith * ne01) / nth;
         int64_t src0_end = ((ith + 1) * ne01) / nth;
-        src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS): src0_start;
-        src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS): src0_end;
-        if (src0_start >= src0_end) return true;
+        src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
+        src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
+        if (src0_start >= src0_end) {
+            return true;
+        }

         // If there are more than three rows in src1, use gemm; otherwise, use gemv.
         if (ne11 > 3) {
-            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS>(ne00, (float *)((char *) dst->data) + src0_start, ne01,
+            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS>(ne00, (float *) ((char *) dst->data) + src0_start, ne01,
                                                  (const char *) src0->data + src0_start * nb01,
                                                  (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
         }
         for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
-            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS>(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01,
+            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS>(ne00, (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
                                                  (const char *) src0->data + src0_start * nb01,
                                                  (const char *) src1_wdata + (src1_col_stride * iter), 1,
                                                  src0_end - src0_start);
@@ -3937,23 +3954,22 @@ namespace ggml::cpu::aarch64 {
         return true;
     }

-    int repack (struct ggml_tensor *t, const void * data, size_t data_size) override {
+    int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
         return ggml::cpu::aarch64::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
     }
 };

 // instance for Q4
-static const tensor_traits<block_q4_0,4,4> q4_0_4x4_q8_0;
-static const tensor_traits<block_q4_0,8,4> q4_0_4x8_q8_0;
-static const tensor_traits<block_q4_0,8,8> q4_0_8x8_q8_0;
+static const tensor_traits<block_q4_0, 4, 4> q4_0_4x4_q8_0;
+static const tensor_traits<block_q4_0, 8, 4> q4_0_4x8_q8_0;
+static const tensor_traits<block_q4_0, 8, 8> q4_0_8x8_q8_0;
 // instance for IQ4
-static const tensor_traits<block_iq4_nl,4,4> iq4_nl_4x4_q8_0;
-}
+static const tensor_traits<block_iq4_nl, 4, 4> iq4_nl_4x4_q8_0;
+}  // namespace ggml::cpu::aarch64

-static const ggml::cpu::tensor_traits* ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
+static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
     if (cur->type == GGML_TYPE_Q4_0) {
         // TODO: enable for AVX2 - currently disabled due to bad gemv performance
         if (/* ggml_cpu_has_avx2() || */ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
@@ -3975,16 +3991,17 @@ static const ggml::cpu::tensor_traits* ggml_aarch64_get_optimal_repack_type(cons
 }

 static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    tensor->extra = (void *)const_cast<ggml::cpu::tensor_traits*>(ggml_aarch64_get_optimal_repack_type(tensor));
+    tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_aarch64_get_optimal_repack_type(tensor));

     GGML_UNUSED(buffer);
 }

-static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                                       const void * data, size_t offset, size_t size) {
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));

-    auto tensor_traits = (ggml::cpu::aarch64::tensor_traits_base*)tensor->extra;
+    auto tensor_traits = (ggml::cpu::aarch64::tensor_traits_base *) tensor->extra;
     auto OK = tensor_traits->repack(tensor, data, size);

     GGML_ASSERT(OK == 0);
@@ -4017,16 +4034,11 @@ static size_t ggml_backend_cpu_aarch64_buffer_type_get_alignment(ggml_backend_bu
 }

 namespace ggml::cpu::aarch64 {
 class extra_buffer_type : ggml::cpu::extra_buffer_type {
-    bool supports_op(ggml_backend_dev_t , const struct ggml_tensor * op) override {
-        if ( op->op == GGML_OP_MUL_MAT &&
-            op->src[0]->buffer &&
-            (ggml_n_dims(op->src[0]) == 2) &&
+    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
+        if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer && (ggml_n_dims(op->src[0]) == 2) &&
             op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() &&
-            ggml_aarch64_get_optimal_repack_type(op->src[0])
-            )
-        {
+            ggml_aarch64_get_optimal_repack_type(op->src[0])) {
             if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                 return false;
             }
@@ -4041,20 +4053,16 @@ namespace ggml::cpu::aarch64 {
         return false;
     }

-    ggml::cpu::tensor_traits* get_tensor_traits(const struct ggml_tensor * op) override {
-        if ( op->op == GGML_OP_MUL_MAT &&
-            op->src[0]->buffer &&
-            op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()
-            )
-        {
-            return (ggml::cpu::tensor_traits*) op->src[0]->extra;
+    ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
+        if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
+            op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()) {
+            return (ggml::cpu::tensor_traits *) op->src[0]->extra;
         }
         return nullptr;
     }
 };
-}
+}  // namespace ggml::cpu::aarch64

 ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
     static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = {


@@ -1,9 +1,8 @@
 #pragma once

-#include "ggml.h"
 #include "ggml-cpu-traits.h"
+#include "ggml.h"

 // GGML internal header

 ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);


@@ -21,7 +21,8 @@ static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer
     hbw_free(buffer->context);
 }

-static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+                                                                           size_t size) {
     void * ptr;
     int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
     if (result != 0) {

@@ -42,14 +43,13 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
             /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
             /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
             /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
-            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+            /* .get_max_size = */ nullptr, // defaults to SIZE_MAX
+            /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
             /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
         },
-        /* .context = */ NULL,
+        /* .context = */ nullptr,
     };

     return &ggml_backend_cpu_buffer_type_hbm;
 }
 #endif


@@ -1,9 +1,8 @@
 #pragma once

-#include "ggml.h"
 #include "ggml-backend.h"
+#include "ggml.h"

 // GGML CPU internal header

 ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);


@@ -1,16 +1,18 @@
 #include "ggml-cpu-traits.h"

-#include "ggml-backend.h"
 #include "ggml-backend-impl.h"
+#include "ggml-backend.h"

 namespace ggml::cpu {

 tensor_traits::~tensor_traits() {}
-extra_buffer_type::~extra_buffer_type() {}
-}
+
+extra_buffer_type::~extra_buffer_type() {}
+}  // namespace ggml::cpu

 bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
     for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
         if (extra && extra->context) {
-            auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
+            auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
             if (tensor_traits && tensor_traits->compute_forward(params, op)) {
                 return true;

@@ -23,7 +25,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
 bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
     for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
         if (extra && extra->context) {
-            auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
+            auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
             if (tensor_traits && tensor_traits->work_size(n_threads, op, *size)) {
                 return true;


@@ -1,10 +1,10 @@
 #pragma once

-#include "ggml.h"
 #include "ggml-backend-impl.h"
 #include "ggml-cpu-impl.h"
+#include "ggml.h"

 #ifdef __cplusplus
-#include <vector>
+# include <vector>

 extern "C" {
 #endif

@@ -16,23 +16,23 @@ bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size
 }

 namespace ggml::cpu {
 // register in tensor->extra
 class tensor_traits {
   public:
     ~tensor_traits();
     virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0;
     virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
 };

 class extra_buffer_type {
   public:
     ~extra_buffer_type();
     virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
-    virtual tensor_traits* get_tensor_traits(const struct ggml_tensor * op) = 0;
+    virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0;
 };
 }  // namespace ggml::cpu

 // implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type();
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();

 #endif