clang-format
commit 95322e93bf (parent 98ea414f81)

7 changed files with 334 additions and 331 deletions
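The pass below is purely mechanical reformatting. For orientation, here is a minimal sketch of a .clang-format fragment that would produce the kinds of changes visible in this diff (120-column wrapping, 4-space indent, spaces around `*` in pointer declarations, braces inserted on single-statement ifs, a space after `template`, namespace-closing comments, sorted includes). This is an illustrative assumption inferred from the diff, not the repository's actual configuration, which may set these and other keys differently:

    # hypothetical .clang-format fragment, inferred from the changes below
    ColumnLimit: 120
    IndentWidth: 4
    PointerAlignment: Middle
    SpaceAfterTemplateKeyword: true
    InsertBraces: true          # requires clang-format 15 or newer
    FixNamespaceComments: true
    SortIncludes: true
    AllowShortIfStatementsOnASingleLine: false

Such a pass is typically applied with `clang-format -i` over the touched files or via `git clang-format`; the exact invocation used for this commit is not recorded here.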
@@ -21,7 +21,6 @@
 // AMX type_trais
 namespace ggml::cpu::amx {
 class tensor_traits : public ggml::cpu::tensor_traits {

 bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
 size = ggml_backend_amx_desired_wsize(op);
 return true;

@@ -40,7 +39,7 @@ namespace ggml::cpu::amx {
 static tensor_traits traits;
 return &traits;
 }
-}
+} // namespace ggml::cpu::amx

 // AMX buffer interface
 static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {

@@ -57,13 +56,15 @@ static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, st
 GGML_UNUSED(buffer);
 }

-static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                                  uint8_t value, size_t offset, size_t size) {
 memset((char *) tensor->data + offset, value, size);

 GGML_UNUSED(buffer);
 }

-static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                               const void * data, size_t offset, size_t size) {
 if (qtype_has_amx_kernels(tensor->type)) {
 ggml_backend_amx_convert_weight(tensor, data, offset, size);
 } else {

@@ -143,15 +144,11 @@ namespace ggml::cpu::amx {
 return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
 };

-if ( op->op == GGML_OP_MUL_MAT &&
-is_contiguous_2d(op->src[0]) && // src0 must be contiguous
+if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous
 is_contiguous_2d(op->src[1]) && // src1 must be contiguous
-op->src[0]->buffer &&
-op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
+op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
 op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
-(qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))
-)
-{
+(qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
 // src1 must be host buffer
 if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
 return false;

@@ -165,18 +162,15 @@ namespace ggml::cpu::amx {
 }

 ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
-if ( op->op == GGML_OP_MUL_MAT &&
-op->src[0]->buffer &&
-op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()
-)
-{
+if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
+    op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) {
 return (ggml::cpu::tensor_traits *) op->src[0]->extra;
 }

 return nullptr;
 }
 };
-}
+} // namespace ggml::cpu::amx

 static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
 return ggml_backend_amx_get_alloc_size(tensor);

@@ -200,13 +194,14 @@ static bool ggml_amx_init() {
 return true;
 #endif
 }

 ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
 static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
 /* .iface = */ {
 /* .get_name = */ ggml_backend_amx_buffer_type_get_name,
 /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
 /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
-/* .get_max_size = */ NULL, // defaults to SIZE_MAX
+/* .get_max_size = */ nullptr, // defaults to SIZE_MAX
 /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
 /* .is_host = */ nullptr,
 },

@@ -215,7 +210,7 @@ ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
 };

 if (!ggml_amx_init()) {
-return NULL;
+return nullptr;
 }

 return &ggml_backend_buffer_type_amx;

@@ -20,16 +20,20 @@

 // TODO: move to include file?
 template <int K> constexpr int QK_0() {
-if constexpr (K==4) return QK4_0;
-if constexpr (K==8) return QK8_0;
+if constexpr (K == 4) {
+    return QK4_0;
+}
+if constexpr (K == 8) {
+    return QK8_0;
+}
 return -1;
 }

-template<int K, int N>
-struct block {
+template <int K, int N> struct block {
 ggml_half d[N]; // deltas for N qK_0 blocks
 int8_t qs[(QK_0<K>() * N * K) / 8]; // quants for N qK_0 blocks
 };

 // control size
 static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
 static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");

@@ -45,6 +49,7 @@ struct block_iq4_nlx4 {
 ggml_half d[4]; // deltas for 4 iq4_nl blocks
 uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks
 };

 static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");

 #if defined(__GNUC__)

@@ -3800,54 +3805,69 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b

 namespace ggml::cpu::aarch64 {
 // repack
-template<typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> int repack(struct ggml_tensor *, const void *, size_t);
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
+int repack(struct ggml_tensor *, const void *, size_t);

 // TODO: generalise.
 template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
 return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
 }

 template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
 return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
 }

 template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
 return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
 }

 template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
 return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
 }

 template <> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
 return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
 }

 // gemv
-template<typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> void gemv(int, float *, size_t, const void *, const void *, int, int);
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
+void gemv(int, float *, size_t, const void *, const void *, int, int);

 template <> void gemv<block_q4_0, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
 ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }

 template <> void gemv<block_q4_0, 8, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
 ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }

 template <> void gemv<block_q4_0, 8, 8>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
 ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }

-template<> void gemv<block_iq4_nl,4,4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+template <>
+void gemv<block_iq4_nl, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
 ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }

 // gemm
-template<typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> void gemm(int, float *, size_t, const void *, const void *, int, int);
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
+void gemm(int, float *, size_t, const void *, const void *, int, int);

 template <> void gemm<block_q4_0, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
 ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }

 template <> void gemm<block_q4_0, 8, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
 ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }

 template <> void gemm<block_q4_0, 8, 8>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
 ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }

-template<> void gemm<block_iq4_nl,4,4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+template <>
+void gemm<block_iq4_nl, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
 ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }

@@ -3856,9 +3876,7 @@ namespace ggml::cpu::aarch64 {
 virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
 };

-template<typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
-class tensor_traits : public tensor_traits_base {
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> class tensor_traits : public tensor_traits_base {

 bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
 // not realy a GGML_TYPE_Q8_0 but same size.
 size = ggml_row_size(GGML_TYPE_Q8_0, ggml_nelements(op->src[1]));

@@ -3896,19 +3914,16 @@ namespace ggml::cpu::aarch64 {

 assert(params->wsize >= nbw1 * ne11);

-ggml_from_float_t const from_float = ggml_get_type_traits_cpu(GGML_TYPE_Q8_0)->from_float;
+const ggml_from_float_t from_float = ggml_get_type_traits_cpu(GGML_TYPE_Q8_0)->from_float;

 int64_t i11_processed = 0;
 for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
-quantize_mat_q8_0((float *)((char *) src1->data + i11*nb11),
-                  (void *) (wdata + i11*nbw1),
-                  4, ne10, INTER_SIZE);
+quantize_mat_q8_0((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10,
+                  INTER_SIZE);
 }
 i11_processed = ne11 - ne11 % 4;
 for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
-from_float((float *)((char *) src1->data + i11*nb11),
-           (void *) (wdata + i11*nbw1),
-           ne10);
+from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
 }

 ggml_barrier(params->threadpool);

@@ -3919,7 +3934,9 @@ namespace ggml::cpu::aarch64 {
 int64_t src0_end = ((ith + 1) * ne01) / nth;
 src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
 src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
-if (src0_start >= src0_end) return true;
+if (src0_start >= src0_end) {
+    return true;
+}

 // If there are more than three rows in src1, use gemm; otherwise, use gemv.
 if (ne11 > 3) {

@@ -3940,7 +3957,6 @@ namespace ggml::cpu::aarch64 {
 int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
 return ggml::cpu::aarch64::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
 }

 };

 // instance for Q4

@@ -3951,7 +3967,7 @@ namespace ggml::cpu::aarch64 {
 // instance for IQ4
 static const tensor_traits<block_iq4_nl, 4, 4> iq4_nl_4x4_q8_0;

-}
+} // namespace ggml::cpu::aarch64

 static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
 if (cur->type == GGML_TYPE_Q4_0) {

@@ -3980,7 +3996,8 @@ static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t bu
 GGML_UNUSED(buffer);
 }

-static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+                                                       const void * data, size_t offset, size_t size) {
 GGML_ASSERT(offset == 0);
 GGML_ASSERT(size == ggml_nbytes(tensor));

@@ -4018,15 +4035,10 @@ static size_t ggml_backend_cpu_aarch64_buffer_type_get_alignment(ggml_backend_bu

 namespace ggml::cpu::aarch64 {
 class extra_buffer_type : ggml::cpu::extra_buffer_type {

 bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
-if ( op->op == GGML_OP_MUL_MAT &&
-op->src[0]->buffer &&
-(ggml_n_dims(op->src[0]) == 2) &&
+if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer && (ggml_n_dims(op->src[0]) == 2) &&
 op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() &&
-ggml_aarch64_get_optimal_repack_type(op->src[0])
-)
-{
+ggml_aarch64_get_optimal_repack_type(op->src[0])) {
 if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
 return false;
 }

@@ -4042,19 +4054,15 @@ namespace ggml::cpu::aarch64 {
 }

 ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
-if ( op->op == GGML_OP_MUL_MAT &&
-op->src[0]->buffer &&
-op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()
-)
-{
+if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
+    op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()) {
 return (ggml::cpu::tensor_traits *) op->src[0]->extra;
 }

 return nullptr;
 }

 };
-}
+} // namespace ggml::cpu::aarch64

 ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
 static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = {

@@ -1,9 +1,8 @@
 #pragma once

-#include "ggml.h"
 #include "ggml-cpu-traits.h"
+#include "ggml.h"

 // GGML internal header

 ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);

@@ -21,7 +21,8 @@ static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer
 hbw_free(buffer->context);
 }

-static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+                                                                           size_t size) {
 void * ptr;
 int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
 if (result != 0) {

@@ -42,14 +43,13 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
 /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
 /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
 /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
-/* .get_max_size = */ NULL, // defaults to SIZE_MAX
-/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+/* .get_max_size = */ nullptr, // defaults to SIZE_MAX
+/* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
 /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
 },
-/* .context = */ NULL,
+/* .context = */ nullptr,
 };

 return &ggml_backend_cpu_buffer_type_hbm;
 }
 #endif

@@ -1,9 +1,8 @@
 #pragma once

-#include "ggml.h"
 #include "ggml-backend.h"
+#include "ggml.h"

 // GGML CPU internal header

 ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);

@@ -1,11 +1,13 @@
 #include "ggml-cpu-traits.h"
-#include "ggml-backend.h"
 #include "ggml-backend-impl.h"
+#include "ggml-backend.h"

 namespace ggml::cpu {
 tensor_traits::~tensor_traits() {}

 extra_buffer_type::~extra_buffer_type() {}
-}
+} // namespace ggml::cpu

 bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
 for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {

@@ -1,7 +1,7 @@
 #pragma once
-#include "ggml.h"
 #include "ggml-backend-impl.h"
 #include "ggml-cpu-impl.h"
+#include "ggml.h"

 #ifdef __cplusplus
 # include <vector>