sync : ggml (new ops, tests, backend, etc.) (#4359)

* sync : ggml (part 1)

* sync : ggml (part 2, CUDA)

* sync : ggml (part 3, Metal)

* ggml : build fixes

ggml-ci

* cuda : restore lost changes

* cuda : restore lost changes (StableLM rope)

* cmake : enable separable compilation for CUDA

ggml-ci

* ggml-cuda : remove device side dequantize

* Revert "cmake : enable separable compilation for CUDA"

This reverts commit 09e35d04b1.

* cuda : remove assert for rope

* tests : add test-backend-ops

* ggml : fix bug in ggml_concat

* ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()`

* ci : try to fix macOS

* ggml-backend : remove backend self-registration

* ci : disable Metal for macOS cmake build

ggml-ci

* metal : fix "supports family" call

* metal : fix assert

* metal : print resource path

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
This commit is contained in:
Georgi Gerganov 2023-12-07 22:26:54 +02:00 committed by GitHub
parent bcc0eb4591
commit fe680e3d10
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
20 changed files with 4637 additions and 1031 deletions

414
ggml.c
View file

@ -233,24 +233,6 @@ inline static void * ggml_aligned_malloc(size_t size) {
#define UNUSED GGML_UNUSED
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
//
// tensor access macros
//
#define GGML_TENSOR_UNARY_OP_LOCALS \
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
#define GGML_TENSOR_BINARY_OP_LOCALS \
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
@ -1613,6 +1595,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"GROUP_NORM",
"MUL_MAT",
"MUL_MAT_ID",
"OUT_PROD",
"SCALE",
@ -1640,6 +1623,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_1D",
"POOL_2D",
"UPSCALE",
"ARGSORT",
"FLASH_ATTN",
"FLASH_FF",
@ -1666,7 +1650,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS_BACK",
};
static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@ -1695,6 +1679,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"group_norm(x)",
"X*Y",
"X[i]*Y",
"X*Y",
"x*v",
@ -1722,6 +1707,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_1d(x)",
"pool_2d(x)",
"upscale(x)",
"argsort(x)",
"flash_attn(x)",
"flash_ff(x)",
@ -1748,10 +1734,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss_back(x,y)",
};
static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
"ABS",
"SGN",
"NEG",
"STEP",
"TANH",
"ELU",
"RELU",
"GELU",
"GELU_QUICK",
"SILU",
"LEAKY",
};
static_assert(GGML_UNARY_OP_COUNT == 11, "GGML_UNARY_OP_COUNT != 11");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@ -1771,6 +1775,7 @@ static void ggml_setup_op_has_task_pass(void) {
p[GGML_OP_ACC ] = true;
p[GGML_OP_MUL_MAT ] = true;
p[GGML_OP_MUL_MAT_ID ] = true;
p[GGML_OP_OUT_PROD ] = true;
p[GGML_OP_SET ] = true;
p[GGML_OP_GET_ROWS_BACK ] = true;
@ -2023,6 +2028,20 @@ const char * ggml_op_symbol(enum ggml_op op) {
return GGML_OP_SYMBOL[op];
}
const char * ggml_unary_op_name(enum ggml_unary_op op) {
return GGML_UNARY_OP_NAME[op];
}
const char * ggml_op_desc(const struct ggml_tensor * t) {
if (t->op == GGML_OP_UNARY) {
enum ggml_unary_op uop = ggml_get_unary_op(t);
return ggml_unary_op_name(uop);
}
else {
return ggml_op_name(t->op);
}
}
size_t ggml_element_size(const struct ggml_tensor * tensor) {
return ggml_type_size(tensor->type);
}
@ -3154,9 +3173,7 @@ static struct ggml_tensor * ggml_add_impl(
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
// TODO: support less-strict constraint
// GGML_ASSERT(ggml_can_repeat(b, a));
GGML_ASSERT(ggml_can_repeat_rows(b, a));
GGML_ASSERT(ggml_can_repeat(b, a));
bool is_node = false;
@ -3371,9 +3388,7 @@ static struct ggml_tensor * ggml_mul_impl(
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
// TODO: support less-strict constraint
// GGML_ASSERT(ggml_can_repeat(b, a));
GGML_ASSERT(ggml_can_repeat_rows(b, a));
GGML_ASSERT(ggml_can_repeat(b, a));
bool is_node = false;
@ -3418,7 +3433,7 @@ static struct ggml_tensor * ggml_div_impl(
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
GGML_ASSERT(ggml_are_same_shape(a, b));
GGML_ASSERT(ggml_can_repeat(b, a));
bool is_node = false;
@ -4056,6 +4071,49 @@ struct ggml_tensor * ggml_mul_mat(
return result;
}
// ggml_mul_mat_id
struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx,
struct ggml_tensor * as[],
struct ggml_tensor * ids,
int id,
struct ggml_tensor * b) {
int64_t n_as = ids->ne[0];
GGML_ASSERT(ids->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_is_vector(ids));
GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
GGML_ASSERT(id >= 0 && id < n_as);
bool is_node = false;
if (as[0]->grad || b->grad) {
is_node = true;
}
const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
ggml_set_op_params_i32(result, 0, id);
result->op = GGML_OP_MUL_MAT_ID;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = ids;
result->src[1] = b;
for (int64_t i = 0; i < n_as; i++) {
struct ggml_tensor * a = as[i];
GGML_ASSERT(ggml_are_same_shape(as[0], a));
GGML_ASSERT(ggml_can_mul_mat(a, b));
GGML_ASSERT(!ggml_is_transposed(a));
result->src[i + 2] = a;
}
return result;
}
// ggml_out_prod
struct ggml_tensor * ggml_out_prod(
@ -4209,7 +4267,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
struct ggml_tensor * b,
size_t nb1,
size_t offset) {
return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
}
// ggml_cpy
@ -5468,6 +5526,43 @@ struct ggml_tensor * ggml_upscale(
return ggml_upscale_impl(ctx, a, scale_factor);
}
// ggml_argsort
struct ggml_tensor * ggml_argsort(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_sort_order order) {
bool is_node = false;
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne);
ggml_set_op_params_i32(result, 0, (int32_t) order);
result->op = GGML_OP_ARGSORT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_top_k
struct ggml_tensor * ggml_top_k(
struct ggml_context * ctx,
struct ggml_tensor * a,
int k) {
GGML_ASSERT(a->ne[0] >= k);
struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
result = ggml_view_4d(ctx, result,
k, result->ne[1], result->ne[2], result->ne[3],
result->nb[1], result->nb[2], result->nb[3],
0);
return result;
}
// ggml_flash_attn
struct ggml_tensor * ggml_flash_attn(
@ -6827,7 +6922,7 @@ static void ggml_compute_forward_add_f32(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
@ -6860,16 +6955,19 @@ static void ggml_compute_forward_add_f32(
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
const int64_t nr0 = ne00 / ne10;
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
for (int64_t r = 0; r < nr0; ++r) {
#ifdef GGML_USE_ACCELERATE
vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
#else
ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
#endif
}
}
} else {
// src1 is not contiguous
@ -6886,8 +6984,9 @@ static void ggml_compute_forward_add_f32(
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
for (int i0 = 0; i0 < ne0; i0++) {
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
for (int64_t i0 = 0; i0 < ne0; ++i0) {
const int64_t i10 = i0 % ne10;
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
}
@ -7607,7 +7706,7 @@ static void ggml_compute_forward_mul_f32(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
@ -7630,7 +7729,6 @@ static void ggml_compute_forward_mul_f32(
GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float));
GGML_ASSERT(ne00 == ne10);
if (nb10 == sizeof(float)) {
for (int64_t ir = ith; ir < nr; ir += nth) {
@ -7642,20 +7740,21 @@ static void ggml_compute_forward_mul_f32(
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
const int64_t nr0 = ne00 / ne10;
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
for (int64_t r = 0 ; r < nr0; ++r) {
#ifdef GGML_USE_ACCELERATE
UNUSED(ggml_vec_mul_f32);
UNUSED(ggml_vec_mul_f32);
vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
#else
ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
#endif
// }
// }
}
}
} else {
// src1 is not contiguous
@ -7673,8 +7772,9 @@ static void ggml_compute_forward_mul_f32(
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
for (int64_t i0 = 0; i0 < ne00; i0++) {
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
for (int64_t i0 = 0; i0 < ne00; ++i0) {
const int64_t i10 = i0 % ne10;
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
}
@ -7708,14 +7808,16 @@ static void ggml_compute_forward_div_f32(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int nr = ggml_nrows(src0);
const int ith = params->ith;
const int nth = params->nth;
const int64_t nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS
@ -7723,41 +7825,50 @@ static void ggml_compute_forward_div_f32(
GGML_ASSERT(nb00 == sizeof(float));
if (nb10 == sizeof(float)) {
for (int ir = 0; ir < nr; ++ir) {
// src0, src1 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
for (int64_t ir = ith; ir < nr; ir += nth) {
// src0 and dst are same shape => same indices
const int64_t i03 = ir/(ne02*ne01);
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
const int64_t nr0 = ne00 / ne10;
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
for (int64_t r = 0; r < nr0; ++r) {
#ifdef GGML_USE_ACCELERATE
UNUSED(ggml_vec_div_f32);
UNUSED(ggml_vec_div_f32);
vDSP_vdiv(
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
ne0);
vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
#else
ggml_vec_div_f32(ne0,
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
#endif
// }
// }
}
}
} else {
// src1 is not contiguous
for (int ir = 0; ir < nr; ++ir) {
// src0, src1 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
for (int64_t ir = ith; ir < nr; ir += nth) {
// src0 and dst are same shape => same indices
// src1 is broadcastable across src0 and dst in i1, i2, i3
const int64_t i03 = ir/(ne02*ne01);
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
for (int i0 = 0; i0 < ne0; i0++) {
float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
for (int64_t i0 = 0; i0 < ne00; ++i0) {
const int64_t i10 = i0 % ne10;
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
}
@ -8203,7 +8314,7 @@ static void ggml_compute_forward_repeat_f16(
return;
}
GGML_TENSOR_UNARY_OP_LOCALS;
GGML_TENSOR_UNARY_OP_LOCALS
// guaranteed to be an integer due to the check in ggml_can_repeat
const int nr0 = (int)(ne0/ne00);
@ -8348,6 +8459,7 @@ static void ggml_compute_forward_concat_f32(
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_BINARY_OP_LOCALS
@ -8357,7 +8469,7 @@ static void ggml_compute_forward_concat_f32(
GGML_ASSERT(nb10 == sizeof(float));
for (int i3 = 0; i3 < ne3; i3++) {
for (int i2 = ith; i2 < ne2; i2++) {
for (int i2 = ith; i2 < ne2; i2 += nth) {
if (i2 < ne02) { // src0
for (int i1 = 0; i1 < ne1; i1++) {
for (int i0 = 0; i0 < ne0; i0++) {
@ -9517,6 +9629,8 @@ static void ggml_compute_forward_mul_mat(
char * wdata = params->wdata;
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
assert(params->wsize >= ne11*ne12*ne13*row_size);
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
@ -9618,6 +9732,26 @@ static void ggml_compute_forward_mul_mat(
}
}
// ggml_compute_forward_mul_mat_id
static void ggml_compute_forward_mul_mat_id(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * ids = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
const int id = ggml_get_op_params_i32(dst, 0);
const int a_id = ((int32_t *)ids->data)[id];
GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
const struct ggml_tensor * src0 = dst->src[a_id + 2];
ggml_compute_forward_mul_mat(params, src0, src1, dst);
}
// ggml_compute_forward_out_prod
static void ggml_compute_forward_out_prod_f32(
@ -12021,6 +12155,67 @@ static void ggml_compute_forward_upscale(
}
}
// ggml_compute_forward_argsort
static void ggml_compute_forward_argsort_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT(nb0 == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
const int64_t nr = ggml_nrows(src0);
enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
for (int64_t i = ith; i < nr; i += nth) {
int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
const float * src_data = (float *)((char *) src0->data + i*nb01);
for (int64_t j = 0; j < ne0; j++) {
dst_data[j] = j;
}
// C doesn't have a functional sort, so we do a bubble sort instead
for (int64_t j = 0; j < ne0; j++) {
for (int64_t k = j + 1; k < ne0; k++) {
if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
(order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
int32_t tmp = dst_data[j];
dst_data[j] = dst_data[k];
dst_data[k] = tmp;
}
}
}
}
}
static void ggml_compute_forward_argsort(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_argsort_f32(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_flash_attn
static void ggml_compute_forward_flash_attn_f32(
@ -13844,6 +14039,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
} break;
case GGML_OP_MUL_MAT_ID:
{
ggml_compute_forward_mul_mat_id(params, tensor);
} break;
case GGML_OP_OUT_PROD:
{
ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
@ -13948,6 +14147,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_upscale(params, tensor->src[0], tensor);
} break;
case GGML_OP_ARGSORT:
{
ggml_compute_forward_argsort(params, tensor->src[0], tensor);
} break;
case GGML_OP_FLASH_ATTN:
{
const int32_t t = ggml_get_op_params_i32(tensor, 0);
@ -14598,6 +14801,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
zero_table);
}
} break;
case GGML_OP_MUL_MAT_ID:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_OUT_PROD:
{
GGML_ASSERT(false); // TODO: not implemented
@ -14936,6 +15143,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_ARGSORT:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_FLASH_ATTN:
{
struct ggml_tensor * flash_grad = NULL;
@ -15296,12 +15507,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
}
struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
const size_t obj_size = sizeof(struct ggml_cgraph);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
*cgraph = (struct ggml_cgraph) {
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
struct ggml_cgraph cgraph = {
/*.size =*/ 0,
/*.n_nodes =*/ i1 - i0,
/*.n_leafs =*/ 0,
@ -15536,7 +15743,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
n_tasks = n_threads;
} break;
case GGML_OP_SUB:
case GGML_OP_DIV:
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_LOG:
@ -15569,10 +15775,13 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
{
n_tasks = n_threads;
} break;
default:
GGML_ASSERT(false);
}
break;
case GGML_OP_SILU_BACK:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_RMS_NORM_BACK:
@ -15610,6 +15819,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
}
#endif
} break;
case GGML_OP_MUL_MAT_ID:
{
// FIXME: blas
n_tasks = n_threads;
} break;
case GGML_OP_OUT_PROD:
{
n_tasks = n_threads;
@ -15669,6 +15883,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
{
n_tasks = n_threads;
} break;
case GGML_OP_ARGSORT:
{
n_tasks = n_threads;
} break;
case GGML_OP_FLASH_ATTN:
{
n_tasks = n_threads;
@ -15731,6 +15949,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
{
n_tasks = 1;
} break;
case GGML_OP_COUNT:
{
GGML_ASSERT(false);
} break;
default:
{
fprintf(stderr, "%s: op not implemented: ", __func__);
@ -15927,6 +16149,23 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
}
} break;
case GGML_OP_MUL_MAT_ID:
{
const struct ggml_tensor * a = node->src[2];
const struct ggml_tensor * b = node->src[1];
const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
if (a->type != GGML_TYPE_F32) {
// here we need memory just for single 2D matrix from src0
cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
}
} else
#endif
if (b->type != vec_dot_type) {
cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type);
}
} break;
case GGML_OP_OUT_PROD:
{
if (ggml_is_quantized(node->src[0]->type)) {
@ -15962,9 +16201,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
GGML_ASSERT(false);
}
} break;
case GGML_OP_IM2COL:
{
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
const int64_t ne00 = node->src[0]->ne[0]; // W
@ -17803,8 +18039,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
memcpy(&qh, &y[i].qh, sizeof(qh));
for (int j = 0; j < QK5_0; j += 2) {
const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
// cast to 16 bins
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
@ -17833,8 +18069,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
memcpy(&qh, &y[i].qh, sizeof(qh));
for (int j = 0; j < QK5_1; j += 2) {
const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
// cast to 16 bins
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
@ -18024,6 +18260,7 @@ struct gguf_kv {
struct gguf_header {
char magic[4];
uint32_t version;
uint64_t n_tensors; // GGUFv2
uint64_t n_kv; // GGUFv2
@ -18113,7 +18350,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
for (uint32_t i = 0; i < sizeof(magic); i++) {
if (magic[i] != GGUF_MAGIC[i]) {
fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
fclose(file);
return NULL;
}
@ -18128,7 +18365,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
{
strncpy(ctx->header.magic, magic, 4);
ctx->kv = NULL;
ctx->infos = NULL;
ctx->data = NULL;