sync : ggml (SD ops, tests, kernels) (#4444)
* sync : ggml (SD ops, tests, kernels) ggml-ci
* cuda : restore im2col ggml-ci
* metal : fix accuracy of dequantization kernels ggml-ci
* cuda : restore correct im2col ggml-ci
* metal : try to fix moe test by reducing expert size ggml-ci
* cuda : fix bin bcast when src1 and dst have different types ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
commit 4d98d9a656 (parent 70f806b821)
6 changed files with 1334 additions and 130 deletions

ggml.c (183 changed lines):
@@ -1395,7 +1395,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
-inline static void ggml_vec_leaky_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.1f*x[i]; }
+inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }

 static const float GELU_COEF_A = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
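
The renamed vector kernel takes the negative slope as an argument instead of the hard-coded 0.1f of the old ggml_vec_leaky_f32; with ns = 0.1f it reproduces the old behaviour. A self-contained sketch of the same formula (the helper name and sample values are illustrative, not part of the commit):

#include <stdio.h>

// Same formula as ggml_vec_leaky_relu_f32: keep positives, scale negatives by ns.
static void leaky_relu_f32(const int n, float * y, const float * x, const float ns) {
    for (int i = 0; i < n; ++i) {
        y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f);
    }
}

int main(void) {
    const float x[4] = { -2.0f, -0.5f, 0.0f, 3.0f };
    float y[4];
    leaky_relu_f32(4, y, x, 0.1f);     // ns = 0.1f matches the old ggml_vec_leaky_f32
    for (int i = 0; i < 4; ++i) {
        printf("%g ", y[i]);           // prints: -0.2 -0.05 0 3
    }
    printf("\n");
    return 0;
}
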
@@ -1623,7 +1623,9 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "POOL_1D",
     "POOL_2D",
     "UPSCALE",
+    "PAD",
     "ARGSORT",
+    "LEAKY_RELU",

     "FLASH_ATTN",
     "FLASH_FF",
@@ -1650,7 +1652,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };

-static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1707,7 +1709,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "pool_1d(x)",
     "pool_2d(x)",
     "upscale(x)",
+    "pad(x)",
     "argsort(x)",
+    "leaky_relu(x)",

     "flash_attn(x)",
     "flash_ff(x)",
@@ -1734,7 +1738,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };

-static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -1750,10 +1754,9 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "GELU",
     "GELU_QUICK",
     "SILU",
-    "LEAKY",
 };

-static_assert(GGML_UNARY_OP_COUNT == 11, "GGML_UNARY_OP_COUNT != 11");
+static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");


 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
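
The name and symbol tables above have to stay in lockstep with the op enums, which is what the updated static_asserts enforce: adding GGML_OP_PAD and GGML_OP_LEAKY_RELU raises GGML_OP_COUNT from 70 to 72, and dropping GGML_UNARY_OP_LEAKY lowers GGML_UNARY_OP_COUNT from 11 to 10. A minimal illustration of the same table-plus-assert pattern (the enum and names below are made up for the example, not part of ggml):

#include <assert.h>  // static_assert (C11)

enum demo_op { DEMO_OP_NONE, DEMO_OP_PAD, DEMO_OP_LEAKY_RELU, DEMO_OP_COUNT };

static const char * DEMO_OP_NAME[DEMO_OP_COUNT] = { "NONE", "PAD", "LEAKY_RELU" };

// Compilation fails if the enum grows without the table (and this check) being updated.
static_assert(DEMO_OP_COUNT == 3, "DEMO_OP_COUNT != 3");
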
@@ -3830,12 +3833,25 @@ struct ggml_tensor * ggml_relu_inplace(
     return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
 }

-// ggml_leaky
+// ggml_leaky_relu

-struct ggml_tensor * ggml_leaky(
+struct ggml_tensor * ggml_leaky_relu(
         struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_LEAKY);
+        struct ggml_tensor * a, float negative_slope, bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
+
+    result->op = GGML_OP_LEAKY_RELU;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
 }

 // ggml_gelu
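
For API users, leaky ReLU changes from a parameter-less unary op to ggml_leaky_relu(ctx, a, negative_slope, inplace) with a configurable slope. A minimal usage sketch, assuming the usual ggml context setup; the memory size and tensor length are illustrative:

struct ggml_init_params ip = {
    /*.mem_size   =*/ 16*1024*1024,
    /*.mem_buffer =*/ NULL,
    /*.no_alloc   =*/ false,
};
struct ggml_context * ctx = ggml_init(ip);

struct ggml_tensor * a   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
struct ggml_tensor * out = ggml_leaky_relu(ctx, a, 0.1f, /*inplace =*/ false);
// out->op is GGML_OP_LEAKY_RELU and the slope is carried in out->op_params.
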
@@ -4022,8 +4038,9 @@ static struct ggml_tensor * ggml_group_norm_impl(

     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

-    result->op = GGML_OP_GROUP_NORM;
-    result->op_params[0] = n_groups;
+    result->op_params[0] = n_groups;
+
+    result->op = GGML_OP_GROUP_NORM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = NULL; // TODO: maybe store epsilon here?
@@ -5523,6 +5540,30 @@ static struct ggml_tensor * ggml_upscale_impl(
     return result;
 }

+struct ggml_tensor * ggml_pad(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int p0, int p1, int p2, int p3) {
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0] + p0,
+            a->ne[1] + p1,
+            a->ne[2] + p2,
+            a->ne[3] + p3);
+
+    result->op = GGML_OP_PAD;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
 struct ggml_tensor * ggml_upscale(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
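
The new ggml_pad op appends p0..p3 elements to dims 0..3, so the result shape is a->ne[i] + p[i] (per the ggml_new_tensor_4d call above), and the CPU kernel added later in this diff zero-fills the new region. A small usage sketch, reusing a ctx like the one in the previous example; the shape is illustrative:

struct ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 4, 3, 2, 1);
struct ggml_tensor * p = ggml_pad(ctx, x, /*p0 =*/ 2, /*p1 =*/ 1, /*p2 =*/ 0, /*p3 =*/ 0);
// p->ne is { 6, 4, 2, 1 }; elements outside the original 4x3x2x1 block come out as 0.0f.
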
@@ -7718,8 +7759,10 @@ static void ggml_compute_forward_mul_f32(
     const int ith = params->ith;
     const int nth = params->nth;

+    // TODO: OpenCL kernel support broadcast
 #ifdef GGML_USE_CLBLAST
     if (src1->backend == GGML_BACKEND_GPU) {
+        GGML_ASSERT(ggml_are_same_shape(src0, src1));
         if (ith == 0) {
             ggml_cl_mul(src0, src1, dst);
         }
@@ -8985,10 +9028,9 @@ static void ggml_compute_forward_silu(
             } break;
     }
 }
+// ggml_compute_forward_leaky_relu

-// ggml_compute_forward_leaky
-
-static void ggml_compute_forward_leaky_f32(
+static void ggml_compute_forward_leaky_relu_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
|
@ -9002,24 +9044,27 @@ static void ggml_compute_forward_leaky_f32(
|
|||
const int n = ggml_nrows(src0);
|
||||
const int nc = src0->ne[0];
|
||||
|
||||
float negative_slope;
|
||||
memcpy(&negative_slope, dst->op_params, sizeof(float));
|
||||
|
||||
assert(dst->nb[0] == sizeof(float));
|
||||
assert(src0->nb[0] == sizeof(float));
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
ggml_vec_leaky_f32(nc,
|
||||
ggml_vec_leaky_relu_f32(nc,
|
||||
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
||||
(float *) ((char *) src0->data + i*(src0->nb[1])));
|
||||
(float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_leaky(
|
||||
static void ggml_compute_forward_leaky_relu(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
struct ggml_tensor * dst) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_leaky_f32(params, src0, dst);
|
||||
ggml_compute_forward_leaky_relu_f32(params, src0, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
|
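
Note how the slope reaches the kernel: ggml_leaky_relu packs the float into the tensor's op_params with ggml_set_op_params, and the forward pass copies it back out with memcpy rather than dereferencing a cast pointer, which keeps the read alignment-safe. A standalone sketch of that round-trip (the 16-byte buffer merely stands in for op_params and is illustrative):

#include <stdio.h>
#include <string.h>

// Stand-in for a tensor's op_params byte buffer.
static char op_params[16];

int main(void) {
    // Writer side (ggml_leaky_relu): pack the float into op_params.
    float slope_in = 0.1f;
    memcpy(op_params, &slope_in, sizeof(slope_in));

    // Reader side (ggml_compute_forward_leaky_relu_f32): copy it back out.
    float slope_out;
    memcpy(&slope_out, op_params, sizeof(float));

    printf("%g\n", slope_out); // prints 0.1
    return 0;
}
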
@@ -12158,6 +12203,7 @@ static void ggml_compute_forward_upscale_f32(
     GGML_ASSERT(src0->nb[0] == sizeof(float));

     const int ith = params->ith;
+    const int nth = params->nth;

     GGML_TENSOR_UNARY_OP_LOCALS

@@ -12165,16 +12211,17 @@ static void ggml_compute_forward_upscale_f32(

     // TODO: optimize

-    for (int i03 = 0; i03 < ne03; i03++) {
-        for (int i02 = ith; i02 < ne02; i02++) {
-            for (int m = 0; m < dst->ne[1]; m++) {
-                int i01 = m / scale_factor;
-                for (int n = 0; n < dst->ne[0]; n++) {
-                    int i00 = n / scale_factor;
-
-                    const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + i02 * nb02 + i03 * nb03);
-
-                    float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]);
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        const int64_t i03 = i3;
+        for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+            const int64_t i02 = i2;
+            for (int64_t i1 = 0; i1 < ne1; i1++) {
+                const int64_t i01 = i1 / scale_factor;
+                for (int64_t i0 = 0; i0 < ne0; i0++) {
+                    const int64_t i00 = i0 / scale_factor;
+
+                    const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                    float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);

                     *y = *x;
                 }
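
The rewritten upscale loop walks destination coordinates, splits work across threads on the third dimension (i2 runs from ith in steps of nth), and finds the nearest-neighbour source element by integer division with scale_factor. A scalar sketch of just the row mapping (sizes and scale are illustrative):

#include <stdio.h>

int main(void) {
    const int ne01 = 3, ne1 = 6;           // source rows, destination rows
    const int scale_factor = ne1 / ne01;   // 2x nearest-neighbour upscale
    for (int i1 = 0; i1 < ne1; i1++) {
        const int i01 = i1 / scale_factor; // same mapping as the new upscale loop
        printf("dst row %d <- src row %d\n", i1, i01);
    }
    return 0;
}
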
@@ -12199,6 +12246,64 @@ static void ggml_compute_forward_upscale(
     }
 }

+// ggml_compute_forward_pad
+
+static void ggml_compute_forward_pad_f32(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+          struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    float * dst_ptr = (float *) dst->data;
+
+    // TODO: optimize
+
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                for (int64_t i3 = 0; i3 < ne3; ++i3) {
+                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+
+                    const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        dst_ptr[dst_idx] = *src_ptr;
+                    } else {
+                        dst_ptr[dst_idx] = 0;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_pad(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_pad_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_argsort

 static void ggml_compute_forward_argsort_f32(
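
Because the padded output tensor is newly allocated and contiguous, the kernel writes it through a flat index, dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0, while the source is read through its byte strides nb00..nb03, and any coordinate past the source extent gets 0. A worked example of the flat index (the shape is illustrative):

#include <stdio.h>

int main(void) {
    // Destination shape after a pad: ne0 = 6, ne1 = 4, ne2 = 2, ne3 = 1 (illustrative).
    const long ne0 = 6, ne1 = 4, ne2 = 2;
    const long i0 = 5, i1 = 3, i2 = 1, i3 = 0;
    const long dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
    printf("dst_idx = %ld\n", dst_idx); // 0*48 + 1*24 + 3*6 + 5 = 47, the last element
    return 0;
}
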
@@ -13406,10 +13511,6 @@ static void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_silu(params, src0, dst);
             } break;
-        case GGML_UNARY_OP_LEAKY:
-            {
-                ggml_compute_forward_leaky(params, src0, dst);
-            } break;
         default:
             {
                 GGML_ASSERT(false);
@@ -14191,10 +14292,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_upscale(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_PAD:
+            {
+                ggml_compute_forward_pad(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_ARGSORT:
             {
                 ggml_compute_forward_argsort(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_LEAKY_RELU:
+            {
+                ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -15187,10 +15296,18 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_PAD:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_ARGSORT:
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_LEAKY_RELU:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 struct ggml_tensor * flash_grad = NULL;
@@ -15796,6 +15913,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_ARGMAX:
         case GGML_OP_REPEAT:
         case GGML_OP_REPEAT_BACK:
+        case GGML_OP_LEAKY_RELU:
             {
                 n_tasks = 1;
             } break;
@@ -15808,7 +15926,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 case GGML_UNARY_OP_TANH:
                 case GGML_UNARY_OP_ELU:
                 case GGML_UNARY_OP_RELU:
-                case GGML_UNARY_OP_LEAKY:
                     {
                         n_tasks = 1;
                     } break;
@@ -15927,6 +16044,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = n_threads;
             } break;
+        case GGML_OP_PAD:
+            {
+                n_tasks = n_threads;
+            } break;
         case GGML_OP_ARGSORT:
             {
                 n_tasks = n_threads;