sync : ggml (SD ops, tests, kernels) (#4444)

* sync : ggml (SD ops, tests, kernels)

ggml-ci

* cuda : restore im2col

ggml-ci

* metal : fix accuracy of dequantization kernels

ggml-ci

* cuda : restore correct im2col

ggml-ci

* metal : try to fix moe test by reducing expert size

ggml-ci

* cuda : fix bin bcast when src1 and dst have different types

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
Commit: 4d98d9a656 (parent 70f806b821)
Author: Georgi Gerganov (committed by GitHub)
Date:   2023-12-13 21:54:54 +02:00
6 changed files with 1334 additions and 130 deletions

ggml.c

@@ -1395,7 +1395,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_leaky_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.1f*x[i]; }
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
static const float GELU_COEF_A = 0.044715f;
static const float GELU_QUICK_COEF = -1.702f;
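
The old ggml_vec_leaky_f32 hard-coded a 0.1 negative slope; the new ggml_vec_leaky_relu_f32 takes the slope as a parameter. A minimal standalone sketch of the same element-wise formula (illustrative only, not ggml code; the helper name is made up for the example):

#include <stdio.h>

// y[i] = x[i] for x[i] > 0, ns*x[i] otherwise
static void leaky_relu_f32(int n, float * y, const float * x, float ns) {
    for (int i = 0; i < n; ++i) {
        y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.f) ? x[i] : 0.f);
    }
}

int main(void) {
    const float x[4] = { -2.f, -0.5f, 0.f, 3.f };
    float y[4];
    leaky_relu_f32(4, y, x, 0.1f);
    for (int i = 0; i < 4; ++i) {
        printf("%.2f -> %.2f\n", x[i], y[i]); // -2.00 -> -0.20, -0.50 -> -0.05, 0.00 -> 0.00, 3.00 -> 3.00
    }
    return 0;
}
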
@@ -1623,7 +1623,9 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_1D",
"POOL_2D",
"UPSCALE",
"PAD",
"ARGSORT",
"LEAKY_RELU",
"FLASH_ATTN",
"FLASH_FF",
@@ -1650,7 +1652,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS_BACK",
};
static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1707,7 +1709,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_1d(x)",
"pool_2d(x)",
"upscale(x)",
"pad(x)",
"argsort(x)",
"leaky_relu(x)",
"flash_attn(x)",
"flash_ff(x)",
@@ -1734,7 +1738,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss_back(x,y)",
};
static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -1750,10 +1754,9 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
"GELU",
"GELU_QUICK",
"SILU",
"LEAKY",
};
static_assert(GGML_UNARY_OP_COUNT == 11, "GGML_UNARY_OP_COUNT != 11");
static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -3830,12 +3833,25 @@ struct ggml_tensor * ggml_relu_inplace(
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
}
// ggml_leaky
// ggml_leaky_relu
struct ggml_tensor * ggml_leaky(
struct ggml_tensor * ggml_leaky_relu(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_LEAKY);
struct ggml_tensor * a, float negative_slope, bool inplace) {
bool is_node = false;
if (!inplace && (a->grad)) {
is_node = true;
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
result->op = GGML_OP_LEAKY_RELU;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_gelu
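
A minimal usage sketch of the new graph-level ggml_leaky_relu API (the function name example_leaky_relu and the buffer size are illustrative; assumes ggml.h from this revision and the CPU backend):

#include "ggml.h"

void example_leaky_relu(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_set_f32(a, -1.0f);

    // negative_slope = 0.1f, inplace = false; the slope is stored in op_params
    struct ggml_tensor * b = ggml_leaky_relu(ctx, a, 0.1f, false);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, b);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);
    // every element of b is now -0.1f

    ggml_free(ctx);
}
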
@@ -4022,8 +4038,9 @@ static struct ggml_tensor * ggml_group_norm_impl(
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
result->op = GGML_OP_GROUP_NORM;
result->op_params[0] = n_groups;
result->op = GGML_OP_GROUP_NORM;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = NULL; // TODO: maybe store epsilon here?
@@ -5523,6 +5540,30 @@ static struct ggml_tensor * ggml_upscale_impl(
return result;
}
struct ggml_tensor * ggml_pad(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0, int p1, int p2, int p3) {
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
a->ne[0] + p0,
a->ne[1] + p1,
a->ne[2] + p2,
a->ne[3] + p3);
result->op = GGML_OP_PAD;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
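
ggml_pad appends p0..p3 zero elements at the end of the corresponding dimensions, so the result shape is (ne0 + p0, ne1 + p1, ne2 + p2, ne3 + p3). A short usage sketch (assumes a valid ggml_context * ctx, e.g. set up as in the sketch above):

// pad a 3x2 F32 tensor to 5x4 by appending zeros along dims 0 and 1
struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 2);
struct ggml_tensor * p = ggml_pad(ctx, a, 2, 2, 0, 0);
// p->ne == { 5, 4, 1, 1 }; the extra elements are zero-filled by
// ggml_compute_forward_pad_f32 when the graph is computed
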
struct ggml_tensor * ggml_upscale(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -7718,8 +7759,10 @@ static void ggml_compute_forward_mul_f32(
const int ith = params->ith;
const int nth = params->nth;
// TODO: OpenCL kernel support broadcast
#ifdef GGML_USE_CLBLAST
if (src1->backend == GGML_BACKEND_GPU) {
GGML_ASSERT(ggml_are_same_shape(src0, src1));
if (ith == 0) {
ggml_cl_mul(src0, src1, dst);
}
@@ -8985,10 +9028,9 @@ static void ggml_compute_forward_silu(
} break;
}
}
// ggml_compute_forward_leaky_relu
// ggml_compute_forward_leaky
static void ggml_compute_forward_leaky_f32(
static void ggml_compute_forward_leaky_relu_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
@@ -9002,24 +9044,27 @@ static void ggml_compute_forward_leaky_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
float negative_slope;
memcpy(&negative_slope, dst->op_params, sizeof(float));
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_leaky_f32(nc,
ggml_vec_leaky_relu_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
(float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
}
}
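
The kernel recovers the slope from dst->op_params with memcpy rather than a pointer cast, which avoids strict-aliasing issues when reinterpreting the int32-backed parameter blob as a float. A standalone mimic of that round-trip (the 64-byte size is illustrative; ggml sizes the blob with GGML_MAX_OP_PARAMS):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    int32_t op_params[64 / sizeof(int32_t)] = {0};

    // graph construction: pack the float parameter into the blob
    float negative_slope = 0.1f;
    memcpy(op_params, &negative_slope, sizeof(negative_slope));

    // compute time: recover it
    float ns;
    memcpy(&ns, op_params, sizeof(float));
    printf("negative_slope = %f\n", ns); // 0.100000
    return 0;
}
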
static void ggml_compute_forward_leaky(
static void ggml_compute_forward_leaky_relu(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_leaky_f32(params, src0, dst);
ggml_compute_forward_leaky_relu_f32(params, src0, dst);
} break;
default:
{
@@ -12158,6 +12203,7 @@ static void ggml_compute_forward_upscale_f32(
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
@@ -12165,16 +12211,17 @@ static void ggml_compute_forward_upscale_f32(
// TODO: optimize
for (int i03 = 0; i03 < ne03; i03++) {
for (int i02 = ith; i02 < ne02; i02++) {
for (int m = 0; m < dst->ne[1]; m++) {
int i01 = m / scale_factor;
for (int n = 0; n < dst->ne[0]; n++) {
int i00 = n / scale_factor;
for (int64_t i3 = 0; i3 < ne3; i3++) {
const int64_t i03 = i3;
for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
const int64_t i02 = i2;
for (int64_t i1 = 0; i1 < ne1; i1++) {
const int64_t i01 = i1 / scale_factor;
for (int64_t i0 = 0; i0 < ne0; i0++) {
const int64_t i00 = i0 / scale_factor;
const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + i02 * nb02 + i03 * nb03);
float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]);
const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
*y = *x;
}
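
The rewritten loop walks the destination tensor and maps each output index back to the source with integer division by scale_factor (nearest-neighbor), splitting work across threads by striding i2 from ith in steps of nth. A standalone sketch of the index mapping along one row (illustrative only):

#include <stdio.h>

int main(void) {
    const int scale_factor = 2;
    const float src[3] = { 1.f, 2.f, 3.f };
    float dst[6];

    for (int i0 = 0; i0 < 6; ++i0) {
        const int i00 = i0 / scale_factor; // each source element is repeated scale_factor times
        dst[i0] = src[i00];
    }
    for (int i0 = 0; i0 < 6; ++i0) {
        printf("%.0f ", dst[i0]); // prints: 1 1 2 2 3 3
    }
    printf("\n");
    return 0;
}
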
@@ -12199,6 +12246,64 @@ static void ggml_compute_forward_upscale(
}
}
// ggml_compute_forward_pad
static void ggml_compute_forward_pad_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
GGML_ASSERT(src0->nb[0] == sizeof(float));
GGML_ASSERT( dst->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
float * dst_ptr = (float *) dst->data;
// TODO: optimize
for (int64_t i2 = 0; i2 < ne2; ++i2) {
for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
for (int64_t i0 = 0; i0 < ne0; ++i0) {
for (int64_t i3 = 0; i3 < ne3; ++i3) {
const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
dst_ptr[dst_idx] = *src_ptr;
} else {
dst_ptr[dst_idx] = 0;
}
}
}
}
}
}
static void ggml_compute_forward_pad(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_pad_f32(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
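
In ggml_compute_forward_pad_f32 the destination is assumed contiguous (its index is computed directly from ne0..ne3) while the source is addressed through its byte strides nb00..nb03, and any index outside the source extent is written as zero. A standalone 2-D sketch of that logic (illustrative only):

#include <stdio.h>

int main(void) {
    const int ne00 = 2, ne01 = 2;           // source is 2x2
    const int ne0  = 4, ne1  = 3;           // destination is 4x3, i.e. p0 = 2, p1 = 1
    const float src[2][2] = { {1, 2}, {3, 4} };
    float dst[3][4];

    for (int i1 = 0; i1 < ne1; ++i1) {
        for (int i0 = 0; i0 < ne0; ++i0) {
            dst[i1][i0] = (i0 < ne00 && i1 < ne01) ? src[i1][i0] : 0.f;
        }
    }

    for (int i1 = 0; i1 < ne1; ++i1) {
        for (int i0 = 0; i0 < ne0; ++i0) {
            printf("%.0f ", dst[i1][i0]);
        }
        printf("\n"); // prints: 1 2 0 0 / 3 4 0 0 / 0 0 0 0
    }
    return 0;
}
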
// ggml_compute_forward_argsort
static void ggml_compute_forward_argsort_f32(
@@ -13406,10 +13511,6 @@ static void ggml_compute_forward_unary(
{
ggml_compute_forward_silu(params, src0, dst);
} break;
case GGML_UNARY_OP_LEAKY:
{
ggml_compute_forward_leaky(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
@@ -14191,10 +14292,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_upscale(params, tensor->src[0], tensor);
} break;
case GGML_OP_PAD:
{
ggml_compute_forward_pad(params, tensor->src[0], tensor);
} break;
case GGML_OP_ARGSORT:
{
ggml_compute_forward_argsort(params, tensor->src[0], tensor);
} break;
case GGML_OP_LEAKY_RELU:
{
ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor);
} break;
case GGML_OP_FLASH_ATTN:
{
const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -15187,10 +15296,18 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_PAD:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_ARGSORT:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_LEAKY_RELU:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_FLASH_ATTN:
{
struct ggml_tensor * flash_grad = NULL;
@@ -15796,6 +15913,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_ARGMAX:
case GGML_OP_REPEAT:
case GGML_OP_REPEAT_BACK:
case GGML_OP_LEAKY_RELU:
{
n_tasks = 1;
} break;
@@ -15808,7 +15926,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_ELU:
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_LEAKY:
{
n_tasks = 1;
} break;
@@ -15927,6 +16044,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
{
n_tasks = n_threads;
} break;
case GGML_OP_PAD:
{
n_tasks = n_threads;
} break;
case GGML_OP_ARGSORT:
{
n_tasks = n_threads;
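
Ops that get n_tasks = n_threads here (such as GGML_OP_PAD and GGML_OP_ARGSORT) split their work inside the kernel by having thread ith process indices ith, ith + nth, ith + 2*nth, and so on. A standalone sketch of that striding pattern (the helper name is made up; ggml dispatches these calls to worker threads rather than running them in a loop):

#include <stdio.h>

static void process_rows(int ith, int nth, int nrows) {
    for (int i1 = ith; i1 < nrows; i1 += nth) {
        printf("thread %d handles row %d\n", ith, i1);
    }
}

int main(void) {
    const int nth = 2, nrows = 5;
    for (int ith = 0; ith < nth; ++ith) {
        process_rows(ith, nth, nrows);
    }
    return 0;
}
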