Optimize model to leverage in-place ops and avoid creating new tensors

Howard Su 2023-03-31 22:42:09 +08:00
parent 02c5b27e91
commit bcf363cb53
2 changed files with 35 additions and 7 deletions
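For context: each *_inplace entry point added below builds the same graph node as its plain counterpart but writes the result into the first operand's buffer instead of allocating a destination tensor. A minimal sketch of the usual ggml pattern for such pairs, assuming a shared ggml_add_impl helper with an inplace flag (the helper name and field details are assumptions, not code from this commit):

    static struct ggml_tensor * ggml_add_impl(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
            struct ggml_tensor * b,
            bool inplace) {
        // In-place: the result is a view sharing a's data, so no new buffer
        // is allocated. Otherwise: allocate a fresh tensor with a's shape.
        struct ggml_tensor * result = inplace
            ? ggml_view_tensor(ctx, a)
            : ggml_dup_tensor(ctx, a);

        result->op   = GGML_OP_ADD;
        result->src0 = a;
        result->src1 = b;

        return result;
    }

    struct ggml_tensor * ggml_add(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
            struct ggml_tensor * b) {
        return ggml_add_impl(ctx, a, b, false);
    }

    struct ggml_tensor * ggml_add_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
            struct ggml_tensor * b) {
        return ggml_add_impl(ctx, a, b, true);
    }

Because the in-place node aliases a's memory, it is only safe when no later node in the graph still reads a's original contents; the call-site changes below are chosen under that constraint.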

ggml.h (28 additions, 0 deletions)

@@ -409,11 +409,18 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 struct ggml_tensor * ggml_dup(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_dup_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);

 struct ggml_tensor * ggml_add(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b);
+struct ggml_tensor * ggml_add_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b);

 struct ggml_tensor * ggml_sub(
         struct ggml_context * ctx,
@@ -424,6 +431,10 @@ struct ggml_tensor * ggml_mul(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b);
+struct ggml_tensor * ggml_mul_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b);

 struct ggml_tensor * ggml_div(
         struct ggml_context * ctx,
@@ -484,16 +495,25 @@ struct ggml_tensor * ggml_gelu(
 struct ggml_tensor * ggml_silu(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_silu_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);

 // normalize along rows
 // TODO: eps is hardcoded to 1e-5 for now
 struct ggml_tensor * ggml_norm(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_norm_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);

 struct ggml_tensor * ggml_rms_norm(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_rms_norm_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);

 // A: m rows, n columns
 // B: p rows, n columns (i.e. we transpose it internally)
@@ -512,12 +532,20 @@ struct ggml_tensor * ggml_scale(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b);
+struct ggml_tensor * ggml_scale_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b);

 // a -> b, return view(b)
 struct ggml_tensor * ggml_cpy(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b);
+struct ggml_tensor * ggml_cpy_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b);

 // return view(a), b specifies the new shape
 // TODO: when we start computing gradient, make a copy instead of view
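At a call site the in-place variant is a drop-in replacement whenever the first operand is not read again. An illustrative comparison (not code from this commit; variable names borrowed from llama.cpp below):

    // Allocating form: the sum lands in a newly created tensor.
    struct ggml_tensor * t0 = ggml_add(ctx0, cur, inpSA);

    // In-place form: the sum overwrites cur's buffer. Safe only if no other
    // graph node still needs cur's pre-add values.
    struct ggml_tensor * t1 = ggml_add_inplace(ctx0, cur, inpSA);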

llama.cpp (7 additions, 7 deletions)

@@ -827,9 +827,9 @@ static bool llama_eval_internal(
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
                         ggml_rope(ctx0,
-                            ggml_cpy(ctx0,
+                            ggml_reshape_3d(ctx0,
                                 Qcur,
-                                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
+                                n_embd/n_head, n_head, N),
                             n_past, n_rot, 0),
                         0, 2, 1, 3);
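Note that the Q change above is not an *_inplace substitution: ggml_reshape_3d returns a zero-copy view of Qcur with the new dimensions, whereas the old code allocated a fresh F32 tensor and copied Qcur into it with ggml_cpy. Since Qcur is already contiguous F32 data with the same element count, both the allocation and the copy were pure overhead. Side by side (a sketch of the two graph constructions, with illustrative variable names):

    // Before: allocate a new (n_embd/n_head, n_head, N) tensor, then copy into it.
    struct ggml_tensor * q_old = ggml_cpy(ctx0, Qcur,
            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N));

    // After: a view of Qcur's existing data with the new shape; no copy.
    struct ggml_tensor * q_new = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N);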
@@ -848,7 +848,7 @@ static bool llama_eval_internal(
             // KQ_scaled = KQ / sqrt(n_embd/n_head)
             struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
+                ggml_scale_inplace(ctx0,
                     KQ,
                     ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
@@ -887,7 +887,7 @@ static bool llama_eval_internal(
         lctx.use_buf(ctx0, 1);

-        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA);

         // feed-forward network
         {
@@ -910,16 +910,16 @@ static bool llama_eval_internal(
                     cur);

             // SILU activation
-            cur = ggml_silu(ctx0, cur);
+            cur = ggml_silu_inplace(ctx0, cur);

-            cur = ggml_mul(ctx0, cur, tmp);
+            cur = ggml_mul_inplace(ctx0, cur, tmp);

             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w2,
                     cur);
         }

-        cur = ggml_add(ctx0, cur, inpFF);
+        cur = ggml_add_inplace(ctx0, cur, inpFF);

         // input for next layer
         inpL = cur;
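The feed-forward hunk applies the same rule throughout: an op may run in place only when its first operand is never read again, and cur flows linearly through the activation, the gating multiply, and the residual add. Only ggml_mul_mat still allocates, since its output shape differs from its input's. Condensed, the rewritten chain is:

    cur = ggml_silu_inplace(ctx0, cur);       // cur = SiLU(cur), written into cur's buffer
    cur = ggml_mul_inplace(ctx0, cur, tmp);   // elementwise gating, also in place
    cur = ggml_mul_mat(ctx0,
            model.layers[il].w2, cur);        // projection: new tensor (shape changes)
    cur = ggml_add_inplace(ctx0, cur, inpFF); // residual add, in place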