Optimize model to leverage inplace operations to avoid creating new tensors

Howard Su 2023-03-31 22:42:09 +08:00
parent 02c5b27e91
commit bcf363cb53
2 changed files with 35 additions and 7 deletions
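
In short: a regular ggml op allocates a fresh result tensor inside the ggml_context, while the _inplace variants exposed here return a result that reuses the first operand's buffer, so evaluating the graph writes over that operand instead of allocating new memory. A minimal usage sketch against the signatures declared below (the tensor names are illustrative, not taken from the commit):

// regular op: a new result tensor is allocated in ctx; `a` keeps its values
struct ggml_tensor * sum  = ggml_add(ctx, a, b);

// inplace variant: the result shares a's buffer, so the sum is written
// straight into `a` at compute time and no extra tensor data is allocated
struct ggml_tensor * sum2 = ggml_add_inplace(ctx, a, b);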

ggml.h (28 additions, 0 deletions)

@@ -409,11 +409,18 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 struct ggml_tensor * ggml_dup(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_dup_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);

 struct ggml_tensor * ggml_add(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b);
+struct ggml_tensor * ggml_add_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b);

 struct ggml_tensor * ggml_sub(
         struct ggml_context * ctx,

@@ -424,6 +431,10 @@ struct ggml_tensor * ggml_mul(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b);
+struct ggml_tensor * ggml_mul_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b);

 struct ggml_tensor * ggml_div(
         struct ggml_context * ctx,

@@ -484,16 +495,25 @@ struct ggml_tensor * ggml_gelu(
 struct ggml_tensor * ggml_silu(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_silu_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);

 // normalize along rows
 // TODO: eps is hardcoded to 1e-5 for now
 struct ggml_tensor * ggml_norm(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_norm_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);

 struct ggml_tensor * ggml_rms_norm(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor* ggml_rms_norm_inplace(
+        struct ggml_context* ctx,
+        struct ggml_tensor* a);

 // A: m rows, n columns
 // B: p rows, n columns (i.e. we transpose it internally)

@@ -512,12 +532,20 @@ struct ggml_tensor * ggml_scale(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b);
+struct ggml_tensor * ggml_scale_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b);

 // a -> b, return view(b)
 struct ggml_tensor * ggml_cpy(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b);
+struct ggml_tensor * ggml_cpy_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b);

 // return view(a), b specifies the new shape
 // TODO: when we start computing gradient, make a copy instead of view
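
Note that ggml.c does not appear in this commit: the inplace implementations already live there, and the header is only catching up so that llama.cpp can call them. For orientation, the pattern in ggml.c looks roughly like the sketch below (the ggml_add_impl helper and its exact body are recalled from the implementation, not shown in this diff): both public entry points forward to a shared function, and the inplace path makes the result a view of the first operand instead of a duplicate.

// Sketch of the existing ggml.c pattern (assumed, not part of this commit):
static struct ggml_tensor * ggml_add_impl(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b,
        bool inplace) {
    // inplace: the result is a view over a's data, so computing this node
    // writes the sum into a's buffer; otherwise a fresh tensor of the same
    // shape is allocated in ctx.
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a)
                                          : ggml_dup_tensor(ctx, a);
    result->op   = GGML_OP_ADD;
    result->src0 = a;
    result->src1 = b;
    return result;
}

struct ggml_tensor * ggml_add(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
    return ggml_add_impl(ctx, a, b, false);
}

struct ggml_tensor * ggml_add_inplace(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
    return ggml_add_impl(ctx, a, b, true);
}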

llama.cpp (7 additions, 7 deletions)

@@ -827,9 +827,9 @@ static bool llama_eval_internal(
         struct ggml_tensor * Q =
             ggml_permute(ctx0,
                     ggml_rope(ctx0,
-                        ggml_cpy(ctx0,
+                        ggml_reshape_3d(ctx0,
                             Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
+                            n_embd/n_head, n_head, N),
                         n_past, n_rot, 0),
                     0, 2, 1, 3);
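
This Q hunk is the one change that is not an _inplace swap: instead of allocating a brand-new F32 tensor and copying Qcur into it just to give it a [n_embd/n_head, n_head, N] shape, the new code asks for a reshaped view over Qcur's existing (contiguous) data, so no allocation and no copy kernel are needed. Roughly (an illustrative restatement of the two forms, not extra code from the commit):

// old: allocate + copy, only to change the logical shape
// q = ggml_cpy(ctx0, Qcur, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N));
// new: same data, new dimensions, no extra work at compute time
// q = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N);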
@@ -848,7 +848,7 @@ static bool llama_eval_internal(

             // KQ_scaled = KQ / sqrt(n_embd/n_head)
             struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
+                ggml_scale_inplace(ctx0,
                     KQ,
                     ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));

@@ -887,7 +887,7 @@ static bool llama_eval_internal(

         lctx.use_buf(ctx0, 1);

-        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA);

         // feed-forward network
         {
@@ -910,16 +910,16 @@ static bool llama_eval_internal(
                     cur);

             // SILU activation
-            cur = ggml_silu(ctx0, cur);
+            cur = ggml_silu_inplace(ctx0, cur);

-            cur = ggml_mul(ctx0, cur, tmp);
+            cur = ggml_mul_inplace(ctx0, cur, tmp);

             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w2,
                     cur);
         }

-        cur = ggml_add(ctx0, cur, inpFF);
+        cur = ggml_add_inplace(ctx0, cur, inpFF);

         // input for next layer
         inpL = cur;
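
A closing caveat for readers auditing these call sites: an _inplace op clobbers its first operand, so it is only safe when no later node in the graph still needs that operand's original values, which holds for every tensor rewritten above since each is a per-layer intermediate that nothing reads afterwards. Illustrative, with generic tensor names:

// fine: this is the only consumer of the old `cur`, so overwriting its
// buffer simply saves one tensor allocation per layer
cur = ggml_silu_inplace(ctx0, cur);

// risky: if `a` were read again later in the graph, writing the product into
// a's buffer here would silently corrupt that later node's input; in such a
// case the regular ggml_mul(ctx0, a, b) has to stay
struct ggml_tensor * prod = ggml_mul_inplace(ctx0, a, b);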