Optimize the model to use in-place operations and avoid creating new tensors
This commit is contained in:
parent
02c5b27e91
commit
bcf363cb53
2 changed files with 35 additions and 7 deletions
28
ggml.h
28
ggml.h
|
@ -409,11 +409,18 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
||||||
struct ggml_tensor * ggml_dup(
|
struct ggml_tensor * ggml_dup(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a);
|
struct ggml_tensor * a);
|
||||||
|
struct ggml_tensor * ggml_dup_inplace(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
struct ggml_tensor * ggml_add(
|
struct ggml_tensor * ggml_add(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b);
|
||||||
|
struct ggml_tensor * ggml_add_inplace(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b);
|
||||||
|
|
||||||
struct ggml_tensor * ggml_sub(
|
struct ggml_tensor * ggml_sub(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
|
@ -424,6 +431,10 @@ struct ggml_tensor * ggml_mul(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b);
|
||||||
|
struct ggml_tensor * ggml_mul_inplace(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b);
|
||||||
|
|
||||||
struct ggml_tensor * ggml_div(
|
struct ggml_tensor * ggml_div(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
|
@ -484,16 +495,25 @@ struct ggml_tensor * ggml_gelu(
|
||||||
struct ggml_tensor * ggml_silu(
|
struct ggml_tensor * ggml_silu(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a);
|
struct ggml_tensor * a);
|
||||||
|
struct ggml_tensor * ggml_silu_inplace(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
// normalize along rows
|
// normalize along rows
|
||||||
// TODO: eps is hardcoded to 1e-5 for now
|
// TODO: eps is hardcoded to 1e-5 for now
|
||||||
struct ggml_tensor * ggml_norm(
|
struct ggml_tensor * ggml_norm(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a);
|
struct ggml_tensor * a);
|
||||||
|
struct ggml_tensor * ggml_norm_inplace(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
struct ggml_tensor * ggml_rms_norm(
|
struct ggml_tensor * ggml_rms_norm(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a);
|
struct ggml_tensor * a);
|
||||||
|
struct ggml_tensor* ggml_rms_norm_inplace(
|
||||||
|
struct ggml_context* ctx,
|
||||||
|
struct ggml_tensor* a);
|
||||||
|
|
||||||
// A: m rows, n columns
|
// A: m rows, n columns
|
||||||
// B: p rows, n columns (i.e. we transpose it internally)
|
// B: p rows, n columns (i.e. we transpose it internally)
|
||||||
|
@ -512,12 +532,20 @@ struct ggml_tensor * ggml_scale(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b);
|
||||||
|
struct ggml_tensor * ggml_scale_inplace(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b);
|
||||||
|
|
||||||
// a -> b, return view(b)
|
// a -> b, return view(b)
|
||||||
struct ggml_tensor * ggml_cpy(
|
struct ggml_tensor * ggml_cpy(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b);
|
||||||
|
struct ggml_tensor * ggml_cpy_inplace(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b);
|
||||||
|
|
||||||
// return view(a), b specifies the new shape
|
// return view(a), b specifies the new shape
|
||||||
// TODO: when we start computing gradient, make a copy instead of view
|
// TODO: when we start computing gradient, make a copy instead of view
|
||||||
|
|
14
llama.cpp
14
llama.cpp
|
@ -827,9 +827,9 @@ static bool llama_eval_internal(
|
||||||
struct ggml_tensor * Q =
|
struct ggml_tensor * Q =
|
||||||
ggml_permute(ctx0,
|
ggml_permute(ctx0,
|
||||||
ggml_rope(ctx0,
|
ggml_rope(ctx0,
|
||||||
ggml_cpy(ctx0,
|
ggml_reshape_3d(ctx0,
|
||||||
Qcur,
|
Qcur,
|
||||||
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
|
n_embd/n_head, n_head, N),
|
||||||
n_past, n_rot, 0),
|
n_past, n_rot, 0),
|
||||||
0, 2, 1, 3);
|
0, 2, 1, 3);
|
||||||
|
|
||||||
|
@ -848,7 +848,7 @@ static bool llama_eval_internal(
|
||||||
|
|
||||||
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
||||||
struct ggml_tensor * KQ_scaled =
|
struct ggml_tensor * KQ_scaled =
|
||||||
ggml_scale(ctx0,
|
ggml_scale_inplace(ctx0,
|
||||||
KQ,
|
KQ,
|
||||||
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
|
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
|
||||||
|
|
||||||
|
@ -887,7 +887,7 @@ static bool llama_eval_internal(
|
||||||
|
|
||||||
lctx.use_buf(ctx0, 1);
|
lctx.use_buf(ctx0, 1);
|
||||||
|
|
||||||
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
|
struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA);
|
||||||
|
|
||||||
// feed-forward network
|
// feed-forward network
|
||||||
{
|
{
|
||||||
|
@ -910,16 +910,16 @@ static bool llama_eval_internal(
|
||||||
cur);
|
cur);
|
||||||
|
|
||||||
// SILU activation
|
// SILU activation
|
||||||
cur = ggml_silu(ctx0, cur);
|
cur = ggml_silu_inplace(ctx0, cur);
|
||||||
|
|
||||||
cur = ggml_mul(ctx0, cur, tmp);
|
cur = ggml_mul_inplace(ctx0, cur, tmp);
|
||||||
|
|
||||||
cur = ggml_mul_mat(ctx0,
|
cur = ggml_mul_mat(ctx0,
|
||||||
model.layers[il].w2,
|
model.layers[il].w2,
|
||||||
cur);
|
cur);
|
||||||
}
|
}
|
||||||
|
|
||||||
cur = ggml_add(ctx0, cur, inpFF);
|
cur = ggml_add_inplace(ctx0, cur, inpFF);
|
||||||
|
|
||||||
// input for next layer
|
// input for next layer
|
||||||
inpL = cur;
|
inpL = cur;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue