diff --git a/ggml.h b/ggml.h
index f7791ed11..5f5542d0f 100644
--- a/ggml.h
+++ b/ggml.h
@@ -409,11 +409,18 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 struct ggml_tensor * ggml_dup(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_dup_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);
 
 struct ggml_tensor * ggml_add(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b);
+struct ggml_tensor * ggml_add_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b);
 
 struct ggml_tensor * ggml_sub(
         struct ggml_context * ctx,
@@ -424,6 +431,10 @@ struct ggml_tensor * ggml_mul(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b);
+struct ggml_tensor * ggml_mul_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b);
 
 struct ggml_tensor * ggml_div(
         struct ggml_context * ctx,
@@ -484,16 +495,25 @@ struct ggml_tensor * ggml_gelu(
 struct ggml_tensor * ggml_silu(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_silu_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);
 
 // normalize along rows
 // TODO: eps is hardcoded to 1e-5 for now
 struct ggml_tensor * ggml_norm(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_norm_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);
 
 struct ggml_tensor * ggml_rms_norm(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
+struct ggml_tensor * ggml_rms_norm_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);
 
 // A: m rows, n columns
 // B: p rows, n columns (i.e. we transpose it internally)
@@ -512,12 +532,20 @@ struct ggml_tensor * ggml_scale(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b);
+struct ggml_tensor * ggml_scale_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b);
 
 // a -> b, return view(b)
 struct ggml_tensor * ggml_cpy(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b);
+struct ggml_tensor * ggml_cpy_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b);
 
 // return view(a), b specifies the new shape
 // TODO: when we start computing gradient, make a copy instead of view
diff --git a/llama.cpp b/llama.cpp
index bed24207d..1eabed13d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -827,9 +827,9 @@ static bool llama_eval_internal(
         struct ggml_tensor * Q =
             ggml_permute(ctx0,
                     ggml_rope(ctx0,
-                        ggml_cpy(ctx0,
+                        ggml_reshape_3d(ctx0,
                             Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
+                            n_embd/n_head, n_head, N),
                         n_past, n_rot, 0),
                     0, 2, 1, 3);
 
@@ -848,7 +848,7 @@ static bool llama_eval_internal(
 
         // KQ_scaled = KQ / sqrt(n_embd/n_head)
         struct ggml_tensor * KQ_scaled =
-            ggml_scale(ctx0,
+            ggml_scale_inplace(ctx0,
                 KQ,
                 ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
 
@@ -887,7 +887,7 @@ static bool llama_eval_internal(
 
         lctx.use_buf(ctx0, 1);
 
-        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA);
 
         // feed-forward network
         {
@@ -910,16 +910,16 @@ static bool llama_eval_internal(
                     cur);
 
             // SILU activation
-            cur = ggml_silu(ctx0, cur);
+            cur = ggml_silu_inplace(ctx0, cur);
 
-            cur = ggml_mul(ctx0, cur, tmp);
+            cur = ggml_mul_inplace(ctx0, cur, tmp);
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w2,
                     cur);
         }
 
-        cur = ggml_add(ctx0, cur, inpFF);
+        cur = ggml_add_inplace(ctx0, cur, inpFF);
 
         // input for next layer
         inpL = cur;
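
For context, the semantics these new functions rely on: each *_inplace variant returns a view of its first argument and writes its result into that tensor's existing data buffer, instead of allocating a fresh result tensor in the context the way the plain op does. The snippet below is an illustrative sketch only, not part of this patch; it assumes the graph API of this vintage (ggml_init, ggml_used_mem, ggml_build_forward, ggml_graph_compute) and an arbitrary arena size.

#include <stdio.h>
#include "ggml.h"

int main(void) {
    // Small arena for this sketch; real callers size this to their graph.
    struct ggml_init_params params = {
        /* .mem_size   = */ 16*1024*1024,
        /* .mem_buffer = */ NULL,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024), 1.0f);
    struct ggml_tensor * b = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024), 2.0f);

    size_t before = ggml_used_mem(ctx);

    // ggml_add would allocate a new 1024-float result tensor in ctx;
    // ggml_add_inplace only allocates a small view that aliases a's data.
    struct ggml_tensor * c = ggml_add_inplace(ctx, a, b);

    printf("extra context memory: %zu bytes\n", ggml_used_mem(ctx) - before);

    struct ggml_cgraph gf = ggml_build_forward(c);
    ggml_graph_compute(ctx, &gf);

    // c aliases a, so a's buffer now holds the sum as well
    printf("c[0] = %f, a[0] = %f\n", ggml_get_f32_1d(c, 0), ggml_get_f32_1d(a, 0));

    ggml_free(ctx);
    return 0;
}

This is presumably why the llama.cpp hunks above can swap ggml_add, ggml_silu, ggml_mul, and ggml_scale for their *_inplace counterparts: the overwritten activations are not read again later in the layer, so reusing their buffers trims per-token scratch usage. The ggml_cpy to ggml_reshape_3d change is in the same spirit: Qcur's data is already laid out contiguously, so a reshape view suffices and the copy into a freshly allocated F32 tensor can be dropped.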