implement 5 of 6 missing backward pass operations used by llama
- GGML_OP_DIAG_MASK_INF - GGML_OP_GET_ROWS - GGML_OP_RMS_NORM - GGML_OP_SILU - GGML_OP_SOFT_MAX add necessary ggml operations GGML_OP_ADD1, GGML_OP_SILU_BACK, GGML_OP_RMS_NORM_BACK, GGML_OP_DIAG_MASK_ZERO, and GGML_OP_ROPE_BACK GGML_OP_ADD1 is necessary to add a scalar value in the backward pass of GGML_OP_SOFT_MAX GGML_OP_ADD1 could also be replaced by using GGML_OP_ADD and GGML_OP_REPEAT, but the performance would be worse. additionally GGML_OP_REPEAT will return unexpected value when the the input to GGML_OP_SOFT_MAX contains only a single scalar. in this case GGML_OP_REPEAT will not return the value that should be repeated (src1) but the value which shape the result should take (src0). So in this case it can not replace GGML_OP_ADD1. GGML_OP_SILU_BACK, GGML_OP_RMS_NORM_BACK and GGML_OP_ROPE_BACK are necessary for backward pass of GGML_OP_SILU, GGML_OP_RMS_NORM and GGML_OP_ROPE. The backward pass for these functions cannot be easily composed of existing operations. Since the backward pass builds a computation graph we need operations forward pass implementations of the the required backward passes. Sounds a bit confusing at first, I know... GGML_OP_DIAG_MASK_ZERO is necessary for backward pass of GGML_OP_DIAG_MASK_INF. Some operations where previously inplace-only. for backward pass there needs to be non-inplace variants. staying consistent with other operations that have non-inplace and inplace variants, the operations are changed to non-inplace and functions with "_inplace" are added which are inplace. in llama we need to call the inplace variants so that it is implemented as before. for llama backward pass we need to use the non-inplace variants. still not completely implemented backward passes for llama: - GGML_OP_ROPE: needs forward pass for GGML_OP_ROPE_BACK - GGML_OP_GET_ROWS: only necessary for tokenizer
This commit is contained in:
parent
73ac18d856
commit
b164343529
3 changed files with 1149 additions and 53 deletions
68
ggml.h
68
ggml.h
|
@ -252,6 +252,7 @@ extern "C" {
|
|||
|
||||
GGML_OP_DUP,
|
||||
GGML_OP_ADD,
|
||||
GGML_OP_ADD1
|
||||
GGML_OP_ADD_AT,
|
||||
GGML_OP_SUB,
|
||||
GGML_OP_MUL,
|
||||
|
@ -268,8 +269,10 @@ extern "C" {
|
|||
GGML_OP_RELU,
|
||||
GGML_OP_GELU,
|
||||
GGML_OP_SILU,
|
||||
GGML_OP_SILU_BACK,
|
||||
GGML_OP_NORM, // normalize
|
||||
GGML_OP_RMS_NORM,
|
||||
GGML_OP_RMS_NORM_BACK,
|
||||
|
||||
GGML_OP_MUL_MAT,
|
||||
|
||||
|
@ -282,8 +285,10 @@ extern "C" {
|
|||
GGML_OP_TRANSPOSE,
|
||||
GGML_OP_GET_ROWS,
|
||||
GGML_OP_DIAG_MASK_INF,
|
||||
GGML_OP_DIAG_MASK_ZERO,
|
||||
GGML_OP_SOFT_MAX,
|
||||
GGML_OP_ROPE,
|
||||
GGML_OP_ROPE_BACK,
|
||||
GGML_OP_ALIBI,
|
||||
GGML_OP_CONV_1D_1S,
|
||||
GGML_OP_CONV_1D_2S,
|
||||
|
@ -481,6 +486,11 @@ extern "C" {
|
|||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_add1(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_add_at(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
@ -563,6 +573,11 @@ extern "C" {
|
|||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
struct ggml_tensor * ggml_silu_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * x,
|
||||
struct ggml_tensor * dy);
|
||||
|
||||
// normalize along rows
|
||||
// TODO: eps is hardcoded to 1e-5 for now
|
||||
GGML_API struct ggml_tensor * ggml_norm(
|
||||
|
@ -573,6 +588,11 @@ extern "C" {
|
|||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * x,
|
||||
struct ggml_tensor * dy);
|
||||
|
||||
// A: m rows, n columns
|
||||
// B: p rows, n columns (i.e. we transpose it internally)
|
||||
// result is m columns, p rows
|
||||
|
@ -585,11 +605,16 @@ extern "C" {
|
|||
// operations on tensors without backpropagation
|
||||
//
|
||||
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_scale(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_scale_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// a -> b, return view(b)
|
||||
GGML_API struct ggml_tensor * ggml_cpy(
|
||||
|
@ -670,19 +695,39 @@ extern "C" {
|
|||
struct ggml_tensor * b);
|
||||
|
||||
// set elements above the diagonal to -INF
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_diag_mask_inf(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past);
|
||||
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past);
|
||||
|
||||
// set elements above the diagonal to 0
|
||||
GGML_API struct ggml_tensor * ggml_diag_mask_zero(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past);
|
||||
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_soft_max(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// rotary position embedding
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_soft_max_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// rotary position embedding
|
||||
// if mode & 1 == 1, skip n_past elements
|
||||
// if mode & 2 == 1, GPT-NeoX style
|
||||
// TODO: avoid creating a new tensor every time
|
||||
|
@ -693,6 +738,23 @@ extern "C" {
|
|||
int n_dims,
|
||||
int mode);
|
||||
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_rope_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
int n_dims,
|
||||
int mode);
|
||||
|
||||
// rotary position embedding backward, i.e compute dx
|
||||
GGML_API struct ggml_tensor * ggml_rope_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * x,
|
||||
struct ggml_tensor * dy,
|
||||
int n_past,
|
||||
int n_dims,
|
||||
int mode);
|
||||
|
||||
// alibi position embedding
|
||||
// in-place, returns view(a)
|
||||
struct ggml_tensor * ggml_alibi(
|
||||
|
|
14
llama.cpp
14
llama.cpp
|
@ -1109,8 +1109,8 @@ static bool llama_eval_internal(
|
|||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
||||
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
||||
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
||||
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
||||
|
||||
// store key and value to memory
|
||||
{
|
||||
|
@ -1144,15 +1144,15 @@ static bool llama_eval_internal(
|
|||
|
||||
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
||||
struct ggml_tensor * KQ_scaled =
|
||||
ggml_scale(ctx0,
|
||||
ggml_scale_inplace(ctx0,
|
||||
KQ,
|
||||
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
|
||||
|
||||
// KQ_masked = mask_past(KQ_scaled)
|
||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
|
||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
||||
|
||||
// KQ = soft_max(KQ_masked)
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
||||
|
||||
// split cached V into n_head heads
|
||||
struct ggml_tensor * V =
|
||||
|
@ -1250,7 +1250,7 @@ static bool llama_eval_internal(
|
|||
lctx.use_buf(ctx0, -1);
|
||||
|
||||
// logits -> probs
|
||||
//inpL = ggml_soft_max(ctx0, inpL);
|
||||
//inpL = ggml_soft_max_inplace(ctx0, inpL);
|
||||
|
||||
// run the computation
|
||||
ggml_build_forward_expand(&gf, inpL);
|
||||
|
@ -2325,7 +2325,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|||
|
||||
if (scaling != 1.0f) {
|
||||
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
|
||||
BA = ggml_scale(lora_ctx, BA, scale_tensor);
|
||||
BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
|
||||
}
|
||||
|
||||
ggml_tensor * r;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue