diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index eb3ac9ac3..9d94bdfcf 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -946,406 +946,6 @@ struct ggml_tensor * forward_batch( return inpL; } -struct ggml_tensor * forward_batch_wo_cache( - struct my_llama_model * model, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_batch) { - - const int n_past = 0; - const int N = n_tokens; - - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - - GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); - struct ggml_tensor * tokens = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); - - // inpL shape [n_embd,N*n_batch,1] - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - assert_shape_2d(inpL, n_embd, N*n_batch); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // lctx.use_buf(ctx0, 0); - - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); - assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); - assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); - - // Vcur shape [N, n_batch, n_embd/n_head, n_head] - struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); - assert_shape_4d(Vcur, N, n_batch, n_embd/n_head, n_head); - - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Q shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // K shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * K = - ggml_permute(ctx0, - Kcur, - 0, 2, 1, 3); - assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); - - // K * Q - // KQ shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - assert_shape_4d(KQ, N, N, n_head, n_batch); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // KQ_scaled shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); - assert_shape_4d(KQ_scaled, N, N, n_head, n_batch); - - // KQ_masked = mask_past(KQ_scaled) - // KQ_masked shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - assert_shape_4d(KQ_masked, N, N, n_head, n_batch); - - // KQ = soft_max(KQ_masked) - // KQ_soft_max shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - assert_shape_4d(KQ_soft_max, N, N, n_head, n_batch); - - // Vcur shape [N, n_batch, n_embd/n_head, n_head] - // V shape [N, n_embd/n_head, n_head, n_batch] - struct ggml_tensor * V = - ggml_permute(ctx0, - Vcur, - 0, 3, 1, 2); - assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); - - // KQV shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); - // KQV_merged shape - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); - assert_shape_2d(cur, n_embd, N*n_batch); - - // projection (no bias) - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // lctx.use_buf(ctx0, 1); - - // inpFF shape [n_embd,N*n_batch,1,1] - struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); - assert_shape_2d(inpFF, n_embd, N*n_batch); - - // feed-forward network - { - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = ffn_norm*cur - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // tmp shape [n_ff,N*n_batch,1,1] - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - assert_shape_2d(tmp, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // SILU activation - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_silu(ctx0, cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul(ctx0, cur, tmp); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_add_inplace(ctx0, cur, inpFF); - assert_shape_2d(cur, n_embd, N*n_batch); - - // input for next layer - // inpL shape [n_embd,N*n_batch,1,1] - inpL = cur; - assert_shape_2d(inpL, n_embd, N*n_batch); - } - - // norm - { - - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(inpL, n_embd, N*n_batch); - - // inpL = norm*inpL - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - assert_shape_2d(inpL, n_embd, N*n_batch); - - //embeddings = inpL; - } - - // lm_head - // inpL shape [n_vocab,N*n_batch,1,1] - inpL = ggml_mul_mat(ctx0, model->output, inpL); - assert_shape_2d(inpL, n_vocab, N*n_batch); - - { - // inpL shape [n_vocab,N,n_batch,1] - inpL = ggml_reshape_3d(ctx0, - inpL, - n_vocab, N, n_batch); - assert_shape_3d(inpL, n_vocab, N, n_batch); - } - - // run the computation - // ggml_build_forward_expand(gf, inpL); - - return inpL; -} - -struct ggml_tensor * forward_batch_wo_cache_flash_attn( - struct my_llama_model * model, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_batch) { - - const int n_past = 0; - const int N = n_tokens; - - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - - - GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); - struct ggml_tensor * tokens = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); - - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - assert_shape_2d(inpL, n_embd, N*n_batch); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // norm - { - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); - assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); - assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); - - struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); - assert_shape_4d(Vcur, N, n_batch, n_embd/n_head, n_head); - - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - - struct ggml_tensor * K = - ggml_permute(ctx0, - Kcur, - 0, 2, 1, 3); - assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); - - struct ggml_tensor * V = - ggml_permute(ctx0, - Vcur, - 0, 3, 1, 2); - assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); - - bool masked = true; - struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, masked); - assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); - assert_shape_2d(cur, n_embd, N*n_batch); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - assert_shape_2d(inpFF, n_embd, N*n_batch); - - // feed-forward network - { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = ffn_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - assert_shape_2d(tmp, n_ff, N*n_batch); - - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // SILU activation - cur = ggml_silu(ctx0, cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - cur = ggml_mul(ctx0, cur, tmp); - assert_shape_2d(cur, n_ff, N*n_batch); - - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // cur = ggml_add_inplace(ctx0, cur, inpFF); - cur = ggml_add(ctx0, cur, inpFF); - assert_shape_2d(cur, n_embd, N*n_batch); - - // input for next layer - inpL = cur; - assert_shape_2d(inpL, n_embd, N*n_batch); - } - - // norm - { - - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(inpL, n_embd, N*n_batch); - - // inpL = norm*inpL - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - assert_shape_2d(inpL, n_embd, N*n_batch); - } - - // lm_head - inpL = ggml_mul_mat(ctx0, model->output, inpL); - assert_shape_2d(inpL, n_vocab, N*n_batch); - - { - inpL = ggml_reshape_3d(ctx0, - inpL, - n_vocab, N, n_batch); - assert_shape_3d(inpL, n_vocab, N, n_batch); - } - - // run the computation - // ggml_build_forward_expand(gf, inpL); - - return inpL; -} - - static size_t hash(void * p) { return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; } @@ -1703,1146 +1303,6 @@ struct ggml_tensor * llama_build_train_graphs( return t36; } - -// expand the graph nodes without creating leafs. -struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) { - // check if already visited - for (int i = 0; i < g->n_nodes; i++) { - if (g->nodes[i] == t) { - return t; - } - } - - for (int i = 0; i < g->n_leafs; i++) { - if (g->leafs[i] == t) { - return t; - } - } - - for (int i = 0; i < GGML_MAX_SRC; ++i) { - if (t->src[i]) { - expand(g, t->src[i]); - } - } - - GGML_ASSERT(g->n_nodes < GGML_MAX_NODES); - - if (strlen(t->name) == 0) { - snprintf(t->name, sizeof(t->name), "node_%d", g->n_nodes); - } - - g->nodes[g->n_nodes] = t; - g->grads[g->n_nodes] = t->grad; - g->n_nodes++; - return t; -} - -void graph_set_leafs_grads(struct ggml_cgraph * g) { - // moves leaf nodes to g->leafs. - // i.e. g->n_nodes might change. - int n_nodes = 0; - for (int i = 0; i < g->n_nodes; ++i) { - struct ggml_tensor * node = g->nodes[i]; - const bool is_leaf = node->op == GGML_OP_NONE && node->grad == NULL; - if (is_leaf) { - GGML_ASSERT(g->n_leafs < GGML_MAX_NODES); - - if (strlen(node->name) == 0) { - snprintf(node->name, sizeof(node->name), "leaf_%d", g->n_leafs); - } - - g->leafs[g->n_leafs] = node; - g->n_leafs++; - } else { - GGML_ASSERT(n_nodes < GGML_MAX_NODES); - - if (strlen(node->name) == 0) { - snprintf(node->name, sizeof(node->name), "node_%d", n_nodes); - } - - g->nodes[n_nodes] = node; - g->grads[n_nodes] = node->grad; - n_nodes++; - } - } - for (int i=n_nodes; i < g->n_nodes; ++i) { - g->nodes[i] = NULL; - g->grads[i] = NULL; - } - g->n_nodes = n_nodes; -} - -struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( - struct my_llama_model * model, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - struct ggml_tensor * * logits, - struct ggml_tensor * tokens_input, - struct ggml_tensor * targets, - void * compute_buf_0, - void * compute_buf_1, - size_t size_buf_0, - size_t size_buf_1, - const int n_tokens, - const int n_batch) { - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - - const int n_past = 0; - const int N = n_tokens; - - gf->n_nodes = 0; - gf->n_leafs = 0; - gf->perf_runs = 0; - gf->perf_cycles = 0; - gf->perf_time_us = 0; - - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - const int rope_mode = 0; - - bool track_max_mem = true; - - int last_buf = -1; - size_t buf_offs[2] = { 0, 0 }; - size_t buf_size[2] = { size_buf_0, - size_buf_1 }; - void * buf_data[2] = { compute_buf_0, - compute_buf_1 }; - size_t buf_maxs[2] = { 0, 0 }; - - auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs] (int buf) { - size_t last_offs = 0; - last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - if (last_buf >= 0) { - buf_offs[last_buf] = last_offs; - buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); - } - if (buf >= 0) { - size_t offs = buf_offs[buf]; - size_t size = buf_size[buf]; - void * data = buf_data[buf]; - ggml_set_scratch(ctx0, { offs, size, data, }); - } - last_buf = buf; - }; - - - auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] (int buf) { - if (buf < 0) return; - if (track_max_mem) { - size_t last_offs = 0; - last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - if (last_buf >= 0) { - buf_offs[last_buf] = last_offs; - buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); - } - } - buf_offs[buf] = 0; - if (track_max_mem && last_buf >= 0) { - size_t offs = buf_offs[last_buf]; - size_t size = buf_size[last_buf]; - void * data = buf_data[last_buf]; - ggml_set_scratch(ctx0, { offs, size, data, }); - } - }; - - - auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = n_embd/n_head; - int64_t ne1 = N; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = 0; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); - }; - - auto view__k = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = n_embd/n_head; - int64_t ne1 = N; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = nb3*ne3; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); - }; - - auto view__v = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = N; - int64_t ne1 = n_embd/n_head; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = 2*nb3*ne3; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); - }; - - auto add_or_set = [ctx0] (struct ggml_tensor * a, struct ggml_tensor * b) -> struct ggml_tensor * { - if (a == NULL) { - return b; - } else { - return ggml_add_inplace(ctx0, a, b); - } - }; - - use_buf(-1); - - model->tok_embeddings->grad = NULL; - model->norm->grad = NULL; - model->output->grad = NULL; - - for (int il = 0; il < n_layer; ++il) { - struct my_llama_layer & layer = model->layers[il]; - layer.attention_norm->grad = NULL; - layer.wq->grad = NULL; - layer.wk->grad = NULL; - layer.wv->grad = NULL; - layer.wo->grad = NULL; - layer.ffn_norm->grad = NULL; - layer.w1->grad = NULL; - layer.w2->grad = NULL; - layer.w3->grad = NULL; - } - - clr_buf(0); - clr_buf(1); - - use_buf(-1); - - GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); - struct ggml_tensor * t00 = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); assert_shape_1d(t00, N*n_batch); - - use_buf(-1); - - struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch); - - // need to remember these for the backward pass - std::vector t02L; t02L.resize(n_layer, NULL); - std::vector t03L; t03L.resize(n_layer, NULL); - std::vector t04L; t04L.resize(n_layer, NULL); - std::vector t05L; t05L.resize(n_layer, NULL); - std::vector t06L; t06L.resize(n_layer, NULL); - std::vector t07L; t07L.resize(n_layer, NULL); - std::vector t08L; t08L.resize(n_layer, NULL); - std::vector t09L; t09L.resize(n_layer, NULL); - std::vector t10L; t10L.resize(n_layer, NULL); - std::vector t11L; t11L.resize(n_layer, NULL); - std::vector t12L; t12L.resize(n_layer, NULL); - std::vector t13L; t13L.resize(n_layer, NULL); - std::vector t14L; t14L.resize(n_layer, NULL); - std::vector t15L; t15L.resize(n_layer, NULL); - std::vector t16L; t16L.resize(n_layer, NULL); - std::vector t17L; t17L.resize(n_layer, NULL); - std::vector t18L; t18L.resize(n_layer, NULL); - std::vector t19L; t19L.resize(n_layer, NULL); - std::vector t20L; t20L.resize(n_layer, NULL); - std::vector t21L; t21L.resize(n_layer, NULL); - std::vector t22L; t22L.resize(n_layer, NULL); - std::vector t23L; t23L.resize(n_layer, NULL); - std::vector t24L; t24L.resize(n_layer, NULL); - std::vector t25L; t25L.resize(n_layer, NULL); - std::vector t26L; t26L.resize(n_layer, NULL); - std::vector t27L; t27L.resize(n_layer, NULL); - std::vector t28L; t28L.resize(n_layer, NULL); - std::vector t29L; t29L.resize(n_layer, NULL); - std::vector t30L; t30L.resize(n_layer, NULL); - - struct ggml_tensor * cur = t01; - - for (int il = 0; il < n_layer; ++il) { - clr_buf(0); - struct my_llama_layer & layer = model->layers[il]; - // tensors with values necessary for backward pass are in persistent buf(-1) - // other tensors with buf(0) and buf(1) are only temporary needed, and their memory reused after layer is completed. - use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t02, n_embd, N*n_batch); - use_buf( 0); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); - use_buf(-1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); - use_buf(-1); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); - use_buf(-1); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); - use_buf(-1); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); - use_buf(-1); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); - use_buf( 0); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); - use_buf( 0); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21, rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch); - use_buf( 0); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); - use_buf(-1); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); - use_buf(-1); struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); - use_buf(-1); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); - use_buf( 0); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); - t02L[il] = t02; - t03L[il] = t03; - t04L[il] = t04; - t05L[il] = t05; - t06L[il] = t06; - t07L[il] = t07; - t08L[il] = t08; - t09L[il] = t09; - t10L[il] = t10; - t11L[il] = t11; - t12L[il] = t12; - t13L[il] = t13; - t14L[il] = t14; - t15L[il] = t15; - t16L[il] = t16; - t17L[il] = t17; - t18L[il] = t18; - t19L[il] = t19; - t20L[il] = t20; - t21L[il] = t21; - t22L[il] = t22; - t23L[il] = t23; - t24L[il] = t24; - t25L[il] = t25; - t26L[il] = t26; - t27L[il] = t27; - t28L[il] = t28; - t29L[il] = t29; - t30L[il] = t30; - - cur = t30; - } - clr_buf(0); - use_buf(0); - struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t31, n_embd, N*n_batch); - struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); - struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); - use_buf(-1); - struct ggml_tensor * t34 = expand(gf, ggml_mul_mat (ctx0, model->output, t33)); assert_shape_2d(t34, n_vocab, N*n_batch); - struct ggml_tensor * t35 = expand(gf, ggml_reshape_3d(ctx0, t34, n_vocab, N, n_batch)); assert_shape_3d(t35, n_vocab, N, n_batch); - struct ggml_tensor * t36 = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets)); assert_shape_1d(t36, 1); - - { - /* - tok_embeddings | grad_tok_embeddings = ggml_get_rows_back(grad_t01, t00) - L0_att_norm | grad_L0_att_norm = ggml_repeat_back(grad_t03L0, L0_att_norm.shape) - L0_wq | grad_L0_wq = ggml_out_prod(t04L0, grad_t05L0) - L0_wk | grad_L0_wk = ggml_out_prod(t04L0, grad_t08L0) - L0_wv | grad_L0_wv = ggml_out_prod(t04L0, ggml_transpose(grad_t11L0)) - L0_wo | grad_L0_wo = ggml_out_prod(t19L0, grad_t20L0) - L0_ffn_norm | grad_L0_ffn_norm = ggml_repeat_back(grad_t23L0, L0_ffn_norm.shape) - L0_w1 | grad_L0_w1 = ggml_out_prod(t24L0, grad_t26L0) - L0_w2 | grad_L0_w2 = ggml_out_prod(t28L0, grad_t29L0) - L0_w3 | grad_L0_w3 = ggml_out_prod(t24L0, grad_t25L0) - L1_att_norm | grad_L1_att_norm = ggml_repeat_back(grad_t03L1, L1_att_norm.shape) - L1_wq | grad_L1_wq = ggml_out_prod(t04L1, grad_t05L1) - L1_wk | grad_L1_wk = ggml_out_prod(t04L1, grad_t08L1) - L1_wv | grad_L1_wv = ggml_out_prod(t04L1, ggml_transpose(grad_t11L1)) - L1_wo | grad_L1_wo = ggml_out_prod(t19L1, grad_t20L1) - L1_ffn_norm | grad_L1_ffn_norm = ggml_repeat_back(grad_t23L1, L1_ffn_norm.shape) - L1_w1 | grad_L1_w1 = ggml_out_prod(t24L1, grad_t26L1) - L1_w2 | grad_L1_w2 = ggml_out_prod(t28L1, grad_t29L1) - L1_w3 | grad_L1_w3 = ggml_out_prod(t24L1, grad_t25L1) - norm | grad_norm = ggml_repeat_back(grad_t32, norm.shape) - output | grad_output = ggml_out_prod(t33, grad_t34) - | - t01 = ggml_get_rows(tok_embeddings, t00) | grad_t01 = grad_t21L0 + ggml_rms_norm_back(t01, grad_t02L0) - for layer: | - t02L0*= ggml_rms_norm (t01) | grad_t02L0 = ggml_mul(grad_t04L0, t03L0) - t03L0 = ggml_repeat (L0_att_norm, t02L0_shape) | grad_t03L0 = ggml_mul(grad_t04L0, t02L0) - t04L0*= ggml_mul (t02L0, t03L0) | grad_t04L0 = ggml_out_prod(L0_wv, grad_t11L0) + ggml_out_prod(L0_wk, ggml_transpose(grad_t08L0)) + ggml_out_prod(L0_wq, ggml_transpose(grad_t05L0)) - t05L0 = ggml_mul_mat (L0_wq, t04L0) | grad_t05L0 = ggml_reshape(grad_t06L0, t05L0_shape) - t06L0 = ggml_reshape_4d (t05L0, n_embd/n_head, n_head, N, n_batch) | grad_t06L0 = ggml_rope_back(grad_t07L0) - t07L0 = ggml_rope_inplace (t06L0) | grad_t07L0 = ggml_permute_back(grad_t13L0, 0, 2, 1, 3) = ggml_permute(grad_t13L0, 0, 2, 1, 3) - t08L0 = ggml_mul_mat (L0_wk, t04L0) | grad_t08L0 = ggml_reshape(grad_t09L0, t08L0_shape) - t09L0 = ggml_reshape_4d (t08L0, n_embd/n_head, n_head, N, n_batch) | grad_t09L0 = ggml_rope_back(grad_t10L0) - t10L0 = ggml_rope_inplace (t09L0) | grad_t10L0 = ggml_permute_back(grad_t14L0, 0, 2, 1, 3) = ggml_permute(grad_t14L0, 0, 2, 1, 3) - t11L0 = ggml_mul_mat (t04L0, L0_wv) | grad_t11L0 = ggml_reshape(grad_t12L0, t11L0_shape) - t12L0 = ggml_reshape_4d (t11L0, N, n_batch, n_embd/n_head, n_head) | grad_t12L0 = ggml_permute_back(grad_t15L0, 0, 3, 1, 2) = ggml_permute(grad_t15L0, 0, 2, 3, 1) - t13L0*= ggml_permute (t07L0, 0, 2, 1, 3) | grad_t13L0 = view__q(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) - t14L0*= ggml_permute (t10L0, 0, 2, 1, 3) | grad_t14L0 = view__k(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) - t15L0*= ggml_permute (t12L0, 0, 3, 1, 2) | grad_t15L0 = view__v(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) - t16L0 = ggml_flash_attn (t13L0, t14L0, t15L0) | grad_t16L0 = ggml_permute_back(grad_t17L0, 0, 2, 1, 3) = ggml_permute(grad_t17L0, 0, 2, 1, 3) - t17L0 = ggml_permute (t16L0, 0, 2, 1, 3) | grad_t17L0 = grad_t18L0 - t18L0 = ggml_cont (t17L0) | grad_t18L0 = ggml_reshape(grad_t19L0, t18L0_shape) - t19L0*= ggml_reshape_2d (t18L0, n_embd, N*n_batch) | grad_t19L0 = ggml_out_prod(L0_wo, ggml_transpose(grad_t20L0)) - t20L0 = ggml_mul_mat (L0_wo, t19L0) | grad_t20L0 = grad_t21L0 - t21L0*= ggml_add (t20L0, t01) | grad_t21L0 = grad_t30L0 + ggml_rms_norm_back(t21L0, grad_t22L0) - t22L0*= ggml_rms_norm (t21L0) | grad_t22L0 = ggml_mul(grad_t24L0, t23L0) - t23L0 = ggml_repeat (L0_ffn_norm, t22L0_shape) | grad_t23L0 = ggml_mul(grad_t24L0, t22L0) - t24L0*= ggml_mul (t23L0, t22L0) | grad_t24L0 = ggml_out_prod(L0_w1, ggml_transpose(grad_t26L0)) + ggml_out_prod(L0_w3, ggml_transpose(grad_t25L0)) - t25L0*= ggml_mul_mat (L0_w3, t24L0) | grad_t25L0 = ggml_mul(grad_t28L0, t27L0) - t26L0*= ggml_mul_mat (L0_w1, t24L0) | grad_t26L0 = ggml_silu_back(t26L0, grad_t27L0) - t27L0*= ggml_silu (t26L0) | grad_t27L0 = ggml_mul(grad_t28L0, t25L0) - t28L0*= ggml_mul (t27L0, t25L0) | grad_t28L0 = ggml_out_prod(L0_w2, ggml_transpose(grad_t29L0)) - t29L0 = ggml_mul_mat (L0_w2, t28L0) | grad_t29L0 = grad_t30L0 - t30L0*= ggml_add (t21L0, t29L0) | grad_t30L0 = ggml_rms_norm_back(t30L0, grad_t02L1) + grad_t21L1 - ^ - t02L1*= ggml_rms_norm (t30L0) | grad_t02L1 = ggml_mul(grad_t04L1, t03L1) - t03L1 = ggml_repeat (L1_att_norm, t02L1_shape) | grad_t03L1 = ggml_mul(grad_t04L1, t02L1) - t04L1*= ggml_mul (t02L1, t03L1) | grad_t04L1 = ggml_out_prod(L1_wv, grad_t11L1) + ggml_out_prod(L1_wk, ggml_transpose(grad_t08L1)) + ggml_out_prod(L1_wq, ggml_transpose(grad_t05L1)) - t05L1 = ggml_mul_mat (L1_wq, t04L1) | grad_t05L1 = ggml_reshape(grad_t06L1, t05L1_shape) - t06L1 = ggml_reshape_4d (t05L1, n_embd/n_head, n_head, N, n_batch) | grad_t06L1 = ggml_rope_back(grad_t07L1) - t07L1 = ggml_rope_inplace (t06L1) | grad_t07L1 = ggml_permute_back(grad_t13L1, 0, 2, 1, 3) = ggml_permute(grad_t13L1, 0, 2, 1, 3) - t08L1 = ggml_mul_mat (L1_wk, t04L1) | grad_t08L1 = ggml_reshape(grad_t09L1, t08L1_shape) - t09L1 = ggml_reshape_4d (t08L1, n_embd/n_head, n_head, N, n_batch) | grad_t09L1 = ggml_rope_back(grad_t10L1) - t10L1 = ggml_rope_inplace (t09L1) | grad_t10L1 = ggml_permute_back(grad_t14L1, 0, 2, 1, 3) = ggml_permute(grad_t14L1, 0, 2, 1, 3) - t11L1 = ggml_mul_mat (t04L1, L1_wv) | grad_t11L1 = ggml_reshape(grad_t12L1, t11L1_shape) - t12L1 = ggml_reshape_4d (t11L1, N, n_batch, n_embd/n_head, n_head) | grad_t12L1 = ggml_permute_back(grad_t15L1, 0, 3, 1, 2) = ggml_permute(grad_t15L1, 0, 2, 3, 1) - t13L1*= ggml_permute (t07L1, 0, 2, 1, 3) | grad_t13L1 = view__q(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) - t14L1*= ggml_permute (t10L1, 0, 2, 1, 3) | grad_t14L1 = view__k(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) - t15L1*= ggml_permute (t12L1, 0, 3, 1, 2) | grad_t15L1 = view__v(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) - t16L1 = ggml_flash_attn (t13L1, t14L1, t15L1) | grad_t16L1 = ggml_permute_back(grad_t17L1, 0, 2, 1, 3) = ggml_permute(grad_t17L1, 0, 2, 1, 3) - t17L1 = ggml_permute (t16L1, 0, 2, 1, 3) | grad_t17L1 = grad_t18L1 - t18L1 = ggml_cont (t17L1) | grad_t18L1 = ggml_reshape(grad_t19L1, t18L1_shape) - t19L1*= ggml_reshape_2d (t18L1, n_embd, N*n_batch) | grad_t19L1 = ggml_out_prod(L1_wo, ggml_transpose(grad_t20L1)) - t20L1 = ggml_mul_mat (L1_wo, t19L1) | grad_t20L1 = grad_t21L1 - t21L1*= ggml_add (t20L1, t30L0) | grad_t21L1 = grad_t30L1 + ggml_rms_norm_back(t21L1, grad_t22L1) - t22L1*= ggml_rms_norm (t21L1) | grad_t22L1 = ggml_mul(grad_t24L1, t23L1) - t23L1 = ggml_repeat (L1_ffn_norm, t22L1_shape) | grad_t23L1 = ggml_mul(grad_t24L1, t22L1) - t24L1*= ggml_mul (t23L1, t22L1) | grad_t24L1 = ggml_out_prod(L1_w1, ggml_transpose(grad_t26L1)) + ggml_out_prod(L1_w3, ggml_transpose(grad_t25L1)) - t25L1*= ggml_mul_mat (L1_w3, t24L1) | grad_t25L1 = ggml_mul(grad_t28L1, t27L1) - t26L1*= ggml_mul_mat (L1_w1, t24L1) | grad_t26L1 = ggml_silu_back(t26L1, grad_t27L1) - t27L1*= ggml_silu (t26L1) | grad_t27L1 = ggml_mul(grad_t28L1, t25L1) - t28L1*= ggml_mul (t27L1, t25L1) | grad_t28L1 = ggml_out_prod(L1_w2, ggml_transpose(grad_t29L1)) - t29L1 = ggml_mul_mat (L1_w2, t28L1) | grad_t29L1 = grad_t30L1 - t30L1*= ggml_add (t21L1, t29L1) | grad_t30L1 = ggml_rms_norm_back(t30L1, grad_t31) - ^ - t31 = ggml_rms_norm (t30L1) | grad_t31 = ggml_mul(grad_t33, t32) - t32 = ggml_repeat (norm, t31.shape) | grad_t32 = ggml_mul(grad_t33, t31) - t33 = ggml_mul (t32, t31) | grad_t33 = ggml_out_prod(output, ggml_transpose(grad_t34)) - t34 = ggml_mul_mat (output, t33) | grad_t34 = ggml_reshape(grad_t35, t34.shape) - t35 = ggml_reshape_3d (t34, n_vocab, N, n_batch) | grad_t35 = ggml_cross_entropy_loss_back(t35, targets, grad_t36) - t36 = ggml_cross_entropy_loss(t35, targets) | grad_t36 = 1 (optimizer) - tensors marked with * need to be stored until grad computation - tensors during grad computation are all temporary - */ - } - - *gb = *gf; - - // t36->grad gets set to one by optimizer, so we need the tensor. - // initialize it with 1.0f to make sure. - use_buf(-1); - t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); - - use_buf(0); - t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); - t34->grad = expand(gb, ggml_reshape_2d (ctx0, t35->grad, n_vocab, N*n_batch)); assert_shape_2d(t34->grad, n_vocab, N*n_batch); - t33->grad = expand(gb, ggml_out_prod (ctx0, model->output, ggml_transpose(ctx0, t34->grad))); assert_shape_2d(t33->grad, n_embd, N*n_batch); - t32->grad = expand(gb, ggml_mul (ctx0, t33->grad, t31)); assert_shape_2d(t32->grad, n_embd, N*n_batch); - - use_buf(-1); - - model->norm->grad = expand(gb, add_or_set(model->norm->grad, ggml_repeat_back(ctx0, t32->grad, model->norm))); assert_shape_1d(model->norm->grad, n_embd); - model->output->grad = expand(gb, add_or_set(model->output->grad, ggml_out_prod(ctx0, t33, t34->grad))); assert_shape_2d(model->output->grad, n_embd, n_vocab); - - clr_buf(1); - use_buf(1); - t31->grad = expand(gb, ggml_mul(ctx0, t33->grad, t32)); assert_shape_2d(t31->grad, n_embd, N*n_batch); - - struct ggml_tensor * back_layer_inp = t31; - struct ggml_tensor * grad_layer_inp = NULL; - - for (int k = 0; k < n_layer; ++k) { - int il = n_layer-1-k; - struct my_llama_layer & layer = model->layers[il]; - - struct ggml_tensor * t02 = t02L[il]; - struct ggml_tensor * t03 = t03L[il]; - struct ggml_tensor * t04 = t04L[il]; - struct ggml_tensor * t05 = t05L[il]; - struct ggml_tensor * t06 = t06L[il]; - struct ggml_tensor * t07 = t07L[il]; - struct ggml_tensor * t08 = t08L[il]; - struct ggml_tensor * t09 = t09L[il]; - struct ggml_tensor * t10 = t10L[il]; - struct ggml_tensor * t11 = t11L[il]; - struct ggml_tensor * t12 = t12L[il]; - struct ggml_tensor * t13 = t13L[il]; - struct ggml_tensor * t14 = t14L[il]; - struct ggml_tensor * t15 = t15L[il]; - struct ggml_tensor * t16 = t16L[il]; - struct ggml_tensor * t17 = t17L[il]; - struct ggml_tensor * t18 = t18L[il]; - struct ggml_tensor * t19 = t19L[il]; - struct ggml_tensor * t20 = t20L[il]; - struct ggml_tensor * t21 = t21L[il]; - struct ggml_tensor * t22 = t22L[il]; - struct ggml_tensor * t23 = t23L[il]; - struct ggml_tensor * t24 = t24L[il]; - struct ggml_tensor * t25 = t25L[il]; - struct ggml_tensor * t26 = t26L[il]; - struct ggml_tensor * t27 = t27L[il]; - struct ggml_tensor * t28 = t28L[il]; - struct ggml_tensor * t29 = t29L[il]; - struct ggml_tensor * t30 = t30L[il]; - - clr_buf(0); - use_buf(0); - t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad, rms_norm_eps)); assert_shape_2d(t30->grad, n_embd, N*n_batch); - if (grad_layer_inp) { - t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); - } - clr_buf(1); - t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); - t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad))); assert_shape_2d(t28->grad, n_ff, N*n_batch); - t27->grad = expand(gb, ggml_mul(ctx0, t28->grad, t25)); assert_shape_2d(t27->grad, n_ff, N*n_batch); - t26->grad = expand(gb, ggml_silu_back(ctx0, t26, t27->grad)); assert_shape_2d(t26->grad, n_ff, N*n_batch); - t25->grad = expand(gb, ggml_mul(ctx0, t28->grad, t27)); assert_shape_2d(t25->grad, n_ff, N*n_batch); - t24->grad = expand(gb, ggml_add_inplace(ctx0, - ggml_out_prod(ctx0, layer.w1, ggml_transpose(ctx0, t26->grad)), - ggml_out_prod(ctx0, layer.w3, ggml_transpose(ctx0, t25->grad)))); assert_shape_2d(t24->grad, n_embd, N*n_batch); - t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); - t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); - use_buf(1); - t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad, rms_norm_eps))); assert_shape_2d(t21->grad, n_embd, N*n_batch); - grad_layer_inp = t21; - use_buf(0); - t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); - t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad))); assert_shape_2d(t19->grad, n_embd, N*n_batch); - t18->grad = expand(gb, ggml_reshape_4d(ctx0, t19->grad, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); - t17->grad = t18->grad; assert_shape_4d(t17->grad, n_embd/n_head, n_head, N, n_batch); - t16->grad = expand(gb, ggml_permute(ctx0, t17->grad, 0, 2, 1, 3)); assert_shape_4d(t16->grad, n_embd/n_head, N, n_head, n_batch); - struct ggml_tensor * flash_attn = expand(gb, ggml_flash_attn_back(ctx0, t13, t14, t15, t16->grad, true)); assert_shape_4d(flash_attn, n_embd/n_head, N*3, n_head, n_batch); - t15->grad = expand(gb, view__v(flash_attn)); assert_shape_4d(t15->grad, N, n_embd/n_head, n_head, n_batch); - t14->grad = expand(gb, view__k(flash_attn)); assert_shape_4d(t14->grad, n_embd/n_head, N, n_head, n_batch); - t13->grad = expand(gb, view__q(flash_attn)); assert_shape_4d(t13->grad, n_embd/n_head, N, n_head, n_batch); - t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); - t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd); - t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); - t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); - t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch); - t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); - t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); - t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch); - t04->grad = expand(gb, ggml_add_inplace(ctx0, - ggml_add_inplace(ctx0, - ggml_out_prod(ctx0, layer.wv, t11->grad), - ggml_out_prod(ctx0, layer.wk, ggml_transpose(ctx0, t08->grad))), - ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad)))); assert_shape_2d(t04->grad, n_embd, N*n_batch); - t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, N*n_batch); - use_buf(1); - t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, ggml_repeat(ctx0, layer.attention_norm, t02))); assert_shape_2d(t02->grad, n_embd, N*n_batch); - back_layer_inp = t02; - // use_buf(0); - - use_buf(-1); - layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, layer.attention_norm))); assert_shape_1d(layer.attention_norm->grad, n_embd); - layer.wq->grad = expand(gb, add_or_set(layer.wq->grad, ggml_out_prod(ctx0, t04, t05->grad))); assert_shape_2d(layer.wq->grad, n_embd, n_embd); - layer.wk->grad = expand(gb, add_or_set(layer.wk->grad, ggml_out_prod(ctx0, t04, t08->grad))); assert_shape_2d(layer.wk->grad, n_embd, n_embd); - layer.wv->grad = expand(gb, add_or_set(layer.wv->grad, ggml_out_prod(ctx0, t04, ggml_transpose(ctx0, t11->grad)))); assert_shape_2d(layer.wv->grad, n_embd, n_embd); - layer.wo->grad = expand(gb, add_or_set(layer.wo->grad, ggml_out_prod(ctx0, t19, t20->grad))); assert_shape_2d(layer.wo->grad, n_embd, n_embd); - layer.ffn_norm->grad = expand(gb, add_or_set(layer.ffn_norm->grad, ggml_repeat_back(ctx0, t23->grad, layer.ffn_norm))); assert_shape_1d(layer.ffn_norm->grad, n_embd); - layer.w1->grad = expand(gb, add_or_set(layer.w1->grad, ggml_out_prod(ctx0, t24, t26->grad))); assert_shape_2d(layer.w1->grad, n_embd, n_ff); - layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); - layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); - // use_buf(0); - } - clr_buf(0); - use_buf(0); - t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad, rms_norm_eps))); assert_shape_2d(t01->grad, n_embd, N*n_batch); - use_buf(-1); - model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); - // clr_buf(1); - // clr_buf(0); - - *logits = t35; - - clr_buf(0); - clr_buf(1); - - if (track_max_mem) { - printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]); - printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); - } - - // now that all grads are created, set the graph leafs and grads - graph_set_leafs_grads(gf); - graph_set_leafs_grads(gb); - - return t36; -} - -struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( - struct my_llama_model * model, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - struct ggml_tensor * * logits, - struct ggml_tensor * tokens_input, - struct ggml_tensor * targets, - void * compute_buf_0, - void * compute_buf_1, - void * compute_buf_2, - size_t size_buf_0, - size_t size_buf_1, - size_t size_buf_2, - const int n_tokens, - const int n_batch) { - - // implements gradient-checkpointing as explained in readme of https://github.com/cybertronai/gradient-checkpointing - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - - const int n_past = 0; - const int N = n_tokens; - - gf->n_nodes = 0; - gf->n_leafs = 0; - gf->perf_runs = 0; - gf->perf_cycles = 0; - gf->perf_time_us = 0; - - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - const int rope_mode = 0; - - bool track_max_mem = true; - - int last_buf = -1; - size_t buf_offs[3] = { 0, 0, 0 }; - size_t buf_size[3] = { size_buf_0, - size_buf_1, - size_buf_2 }; - void * buf_data[3] = { compute_buf_0, - compute_buf_1, - compute_buf_2 }; - size_t buf_maxs[3] = { 0, 0, 0 }; - - auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs] (int buf) { - size_t last_offs = 0; - last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - if (last_buf >= 0) { - buf_offs[last_buf] = last_offs; - buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); - } - if (buf >= 0) { - size_t offs = buf_offs[buf]; - size_t size = buf_size[buf]; - void * data = buf_data[buf]; - ggml_set_scratch(ctx0, { offs, size, data, }); - } - last_buf = buf; - }; - - - auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] (int buf) { - if (buf < 0) return; - if (track_max_mem) { - size_t last_offs = 0; - last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - if (last_buf >= 0) { - buf_offs[last_buf] = last_offs; - buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); - } - } - buf_offs[buf] = 0; - if (track_max_mem && last_buf >= 0) { - size_t offs = buf_offs[last_buf]; - size_t size = buf_size[last_buf]; - void * data = buf_data[last_buf]; - ggml_set_scratch(ctx0, { offs, size, data, }); - } - }; - - - auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = n_embd/n_head; - int64_t ne1 = N; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = 0; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); - }; - - auto view__k = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = n_embd/n_head; - int64_t ne1 = N; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = nb3*ne3; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); - }; - - auto view__v = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = N; - int64_t ne1 = n_embd/n_head; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = 2*nb3*ne3; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); - }; - - auto add_or_set = [ctx0] (struct ggml_tensor * a, struct ggml_tensor * b) -> struct ggml_tensor * { - if (a == NULL) { - return b; - } else { - return ggml_add_inplace(ctx0, a, b); - } - }; - - use_buf(-1); - - model->tok_embeddings->grad = NULL; - model->norm->grad = NULL; - model->output->grad = NULL; - - for (int il = 0; il < n_layer; ++il) { - struct my_llama_layer & layer = model->layers[il]; - layer.attention_norm->grad = NULL; - layer.wq->grad = NULL; - layer.wk->grad = NULL; - layer.wv->grad = NULL; - layer.wo->grad = NULL; - layer.ffn_norm->grad = NULL; - layer.w1->grad = NULL; - layer.w2->grad = NULL; - layer.w3->grad = NULL; - } - - clr_buf(0); - clr_buf(1); - clr_buf(2); - - use_buf(-1); - - GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); - struct ggml_tensor * t00 = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); assert_shape_1d(t00, N*n_batch); - - use_buf(-1); - - struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch); - - - { - // given: n, u, v - // objective: minimize(a*u+b*v) where a*b=n, a>0, b>0 - // b=n/a - // minimize(a*u+v*n/a) - // diff(a*u+v*n/a, a) = u - (v*n/a)/a - // diff(a*u+v*n/a, a) == 0 - // u - (v*n/a)/a == 0 - // u == v*n/(a*a) - // u*a*a = v*n - // a*a = v*n/u - // a = sqrt(n*v/u) - } - - float memcost_checkpoint = n_embd; // (..)*N*n_batch - float memcost_snd_fwd_pass = 14*n_embd+4*n_ff; // (..)*N*n_batch - - int n_checkstep = (int)(sqrtf(n_layer*memcost_checkpoint/memcost_snd_fwd_pass) + 0.5f); - if (n_checkstep < 1) { - n_checkstep = 1; - } - std::vector checkpoints; - for (int chk = n_checkstep-1; chk+1 < n_layer; chk += n_checkstep) { - checkpoints.push_back(chk); - } - int n_check = checkpoints.size(); - // printf("%s: n_check = %d n_checkstep = %d\n", __func__, n_check, n_checkstep); - - // for (int i = 0; i < n_check; ++i) { - // printf("%s: checkpoint #%d = %d\n", __func__, i, checkpoints[i]); - // } - - // example for 16 layers and memcost_checkpoint=memcost_snd_fwd_pass: - // inp ~ implicit zeroth checkpoint == input - // L00 f 4b [ - // L01 f 4b 4th second forward pass - // L02 f 4b - // L03 fc4b ] first checkpoint - // L04 f 3b [ - // L05 f 3b 3rd second forward pass - // L06 f 3b - // L07 fc3b ] second checkpoint - // L08 f 2b [ - // L09 f 2b 2nd second forward pass - // L10 f 2b - // L11 fc2b ] third checkpoint - // L12 f 1b [ - // L13 f 1b 1st second forward pass - // L14 f 1b - // L15 f 1b ] - - // need to remember these for the backward pass - std::vector t02L; t02L.resize(n_layer, NULL); - std::vector t03L; t03L.resize(n_layer, NULL); - std::vector t04L; t04L.resize(n_layer, NULL); - std::vector t05L; t05L.resize(n_layer, NULL); - std::vector t06L; t06L.resize(n_layer, NULL); - std::vector t07L; t07L.resize(n_layer, NULL); - std::vector t08L; t08L.resize(n_layer, NULL); - std::vector t09L; t09L.resize(n_layer, NULL); - std::vector t10L; t10L.resize(n_layer, NULL); - std::vector t11L; t11L.resize(n_layer, NULL); - std::vector t12L; t12L.resize(n_layer, NULL); - std::vector t13L; t13L.resize(n_layer, NULL); - std::vector t14L; t14L.resize(n_layer, NULL); - std::vector t15L; t15L.resize(n_layer, NULL); - std::vector t16L; t16L.resize(n_layer, NULL); - std::vector t17L; t17L.resize(n_layer, NULL); - std::vector t18L; t18L.resize(n_layer, NULL); - std::vector t19L; t19L.resize(n_layer, NULL); - std::vector t20L; t20L.resize(n_layer, NULL); - std::vector t21L; t21L.resize(n_layer, NULL); - std::vector t22L; t22L.resize(n_layer, NULL); - std::vector t23L; t23L.resize(n_layer, NULL); - std::vector t24L; t24L.resize(n_layer, NULL); - std::vector t25L; t25L.resize(n_layer, NULL); - std::vector t26L; t26L.resize(n_layer, NULL); - std::vector t27L; t27L.resize(n_layer, NULL); - std::vector t28L; t28L.resize(n_layer, NULL); - std::vector t29L; t29L.resize(n_layer, NULL); - std::vector t30L; t30L.resize(n_layer, NULL); - - struct ggml_tensor * cur = t01; - - int chk_idx = 0; - for (int il = 0; il < n_layer; ++il) { - struct my_llama_layer & layer = model->layers[il]; - // tensors with values necessary for backward pass are in persistent buf(-1) - // other tensors with buf(0), buf(1), etc are only temporary needed, and their memory reused - bool is_checkpoint = (chk_idx < n_check && il == checkpoints[chk_idx]); - if (is_checkpoint) { - // printf("%s: layer %d is_checkpoint\n", __func__, il); - chk_idx += 1; - } - const int prs = 0; // in first forward pass even persistent tensors are only temporary - const int tmp = 0; // temporary - // nxt is required to compute next layer. - // for checkpoints we need to remember this for usage in backward pass, - // otherwise temporary until next of this kind - const int nxt = is_checkpoint ? -1 : 1; - clr_buf(0); - use_buf(prs); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t02, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); - use_buf(prs); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); - use_buf(prs); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); - use_buf(tmp); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21, rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); - use_buf(tmp); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); - clr_buf( 1); - use_buf(nxt); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); - - // only t30L is remembered for checkpointing in first forward pass - if (is_checkpoint) { - t30L[il] = t30; - } - cur = t30; - } - clr_buf(0); - use_buf(0); - struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t31, n_embd, N*n_batch); - struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); - struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); - use_buf(-1); - struct ggml_tensor * t34 = expand(gf, ggml_mul_mat (ctx0, model->output, t33)); assert_shape_2d(t34, n_vocab, N*n_batch); - struct ggml_tensor * t35 = expand(gf, ggml_reshape_3d(ctx0, t34, n_vocab, N, n_batch)); assert_shape_3d(t35, n_vocab, N, n_batch); - struct ggml_tensor * t36 = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets)); assert_shape_1d(t36, 1); - - *gb = *gf; - - // t36->grad gets set to one by optimizer, so we need the tensor. - // initialize it with 1.0f to make sure. - use_buf(-1); - t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); - - use_buf(0); - t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); - t34->grad = expand(gb, ggml_reshape_2d (ctx0, t35->grad, n_vocab, N*n_batch)); assert_shape_2d(t34->grad, n_vocab, N*n_batch); - t33->grad = expand(gb, ggml_out_prod (ctx0, model->output, ggml_transpose(ctx0, t34->grad))); assert_shape_2d(t33->grad, n_embd, N*n_batch); - t32->grad = expand(gb, ggml_mul (ctx0, t33->grad, t31)); assert_shape_2d(t32->grad, n_embd, N*n_batch); - - use_buf(-1); - - model->norm->grad = expand(gb, add_or_set(model->norm->grad, ggml_repeat_back(ctx0, t32->grad, model->norm))); assert_shape_1d(model->norm->grad, n_embd); - model->output->grad = expand(gb, add_or_set(model->output->grad, ggml_out_prod(ctx0, t33, t34->grad))); assert_shape_2d(model->output->grad, n_embd, n_vocab); - - clr_buf(1); - use_buf(1); - t31->grad = expand(gb, ggml_mul(ctx0, t33->grad, t32)); assert_shape_2d(t31->grad, n_embd, N*n_batch); - - struct ggml_tensor * back_layer_inp = t31; - struct ggml_tensor * grad_layer_inp = NULL; - - // printf("%s: n_check = %u\n", __func__, n_check); - chk_idx = n_check-1; - int avail_begin = n_layer; - int avail_end = n_layer; - // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); - for (int k = 0; k < n_layer; ++k) { - // second forward pass for checkpointing - int il = n_layer-1-k; - if (il < avail_begin) { - // make sure, that txxL[il] is available - // forward pass from last checkpoint - GGML_ASSERT(chk_idx >= -1); - int begin = (chk_idx == -1) - ? 0 - : checkpoints[chk_idx] + 1; // checkpoint[chk_idx] contains t30 for computing following layers -> +1 - int end = (chk_idx+1 < n_check) - ? (checkpoints[chk_idx+1] + 1) - : n_layer; - GGML_ASSERT(begin <= il); - GGML_ASSERT(il < end); - cur = (chk_idx == -1) ? t01 : t30L[checkpoints[chk_idx]]; - clr_buf(2); - // printf("%s: second forward pass chk_idx=%d begin=%d end=%d\n", __func__, chk_idx, begin, end); - for (int i = begin; i < end; ++i) { - struct my_llama_layer & layer = model->layers[i]; - const int prs = 2; // persistent until next checkpoint - const int tmp = 0; // temporary for this layer - const bool is_checkpoint = (i == end-1); - clr_buf(0); - use_buf(prs); struct ggml_tensor * t02 = expand(gb, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t02, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t03 = expand(gb, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t04 = expand(gb, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t05 = expand(gb, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t06 = expand(gb, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t07 = expand(gb, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t08 = expand(gb, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t09 = expand(gb, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t10 = expand(gb, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t11 = expand(gb, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); - use_buf(prs); struct ggml_tensor * t12 = expand(gb, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); - use_buf(prs); struct ggml_tensor * t13 = expand(gb, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t14 = expand(gb, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t15 = expand(gb, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t16 = expand(gb, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); - use_buf(tmp); struct ggml_tensor * t17 = expand(gb, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t18 = expand(gb, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t19 = expand(gb, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t20 = expand(gb, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t21 = expand(gb, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t22 = expand(gb, ggml_rms_norm (ctx0, t21, rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t23 = expand(gb, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t24 = expand(gb, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t25 = expand(gb, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor * t26 = expand(gb, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor * t27 = expand(gb, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor * t28 = expand(gb, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); - use_buf(tmp); struct ggml_tensor * t29 = expand(gb, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); - if (t30L[i] == NULL) { - use_buf(prs); struct ggml_tensor * t30 = expand(gb, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); - t30L[i] = t30; - cur = t30; - } - t02L[i] = t02; - t03L[i] = t03; - t04L[i] = t04; - t05L[i] = t05; - t06L[i] = t06; - t07L[i] = t07; - t08L[i] = t08; - t09L[i] = t09; - t10L[i] = t10; - t11L[i] = t11; - t12L[i] = t12; - t13L[i] = t13; - t14L[i] = t14; - t15L[i] = t15; - t16L[i] = t16; - t17L[i] = t17; - t18L[i] = t18; - t19L[i] = t19; - t20L[i] = t20; - t21L[i] = t21; - t22L[i] = t22; - t23L[i] = t23; - t24L[i] = t24; - t25L[i] = t25; - t26L[i] = t26; - t27L[i] = t27; - t28L[i] = t28; - t29L[i] = t29; - } - --chk_idx; - avail_begin = begin; - avail_end = end; - // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); - } - // printf("%s: backward pass il=%d\n", __func__, il); - - struct my_llama_layer & layer = model->layers[il]; - - struct ggml_tensor * t02 = t02L[il]; - struct ggml_tensor * t03 = t03L[il]; - struct ggml_tensor * t04 = t04L[il]; - struct ggml_tensor * t05 = t05L[il]; - struct ggml_tensor * t06 = t06L[il]; - struct ggml_tensor * t07 = t07L[il]; - struct ggml_tensor * t08 = t08L[il]; - struct ggml_tensor * t09 = t09L[il]; - struct ggml_tensor * t10 = t10L[il]; - struct ggml_tensor * t11 = t11L[il]; - struct ggml_tensor * t12 = t12L[il]; - struct ggml_tensor * t13 = t13L[il]; - struct ggml_tensor * t14 = t14L[il]; - struct ggml_tensor * t15 = t15L[il]; - struct ggml_tensor * t16 = t16L[il]; - struct ggml_tensor * t17 = t17L[il]; - struct ggml_tensor * t18 = t18L[il]; - struct ggml_tensor * t19 = t19L[il]; - struct ggml_tensor * t20 = t20L[il]; - struct ggml_tensor * t21 = t21L[il]; - struct ggml_tensor * t22 = t22L[il]; - struct ggml_tensor * t23 = t23L[il]; - struct ggml_tensor * t24 = t24L[il]; - struct ggml_tensor * t25 = t25L[il]; - struct ggml_tensor * t26 = t26L[il]; - struct ggml_tensor * t27 = t27L[il]; - struct ggml_tensor * t28 = t28L[il]; - struct ggml_tensor * t29 = t29L[il]; - struct ggml_tensor * t30 = t30L[il]; - - clr_buf(0); - use_buf(0); - t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad, rms_norm_eps)); assert_shape_2d(t30->grad, n_embd, N*n_batch); - if (grad_layer_inp) { - t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); - } - clr_buf(1); - t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); - t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad))); assert_shape_2d(t28->grad, n_ff, N*n_batch); - t27->grad = expand(gb, ggml_mul(ctx0, t28->grad, t25)); assert_shape_2d(t27->grad, n_ff, N*n_batch); - t26->grad = expand(gb, ggml_silu_back(ctx0, t26, t27->grad)); assert_shape_2d(t26->grad, n_ff, N*n_batch); - t25->grad = expand(gb, ggml_mul(ctx0, t28->grad, t27)); assert_shape_2d(t25->grad, n_ff, N*n_batch); - t24->grad = expand(gb, ggml_add_inplace(ctx0, - ggml_out_prod(ctx0, layer.w1, ggml_transpose(ctx0, t26->grad)), - ggml_out_prod(ctx0, layer.w3, ggml_transpose(ctx0, t25->grad)))); assert_shape_2d(t24->grad, n_embd, N*n_batch); - t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); - t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); - use_buf(1); - t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad, rms_norm_eps))); assert_shape_2d(t21->grad, n_embd, N*n_batch); - grad_layer_inp = t21; - use_buf(0); - t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); - t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad))); assert_shape_2d(t19->grad, n_embd, N*n_batch); - t18->grad = expand(gb, ggml_reshape_4d(ctx0, t19->grad, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); - t17->grad = t18->grad; assert_shape_4d(t17->grad, n_embd/n_head, n_head, N, n_batch); - t16->grad = expand(gb, ggml_permute(ctx0, t17->grad, 0, 2, 1, 3)); assert_shape_4d(t16->grad, n_embd/n_head, N, n_head, n_batch); - struct ggml_tensor * flash_attn = expand(gb, ggml_flash_attn_back(ctx0, t13, t14, t15, t16->grad, true)); assert_shape_4d(flash_attn, n_embd/n_head, N*3, n_head, n_batch); - t15->grad = expand(gb, view__v(flash_attn)); assert_shape_4d(t15->grad, N, n_embd/n_head, n_head, n_batch); - t14->grad = expand(gb, view__k(flash_attn)); assert_shape_4d(t14->grad, n_embd/n_head, N, n_head, n_batch); - t13->grad = expand(gb, view__q(flash_attn)); assert_shape_4d(t13->grad, n_embd/n_head, N, n_head, n_batch); - t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); - t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd); - t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); - t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); - t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch); - t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); - t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); - t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch); - t04->grad = expand(gb, ggml_add_inplace(ctx0, - ggml_add_inplace(ctx0, - ggml_out_prod(ctx0, layer.wv, t11->grad), - ggml_out_prod(ctx0, layer.wk, ggml_transpose(ctx0, t08->grad))), - ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad)))); assert_shape_2d(t04->grad, n_embd, N*n_batch); - t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, N*n_batch); - use_buf(1); - t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, ggml_repeat(ctx0, layer.attention_norm, t02))); assert_shape_2d(t02->grad, n_embd, N*n_batch); - back_layer_inp = t02; - - use_buf(-1); - layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, layer.attention_norm))); assert_shape_1d(layer.attention_norm->grad, n_embd); - layer.wq->grad = expand(gb, add_or_set(layer.wq->grad, ggml_out_prod(ctx0, t04, t05->grad))); assert_shape_2d(layer.wq->grad, n_embd, n_embd); - layer.wk->grad = expand(gb, add_or_set(layer.wk->grad, ggml_out_prod(ctx0, t04, t08->grad))); assert_shape_2d(layer.wk->grad, n_embd, n_embd); - layer.wv->grad = expand(gb, add_or_set(layer.wv->grad, ggml_out_prod(ctx0, t04, ggml_transpose(ctx0, t11->grad)))); assert_shape_2d(layer.wv->grad, n_embd, n_embd); - layer.wo->grad = expand(gb, add_or_set(layer.wo->grad, ggml_out_prod(ctx0, t19, t20->grad))); assert_shape_2d(layer.wo->grad, n_embd, n_embd); - layer.ffn_norm->grad = expand(gb, add_or_set(layer.ffn_norm->grad, ggml_repeat_back(ctx0, t23->grad, layer.ffn_norm))); assert_shape_1d(layer.ffn_norm->grad, n_embd); - layer.w1->grad = expand(gb, add_or_set(layer.w1->grad, ggml_out_prod(ctx0, t24, t26->grad))); assert_shape_2d(layer.w1->grad, n_embd, n_ff); - layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); - layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); - } - // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); - GGML_ASSERT(n_check == 0 || chk_idx == -2); - GGML_ASSERT(avail_begin == 0); - clr_buf(0); - use_buf(0); - t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad, rms_norm_eps))); assert_shape_2d(t01->grad, n_embd, N*n_batch); - use_buf(-1); - model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); - - *logits = t35; - - clr_buf(0); - clr_buf(1); - clr_buf(2); - - if (track_max_mem) { - printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]); - printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); - printf("%s: max size compute buf2: %zu\n", __func__, buf_maxs[2]); - } - - // now that all grads are created, set the graph leafs and grads - graph_set_leafs_grads(gf); - graph_set_leafs_grads(gb); - - return t36; -} - void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); *ptr = value; @@ -4485,44 +2945,14 @@ int main(int argc, char ** argv) { struct ggml_tensor * loss = NULL; struct ggml_tensor * logits = NULL; - if (params.use_alloc || params.use_unified) { - loss = llama_build_train_graphs( - &model, alloc, ctx0, - gf, gb, gb_tmp, - &logits, tokens_input, target_probs, - n_tokens, n_batch, - params.use_flash, - params.use_checkpointing - ); - } else if (params.use_checkpointing) { - loss = forward_batch_wo_cache_flash_attn_train_grad_checkpointing( - &model, ctx0, - gf, gb, - &logits, tokens_input, target_probs, - compute_buf_0, compute_buf_1, compute_buf_2, - size_buf_0, size_buf_1, size_buf_2, - n_tokens, n_batch); - } else if (params.use_scratch) { - loss = forward_batch_wo_cache_flash_attn_train( - &model, ctx0, - gf, gb, - &logits, tokens_input, target_probs, - compute_buf_0, compute_buf_1, - size_buf_0, size_buf_1, - n_tokens, n_batch); - } else if (params.use_flash) { - logits = forward_batch_wo_cache_flash_attn(&model, ctx0, gf, tokens_input, n_tokens, n_batch); - loss = cross_entropy_loss(ctx0, logits, target_probs); - ggml_build_forward_expand(gf, loss); - *gb = *gf; - ggml_build_backward_expand(ctx0, gf, gb, true); - } else { - logits = forward_batch_wo_cache(&model, ctx0, gf, tokens_input, n_tokens, n_batch); - loss = cross_entropy_loss(ctx0, logits, target_probs); - ggml_build_forward_expand(gf, loss); - *gb = *gf; - ggml_build_backward_expand(ctx0, gf, gb, true); - } + loss = llama_build_train_graphs( + &model, alloc, ctx0, + gf, gb, gb_tmp, + &logits, tokens_input, target_probs, + n_tokens, n_batch, + params.use_flash, + params.use_checkpointing + ); size_t used_mem_before_opt = ggml_used_mem(ctx0);