diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index eb3ac9ac3..9d94bdfcf 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -946,406 +946,6 @@ struct ggml_tensor * forward_batch(
     return inpL;
 }
 
-struct ggml_tensor * forward_batch_wo_cache(
-        struct my_llama_model * model,
-        struct ggml_context   * ctx0,
-        struct ggml_cgraph    * gf,
-        struct ggml_tensor    * tokens_input,
-        const  int              n_tokens,
-        const  int              n_batch) {
-
-    const int n_past = 0;
-    const int N = n_tokens;
-
-    const auto & hparams = model->hparams;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_vocab = hparams.n_vocab;
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_head  = hparams.n_head;
-    const int n_rot   = hparams.n_rot;
-    const int n_ff    = get_n_ff(&hparams);
-
-    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
-    struct ggml_tensor * tokens = ggml_reshape_1d(ctx0, tokens_input, N*n_batch);
-
-    // inpL shape [n_embd,N*n_batch,1]
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
-    assert_shape_2d(inpL, n_embd, N*n_batch);
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * inpSA = inpL;
-
-        struct ggml_tensor * cur;
-
-        // lctx.use_buf(ctx0, 0);
-
-        // norm
-        {
-            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-
-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
-                        cur);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-        }
-
-        // self-attention
-        {
-            // compute Q and K and RoPE them
-            // wq   shape [n_embd, n_embd, 1, 1]
-            // wk   shape [n_embd, n_embd, 1, 1]
-            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
-            // Kcur shape [n_embd/n_head, n_head, N, n_batch]
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx);
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx);
-            assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
-            assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
-
-            // Vcur shape [N, n_batch, n_embd/n_head, n_head]
-            struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head);
-            assert_shape_4d(Vcur, N, n_batch, n_embd/n_head, n_head);
-
-            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
-            // Q shape    [n_embd/n_head, N, n_head, n_batch]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        Qcur,
-                        0, 2, 1, 3);
-            assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);
-
-            // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
-            // K shape [n_embd/n_head, N, n_head, n_batch]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        Kcur,
-                        0, 2, 1, 3);
-            assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch);
-
-            // K * Q
-            // KQ shape [N, N, n_head, n_batch]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-            assert_shape_4d(KQ, N, N, n_head, n_batch);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // KQ_scaled shape [N, N, n_head, n_batch]
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0,
-                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
-            assert_shape_4d(KQ_scaled, N, N, n_head, n_batch);
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // KQ_masked shape [N, N, n_head, n_batch]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-            assert_shape_4d(KQ_masked, N, N, n_head, n_batch);
-
-            // KQ = soft_max(KQ_masked)
-            // KQ_soft_max shape [N, N, n_head, n_batch]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
-            assert_shape_4d(KQ_soft_max, N, N, n_head, n_batch);
-
-            // Vcur shape [N, n_batch, n_embd/n_head, n_head]
-            // V shape    [N, n_embd/n_head, n_head, n_batch]
-            struct ggml_tensor * V =
-                ggml_permute(ctx0,
-                    Vcur,
-                    0, 3, 1, 2);
-            assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch);
-
-            // KQV shape [n_embd/n_head, N, n_head, n_batch]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-            assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // KQV_merged shape [n_embd/n_head, n_head, N, n_batch]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-            assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch);
-            // KQV_merged shape
-
-            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-
-            // projection (no bias)
-            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].wo,
-                    cur);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-        }
-
-        // lctx.use_buf(ctx0, 1);
-
-        // inpFF shape [n_embd,N*n_batch,1,1]
-        struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA);
-        assert_shape_2d(inpFF, n_embd, N*n_batch);
-
-        // feed-forward network
-        {
-            // norm
-            {
-                // cur shape [n_embd,N*n_batch,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
-                assert_shape_2d(cur, n_embd, N*n_batch);
-
-                // cur = ffn_norm*cur
-                // cur shape [n_embd,N*n_batch,1,1]
-                cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
-                        cur);
-                assert_shape_2d(cur, n_embd, N*n_batch);
-            }
-
-            // tmp shape [n_ff,N*n_batch,1,1]
-            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
-                    model->layers[il].w3,
-                    cur);
-            assert_shape_2d(tmp, n_ff, N*n_batch);
-
-            // cur shape [n_ff,N*n_batch,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].w1,
-                    cur);
-            assert_shape_2d(cur, n_ff, N*n_batch);
-
-            // SILU activation
-            // cur shape [n_ff,N*n_batch,1,1]
-            cur = ggml_silu(ctx0, cur);
-            assert_shape_2d(cur, n_ff, N*n_batch);
-
-            // cur shape [n_ff,N*n_batch,1,1]
-            cur = ggml_mul(ctx0, cur, tmp);
-            assert_shape_2d(cur, n_ff, N*n_batch);
-
-            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].w2,
-                    cur);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-        }
-
-        // cur shape [n_embd,N*n_batch,1,1]
-        cur = ggml_add_inplace(ctx0, cur, inpFF);
-        assert_shape_2d(cur, n_embd, N*n_batch);
-
-        // input for next layer
-        // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = cur;
-        assert_shape_2d(inpL, n_embd, N*n_batch);
-    }
-
-    // norm
-    {
-
-        // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-        assert_shape_2d(inpL, n_embd, N*n_batch);
-
-        // inpL = norm*inpL
-        // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model->norm, inpL),
-                    inpL);
-
-        assert_shape_2d(inpL, n_embd, N*n_batch);
-
-        //embeddings = inpL;
-    }
-
-    // lm_head
-    // inpL shape [n_vocab,N*n_batch,1,1]
-    inpL = ggml_mul_mat(ctx0, model->output, inpL);
-    assert_shape_2d(inpL, n_vocab, N*n_batch);
-
-    {
-        // inpL shape [n_vocab,N,n_batch,1]
-        inpL = ggml_reshape_3d(ctx0,
-                        inpL,
-                        n_vocab, N, n_batch);
-        assert_shape_3d(inpL, n_vocab, N, n_batch);
-    }
-
-    // run the computation
-    // ggml_build_forward_expand(gf, inpL);
-
-    return inpL;
-}
-
-struct ggml_tensor * forward_batch_wo_cache_flash_attn(
-        struct my_llama_model * model,
-        struct ggml_context   * ctx0,
-        struct ggml_cgraph    * gf,
-        struct ggml_tensor    * tokens_input,
-        const  int              n_tokens,
-        const  int              n_batch) {
-
-    const int n_past = 0;
-    const int N = n_tokens;
-
-    const auto & hparams = model->hparams;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_vocab = hparams.n_vocab;
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_head  = hparams.n_head;
-    const int n_rot   = hparams.n_rot;
-    const int n_ff    = get_n_ff(&hparams);
-
-
-    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
-    struct ggml_tensor * tokens = ggml_reshape_1d(ctx0, tokens_input, N*n_batch);
-
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
-    assert_shape_2d(inpL, n_embd, N*n_batch);
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * inpSA = inpL;
-
-        struct ggml_tensor * cur;
-
-        // norm
-        {
-            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-
-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
-                        cur);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-        }
-
-        // self-attention
-        {
-            // compute Q and K and RoPE them
-            // wq   shape [n_embd, n_embd, 1, 1]
-            // wk   shape [n_embd, n_embd, 1, 1]
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx);
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx);
-            assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
-            assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
-
-            struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head);
-            assert_shape_4d(Vcur, N, n_batch, n_embd/n_head, n_head);
-
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        Qcur,
-                        0, 2, 1, 3);
-            assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);
-
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        Kcur,
-                        0, 2, 1, 3);
-            assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch);
-
-            struct ggml_tensor * V =
-                ggml_permute(ctx0,
-                    Vcur,
-                    0, 3, 1, 2);
-            assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch);
-
-            bool masked = true;
-            struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, masked);
-            assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch);
-
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-            assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch);
-            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-
-            // projection (no bias)
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].wo,
-                    cur);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-        }
-
-        // struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA);
-        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
-        assert_shape_2d(inpFF, n_embd, N*n_batch);
-
-        // feed-forward network
-        {
-            // norm
-            {
-                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
-                assert_shape_2d(cur, n_embd, N*n_batch);
-
-                // cur = ffn_norm*cur
-                cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
-                        cur);
-                assert_shape_2d(cur, n_embd, N*n_batch);
-            }
-
-            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
-                    model->layers[il].w3,
-                    cur);
-            assert_shape_2d(tmp, n_ff, N*n_batch);
-
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].w1,
-                    cur);
-            assert_shape_2d(cur, n_ff, N*n_batch);
-
-            // SILU activation
-            cur = ggml_silu(ctx0, cur);
-            assert_shape_2d(cur, n_ff, N*n_batch);
-
-            cur = ggml_mul(ctx0, cur, tmp);
-            assert_shape_2d(cur, n_ff, N*n_batch);
-
-            cur = ggml_mul_mat(ctx0,
-                    model->layers[il].w2,
-                    cur);
-            assert_shape_2d(cur, n_embd, N*n_batch);
-        }
-
-        // cur = ggml_add_inplace(ctx0, cur, inpFF);
-        cur = ggml_add(ctx0, cur, inpFF);
-        assert_shape_2d(cur, n_embd, N*n_batch);
-
-        // input for next layer
-        inpL = cur;
-        assert_shape_2d(inpL, n_embd, N*n_batch);
-    }
-
-    // norm
-    {
-
-        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-        assert_shape_2d(inpL, n_embd, N*n_batch);
-
-        // inpL = norm*inpL
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model->norm, inpL),
-                    inpL);
-
-        assert_shape_2d(inpL, n_embd, N*n_batch);
-    }
-
-    // lm_head
-    inpL = ggml_mul_mat(ctx0, model->output, inpL);
-    assert_shape_2d(inpL, n_vocab, N*n_batch);
-
-    {
-        inpL = ggml_reshape_3d(ctx0,
-                        inpL,
-                        n_vocab, N, n_batch);
-        assert_shape_3d(inpL, n_vocab, N, n_batch);
-    }
-
-    // run the computation
-    // ggml_build_forward_expand(gf, inpL);
-
-    return inpL;
-}
-
-
 static size_t hash(void * p) {
     return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
 }
@@ -1703,1146 +1303,6 @@ struct ggml_tensor * llama_build_train_graphs(
     return t36;
 }
 
-
-// expand the graph nodes without creating leafs.
-struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) {
-    // check if already visited
-    for (int i = 0; i < g->n_nodes; i++) {
-        if (g->nodes[i] == t) {
-            return t;
-        }
-    }
-
-    for (int i = 0; i < g->n_leafs; i++) {
-        if (g->leafs[i] == t) {
-            return t;
-        }
-    }
-
-    for (int i = 0; i < GGML_MAX_SRC; ++i) {
-        if (t->src[i]) {
-            expand(g, t->src[i]);
-        }
-    }
-
-    GGML_ASSERT(g->n_nodes < GGML_MAX_NODES);
-
-    if (strlen(t->name) == 0) {
-        snprintf(t->name, sizeof(t->name), "node_%d", g->n_nodes);
-    }
-
-    g->nodes[g->n_nodes] = t;
-    g->grads[g->n_nodes] = t->grad;
-    g->n_nodes++;
-    return t;
-}
-
-void graph_set_leafs_grads(struct ggml_cgraph * g) {
-    // moves leaf nodes to g->leafs.
-    // i.e. g->n_nodes might change.
-    int n_nodes = 0;
-    for (int i = 0; i < g->n_nodes; ++i) {
-        struct ggml_tensor * node = g->nodes[i];
-        const bool is_leaf = node->op == GGML_OP_NONE && node->grad == NULL;
-        if (is_leaf) {
-            GGML_ASSERT(g->n_leafs < GGML_MAX_NODES);
-
-            if (strlen(node->name) == 0) {
-                snprintf(node->name, sizeof(node->name), "leaf_%d", g->n_leafs);
-            }
-
-            g->leafs[g->n_leafs] = node;
-            g->n_leafs++;
-        } else {
-            GGML_ASSERT(n_nodes < GGML_MAX_NODES);
-
-            if (strlen(node->name) == 0) {
-                snprintf(node->name, sizeof(node->name), "node_%d", n_nodes);
-            }
-
-            g->nodes[n_nodes] = node;
-            g->grads[n_nodes] = node->grad;
-            n_nodes++;
-        }
-    }
-    for (int i=n_nodes; i < g->n_nodes; ++i) {
-        g->nodes[i] = NULL;
-        g->grads[i] = NULL;
-    }
-    g->n_nodes = n_nodes;
-}
-
-struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
-        struct my_llama_model * model,
-        struct ggml_context   * ctx0,
-        struct ggml_cgraph    * gf,
-        struct ggml_cgraph    * gb,
-        struct ggml_tensor  * * logits,
-        struct ggml_tensor    * tokens_input,
-        struct ggml_tensor    * targets,
-        void                  * compute_buf_0,
-        void                  * compute_buf_1,
-        size_t                  size_buf_0,
-        size_t                  size_buf_1,
-        const  int              n_tokens,
-        const  int              n_batch) {
-
-    ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-
-    const int n_past = 0;
-    const int N = n_tokens;
-
-    gf->n_nodes = 0;
-    gf->n_leafs = 0;
-    gf->perf_runs = 0;
-    gf->perf_cycles = 0;
-    gf->perf_time_us = 0;
-
-    const auto & hparams = model->hparams;
-    const int n_ctx      = hparams.n_ctx;
-    const int n_vocab    = hparams.n_vocab;
-    const int n_embd     = hparams.n_embd;
-    const int n_layer    = hparams.n_layer;
-    const int n_head     = hparams.n_head;
-    const int n_rot      = hparams.n_rot;
-    const int n_ff       = get_n_ff(&hparams);
-    const int rope_mode  = 0;
-
-    bool track_max_mem = true;
-
-    int last_buf = -1;
-    size_t buf_offs[2] = { 0, 0 };
-    size_t buf_size[2] = { size_buf_0,
-                           size_buf_1 };
-    void * buf_data[2] = { compute_buf_0,
-                           compute_buf_1 };
-    size_t buf_maxs[2] = { 0, 0 };
-
-    auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs] (int buf) {
-        size_t last_offs = 0;
-        last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-        if (last_buf >= 0) {
-            buf_offs[last_buf] = last_offs;
-            buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]);
-        }
-        if (buf >= 0) {
-            size_t offs = buf_offs[buf];
-            size_t size = buf_size[buf];
-            void * data = buf_data[buf];
-            ggml_set_scratch(ctx0, { offs, size, data, });
-        }
-        last_buf = buf;
-    };
-
-
-    auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] (int buf) {
-        if (buf < 0) return;
-        if (track_max_mem) {
-            size_t last_offs = 0;
-            last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-            if (last_buf >= 0) {
-                buf_offs[last_buf] = last_offs;
-                buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]);
-            }
-        }
-        buf_offs[buf] = 0;
-        if (track_max_mem && last_buf >= 0) {
-            size_t offs = buf_offs[last_buf];
-            size_t size = buf_size[last_buf];
-            void * data = buf_data[last_buf];
-            ggml_set_scratch(ctx0, { offs, size, data, });
-        }
-    };
-
-
-    auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * {
-        int64_t ne0 = n_embd/n_head;
-        int64_t ne1 = N;
-        int64_t ne2 = n_head;
-        int64_t ne3 = n_batch;
-        size_t  nb0 = ggml_element_size(t);
-        size_t  nb1 = nb0*ne0;
-        size_t  nb2 = nb1*ne1;
-        size_t  nb3 = nb2*ne2;
-        size_t offset = 0;
-        return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset);
-    };
-
-    auto view__k = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * {
-        int64_t ne0 = n_embd/n_head;
-        int64_t ne1 = N;
-        int64_t ne2 = n_head;
-        int64_t ne3 = n_batch;
-        size_t  nb0 = ggml_element_size(t);
-        size_t  nb1 = nb0*ne0;
-        size_t  nb2 = nb1*ne1;
-        size_t  nb3 = nb2*ne2;
-        size_t offset = nb3*ne3;
-        return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset);
-    };
-
-    auto view__v = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * {
-        int64_t ne0 = N;
-        int64_t ne1 = n_embd/n_head;
-        int64_t ne2 = n_head;
-        int64_t ne3 = n_batch;
-        size_t  nb0 = ggml_element_size(t);
-        size_t  nb1 = nb0*ne0;
-        size_t  nb2 = nb1*ne1;
-        size_t  nb3 = nb2*ne2;
-        size_t offset = 2*nb3*ne3;
-        return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset);
-    };
-
-    auto add_or_set = [ctx0] (struct ggml_tensor * a, struct ggml_tensor * b) -> struct ggml_tensor * {
-        if (a == NULL) {
-            return b;
-        } else {
-            return ggml_add_inplace(ctx0, a, b);
-        }
-    };
-
-    use_buf(-1);
-
-    model->tok_embeddings->grad    = NULL;
-    model->norm->grad              = NULL;
-    model->output->grad            = NULL;
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct my_llama_layer & layer = model->layers[il];
-        layer.attention_norm->grad = NULL;
-        layer.wq->grad             = NULL;
-        layer.wk->grad             = NULL;
-        layer.wv->grad             = NULL;
-        layer.wo->grad             = NULL;
-        layer.ffn_norm->grad       = NULL;
-        layer.w1->grad             = NULL;
-        layer.w2->grad             = NULL;
-        layer.w3->grad             = NULL;
-    }
-
-    clr_buf(0);
-    clr_buf(1);
-
-    use_buf(-1);
-
-    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
-    struct ggml_tensor * t00 = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); assert_shape_1d(t00, N*n_batch);
-
-    use_buf(-1);
-
-    struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch);
-
-    // need to remember these for the backward pass
-    std::vector<struct ggml_tensor *> t02L; t02L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t03L; t03L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t04L; t04L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t05L; t05L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t06L; t06L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t07L; t07L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t08L; t08L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t09L; t09L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t10L; t10L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t11L; t11L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t12L; t12L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t13L; t13L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t14L; t14L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t15L; t15L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t16L; t16L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t17L; t17L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t18L; t18L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t19L; t19L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t20L; t20L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t21L; t21L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t22L; t22L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t23L; t23L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t24L; t24L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t25L; t25L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t26L; t26L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t27L; t27L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t28L; t28L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t29L; t29L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t30L; t30L.resize(n_layer, NULL);
-
-    struct ggml_tensor * cur = t01;
-
-    for (int il = 0; il < n_layer; ++il) {
-        clr_buf(0);
-        struct my_llama_layer & layer = model->layers[il];
-        // tensors with values necessary for backward pass are in persistent buf(-1)
-        // other tensors with buf(0) and buf(1) are only temporary needed, and their memory reused after layer is completed.
-        use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm     (ctx0, cur, rms_norm_eps));                      assert_shape_2d(t02, n_embd, N*n_batch);
-        use_buf( 0); struct ggml_tensor * t03 = expand(gf, ggml_repeat       (ctx0, layer.attention_norm, t02));              assert_shape_2d(t03, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul          (ctx0, t02, t03));                               assert_shape_2d(t04, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat      (ctx0, layer.wq, t04));                          assert_shape_2d(t05, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d   (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
-        use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx));   assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
-        use_buf(-1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat      (ctx0, layer.wk, t04));                          assert_shape_2d(t08, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d   (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
-        use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx));   assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
-        use_buf(-1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat      (ctx0, t04, layer.wv));                          assert_shape_2d(t11, N*n_batch, n_embd);
-        use_buf(-1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d   (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head);
-        use_buf(-1); struct ggml_tensor * t13 = expand(gf, ggml_permute      (ctx0, t07, 0, 2, 1, 3));                        assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch);
-        use_buf(-1); struct ggml_tensor * t14 = expand(gf, ggml_permute      (ctx0, t10, 0, 2, 1, 3));                        assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch);
-        use_buf(-1); struct ggml_tensor * t15 = expand(gf, ggml_permute      (ctx0, t12, 0, 3, 1, 2));                        assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch);
-        use_buf(-1); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn   (ctx0, t13, t14, t15, true));                    assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
-        use_buf( 0); struct ggml_tensor * t17 = expand(gf, ggml_permute      (ctx0, t16, 0, 2, 1, 3));                        assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch);
-        use_buf(-1); struct ggml_tensor * t18 = expand(gf, ggml_cont         (ctx0, t17));                                    assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch);
-        use_buf(-1); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d   (ctx0, t18, n_embd, N*n_batch));                 assert_shape_2d(t19, n_embd, N*n_batch);
-        use_buf( 0); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat      (ctx0, layer.wo, t19));                          assert_shape_2d(t20, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t21 = expand(gf, ggml_add          (ctx0, t20, cur));                               assert_shape_2d(t21, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm     (ctx0, t21, rms_norm_eps));                      assert_shape_2d(t22, n_embd, N*n_batch);
-        use_buf( 0); struct ggml_tensor * t23 = expand(gf, ggml_repeat       (ctx0, layer.ffn_norm, t22));                    assert_shape_2d(t23, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t24 = expand(gf, ggml_mul          (ctx0, t23, t22));                               assert_shape_2d(t24, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat      (ctx0, layer.w3, t24));                          assert_shape_2d(t25, n_ff, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat      (ctx0, layer.w1, t24));                          assert_shape_2d(t26, n_ff, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t27 = expand(gf, ggml_silu         (ctx0, t26));                                    assert_shape_2d(t27, n_ff, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t28 = expand(gf, ggml_mul          (ctx0, t27, t25));                               assert_shape_2d(t28, n_ff, N*n_batch);
-        use_buf( 0); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat      (ctx0, layer.w2, t28));                          assert_shape_2d(t29, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t30 = expand(gf, ggml_add          (ctx0, t21, t29));                               assert_shape_2d(t30, n_embd, N*n_batch);
-        t02L[il] = t02;
-        t03L[il] = t03;
-        t04L[il] = t04;
-        t05L[il] = t05;
-        t06L[il] = t06;
-        t07L[il] = t07;
-        t08L[il] = t08;
-        t09L[il] = t09;
-        t10L[il] = t10;
-        t11L[il] = t11;
-        t12L[il] = t12;
-        t13L[il] = t13;
-        t14L[il] = t14;
-        t15L[il] = t15;
-        t16L[il] = t16;
-        t17L[il] = t17;
-        t18L[il] = t18;
-        t19L[il] = t19;
-        t20L[il] = t20;
-        t21L[il] = t21;
-        t22L[il] = t22;
-        t23L[il] = t23;
-        t24L[il] = t24;
-        t25L[il] = t25;
-        t26L[il] = t26;
-        t27L[il] = t27;
-        t28L[il] = t28;
-        t29L[il] = t29;
-        t30L[il] = t30;
-
-        cur      = t30;
-    }
-    clr_buf(0);
-    use_buf(0);
-    struct ggml_tensor * t31   = expand(gf, ggml_rms_norm  (ctx0, cur, rms_norm_eps));         assert_shape_2d(t31, n_embd, N*n_batch);
-    struct ggml_tensor * t32   = expand(gf, ggml_repeat    (ctx0, model->norm, t31));          assert_shape_2d(t32, n_embd, N*n_batch);
-    struct ggml_tensor * t33   = expand(gf, ggml_mul       (ctx0, t32, t31));                  assert_shape_2d(t33, n_embd, N*n_batch);
-    use_buf(-1);
-    struct ggml_tensor * t34   = expand(gf, ggml_mul_mat   (ctx0, model->output, t33));        assert_shape_2d(t34, n_vocab, N*n_batch);
-    struct ggml_tensor * t35   = expand(gf, ggml_reshape_3d(ctx0, t34, n_vocab, N, n_batch));  assert_shape_3d(t35, n_vocab, N, n_batch);
-    struct ggml_tensor * t36   = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets));      assert_shape_1d(t36, 1);
-
-    {
-        /*
-        tok_embeddings                                                        | grad_tok_embeddings = ggml_get_rows_back(grad_t01, t00)
-        L0_att_norm                                                           | grad_L0_att_norm    = ggml_repeat_back(grad_t03L0, L0_att_norm.shape)
-        L0_wq                                                                 | grad_L0_wq          = ggml_out_prod(t04L0, grad_t05L0)
-        L0_wk                                                                 | grad_L0_wk          = ggml_out_prod(t04L0, grad_t08L0)
-        L0_wv                                                                 | grad_L0_wv          = ggml_out_prod(t04L0, ggml_transpose(grad_t11L0))
-        L0_wo                                                                 | grad_L0_wo          = ggml_out_prod(t19L0, grad_t20L0)
-        L0_ffn_norm                                                           | grad_L0_ffn_norm    = ggml_repeat_back(grad_t23L0, L0_ffn_norm.shape)
-        L0_w1                                                                 | grad_L0_w1          = ggml_out_prod(t24L0, grad_t26L0)
-        L0_w2                                                                 | grad_L0_w2          = ggml_out_prod(t28L0, grad_t29L0)
-        L0_w3                                                                 | grad_L0_w3          = ggml_out_prod(t24L0, grad_t25L0)
-        L1_att_norm                                                           | grad_L1_att_norm    = ggml_repeat_back(grad_t03L1, L1_att_norm.shape)
-        L1_wq                                                                 | grad_L1_wq          = ggml_out_prod(t04L1, grad_t05L1)
-        L1_wk                                                                 | grad_L1_wk          = ggml_out_prod(t04L1, grad_t08L1)
-        L1_wv                                                                 | grad_L1_wv          = ggml_out_prod(t04L1, ggml_transpose(grad_t11L1))
-        L1_wo                                                                 | grad_L1_wo          = ggml_out_prod(t19L1, grad_t20L1)
-        L1_ffn_norm                                                           | grad_L1_ffn_norm    = ggml_repeat_back(grad_t23L1, L1_ffn_norm.shape)
-        L1_w1                                                                 | grad_L1_w1          = ggml_out_prod(t24L1, grad_t26L1)
-        L1_w2                                                                 | grad_L1_w2          = ggml_out_prod(t28L1, grad_t29L1)
-        L1_w3                                                                 | grad_L1_w3          = ggml_out_prod(t24L1, grad_t25L1)
-        norm                                                                  | grad_norm           = ggml_repeat_back(grad_t32, norm.shape)
-        output                                                                | grad_output         = ggml_out_prod(t33, grad_t34)
-                                                                              |
-        t01 = ggml_get_rows(tok_embeddings, t00)                              | grad_t01   = grad_t21L0 + ggml_rms_norm_back(t01, grad_t02L0)
-        for layer:                                                            |
-        t02L0*= ggml_rms_norm     (t01)                                       | grad_t02L0 = ggml_mul(grad_t04L0, t03L0)
-        t03L0 = ggml_repeat       (L0_att_norm, t02L0_shape)                  | grad_t03L0 = ggml_mul(grad_t04L0, t02L0)
-        t04L0*= ggml_mul          (t02L0, t03L0)                              | grad_t04L0 = ggml_out_prod(L0_wv, grad_t11L0) + ggml_out_prod(L0_wk, ggml_transpose(grad_t08L0)) + ggml_out_prod(L0_wq, ggml_transpose(grad_t05L0))
-        t05L0 = ggml_mul_mat      (L0_wq, t04L0)                              | grad_t05L0 = ggml_reshape(grad_t06L0, t05L0_shape)
-        t06L0 = ggml_reshape_4d   (t05L0, n_embd/n_head, n_head, N, n_batch)  | grad_t06L0 = ggml_rope_back(grad_t07L0)
-        t07L0 = ggml_rope_inplace (t06L0)                                     | grad_t07L0 = ggml_permute_back(grad_t13L0, 0, 2, 1, 3) = ggml_permute(grad_t13L0, 0, 2, 1, 3)
-        t08L0 = ggml_mul_mat      (L0_wk, t04L0)                              | grad_t08L0 = ggml_reshape(grad_t09L0, t08L0_shape)
-        t09L0 = ggml_reshape_4d   (t08L0, n_embd/n_head, n_head, N, n_batch)  | grad_t09L0 = ggml_rope_back(grad_t10L0)
-        t10L0 = ggml_rope_inplace (t09L0)                                     | grad_t10L0 = ggml_permute_back(grad_t14L0, 0, 2, 1, 3) = ggml_permute(grad_t14L0, 0, 2, 1, 3)
-        t11L0 = ggml_mul_mat      (t04L0, L0_wv)                              | grad_t11L0 = ggml_reshape(grad_t12L0, t11L0_shape)
-        t12L0 = ggml_reshape_4d   (t11L0, N, n_batch, n_embd/n_head, n_head)  | grad_t12L0 = ggml_permute_back(grad_t15L0, 0, 3, 1, 2) = ggml_permute(grad_t15L0, 0, 2, 3, 1)
-        t13L0*= ggml_permute      (t07L0, 0, 2, 1, 3)                         | grad_t13L0 = view__q(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0))
-        t14L0*= ggml_permute      (t10L0, 0, 2, 1, 3)                         | grad_t14L0 = view__k(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0))
-        t15L0*= ggml_permute      (t12L0, 0, 3, 1, 2)                         | grad_t15L0 = view__v(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0))
-        t16L0 = ggml_flash_attn   (t13L0, t14L0, t15L0)                       | grad_t16L0 = ggml_permute_back(grad_t17L0, 0, 2, 1, 3) = ggml_permute(grad_t17L0, 0, 2, 1, 3)
-        t17L0 = ggml_permute      (t16L0, 0, 2, 1, 3)                         | grad_t17L0 = grad_t18L0
-        t18L0 = ggml_cont         (t17L0)                                     | grad_t18L0 = ggml_reshape(grad_t19L0, t18L0_shape)
-        t19L0*= ggml_reshape_2d   (t18L0, n_embd, N*n_batch)                  | grad_t19L0 = ggml_out_prod(L0_wo, ggml_transpose(grad_t20L0))
-        t20L0 = ggml_mul_mat      (L0_wo, t19L0)                              | grad_t20L0 = grad_t21L0
-        t21L0*= ggml_add          (t20L0, t01)                                | grad_t21L0 = grad_t30L0 + ggml_rms_norm_back(t21L0, grad_t22L0)
-        t22L0*= ggml_rms_norm     (t21L0)                                     | grad_t22L0 = ggml_mul(grad_t24L0, t23L0)
-        t23L0 = ggml_repeat       (L0_ffn_norm, t22L0_shape)                  | grad_t23L0 = ggml_mul(grad_t24L0, t22L0)
-        t24L0*= ggml_mul          (t23L0, t22L0)                              | grad_t24L0 = ggml_out_prod(L0_w1, ggml_transpose(grad_t26L0)) + ggml_out_prod(L0_w3, ggml_transpose(grad_t25L0))
-        t25L0*= ggml_mul_mat      (L0_w3, t24L0)                              | grad_t25L0 = ggml_mul(grad_t28L0, t27L0)
-        t26L0*= ggml_mul_mat      (L0_w1, t24L0)                              | grad_t26L0 = ggml_silu_back(t26L0, grad_t27L0)
-        t27L0*= ggml_silu         (t26L0)                                     | grad_t27L0 = ggml_mul(grad_t28L0, t25L0)
-        t28L0*= ggml_mul          (t27L0, t25L0)                              | grad_t28L0 = ggml_out_prod(L0_w2, ggml_transpose(grad_t29L0))
-        t29L0 = ggml_mul_mat      (L0_w2, t28L0)                              | grad_t29L0 = grad_t30L0
-        t30L0*= ggml_add          (t21L0, t29L0)                              | grad_t30L0 = ggml_rms_norm_back(t30L0, grad_t02L1) + grad_t21L1
-                                                                              ^
-        t02L1*= ggml_rms_norm     (t30L0)                                     | grad_t02L1 = ggml_mul(grad_t04L1, t03L1)
-        t03L1 = ggml_repeat       (L1_att_norm, t02L1_shape)                  | grad_t03L1 = ggml_mul(grad_t04L1, t02L1)
-        t04L1*= ggml_mul          (t02L1, t03L1)                              | grad_t04L1 = ggml_out_prod(L1_wv, grad_t11L1) + ggml_out_prod(L1_wk, ggml_transpose(grad_t08L1)) + ggml_out_prod(L1_wq, ggml_transpose(grad_t05L1))
-        t05L1 = ggml_mul_mat      (L1_wq, t04L1)                              | grad_t05L1 = ggml_reshape(grad_t06L1, t05L1_shape)
-        t06L1 = ggml_reshape_4d   (t05L1, n_embd/n_head, n_head, N, n_batch)  | grad_t06L1 = ggml_rope_back(grad_t07L1)
-        t07L1 = ggml_rope_inplace (t06L1)                                     | grad_t07L1 = ggml_permute_back(grad_t13L1, 0, 2, 1, 3) = ggml_permute(grad_t13L1, 0, 2, 1, 3)
-        t08L1 = ggml_mul_mat      (L1_wk, t04L1)                              | grad_t08L1 = ggml_reshape(grad_t09L1, t08L1_shape)
-        t09L1 = ggml_reshape_4d   (t08L1, n_embd/n_head, n_head, N, n_batch)  | grad_t09L1 = ggml_rope_back(grad_t10L1)
-        t10L1 = ggml_rope_inplace (t09L1)                                     | grad_t10L1 = ggml_permute_back(grad_t14L1, 0, 2, 1, 3) = ggml_permute(grad_t14L1, 0, 2, 1, 3)
-        t11L1 = ggml_mul_mat      (t04L1, L1_wv)                              | grad_t11L1 = ggml_reshape(grad_t12L1, t11L1_shape)
-        t12L1 = ggml_reshape_4d   (t11L1, N, n_batch, n_embd/n_head, n_head)  | grad_t12L1 = ggml_permute_back(grad_t15L1, 0, 3, 1, 2) = ggml_permute(grad_t15L1, 0, 2, 3, 1)
-        t13L1*= ggml_permute      (t07L1, 0, 2, 1, 3)                         | grad_t13L1 = view__q(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1))
-        t14L1*= ggml_permute      (t10L1, 0, 2, 1, 3)                         | grad_t14L1 = view__k(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1))
-        t15L1*= ggml_permute      (t12L1, 0, 3, 1, 2)                         | grad_t15L1 = view__v(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1))
-        t16L1 = ggml_flash_attn   (t13L1, t14L1, t15L1)                       | grad_t16L1 = ggml_permute_back(grad_t17L1, 0, 2, 1, 3) = ggml_permute(grad_t17L1, 0, 2, 1, 3)
-        t17L1 = ggml_permute      (t16L1, 0, 2, 1, 3)                         | grad_t17L1 = grad_t18L1
-        t18L1 = ggml_cont         (t17L1)                                     | grad_t18L1 = ggml_reshape(grad_t19L1, t18L1_shape)
-        t19L1*= ggml_reshape_2d   (t18L1, n_embd, N*n_batch)                  | grad_t19L1 = ggml_out_prod(L1_wo, ggml_transpose(grad_t20L1))
-        t20L1 = ggml_mul_mat      (L1_wo, t19L1)                              | grad_t20L1 = grad_t21L1
-        t21L1*= ggml_add          (t20L1, t30L0)                              | grad_t21L1 = grad_t30L1 + ggml_rms_norm_back(t21L1, grad_t22L1)
-        t22L1*= ggml_rms_norm     (t21L1)                                     | grad_t22L1 = ggml_mul(grad_t24L1, t23L1)
-        t23L1 = ggml_repeat       (L1_ffn_norm, t22L1_shape)                  | grad_t23L1 = ggml_mul(grad_t24L1, t22L1)
-        t24L1*= ggml_mul          (t23L1, t22L1)                              | grad_t24L1 = ggml_out_prod(L1_w1, ggml_transpose(grad_t26L1)) + ggml_out_prod(L1_w3, ggml_transpose(grad_t25L1))
-        t25L1*= ggml_mul_mat      (L1_w3, t24L1)                              | grad_t25L1 = ggml_mul(grad_t28L1, t27L1)
-        t26L1*= ggml_mul_mat      (L1_w1, t24L1)                              | grad_t26L1 = ggml_silu_back(t26L1, grad_t27L1)
-        t27L1*= ggml_silu         (t26L1)                                     | grad_t27L1 = ggml_mul(grad_t28L1, t25L1)
-        t28L1*= ggml_mul          (t27L1, t25L1)                              | grad_t28L1 = ggml_out_prod(L1_w2, ggml_transpose(grad_t29L1))
-        t29L1 = ggml_mul_mat      (L1_w2, t28L1)                              | grad_t29L1 = grad_t30L1
-        t30L1*= ggml_add          (t21L1, t29L1)                              | grad_t30L1 = ggml_rms_norm_back(t30L1, grad_t31)
-                                                                              ^
-        t31   = ggml_rms_norm     (t30L1)                                     | grad_t31   = ggml_mul(grad_t33, t32)
-        t32   = ggml_repeat       (norm, t31.shape)                           | grad_t32   = ggml_mul(grad_t33, t31)
-        t33   = ggml_mul          (t32, t31)                                  | grad_t33   = ggml_out_prod(output, ggml_transpose(grad_t34))
-        t34   = ggml_mul_mat      (output, t33)                               | grad_t34   = ggml_reshape(grad_t35, t34.shape)
-        t35   = ggml_reshape_3d   (t34, n_vocab, N, n_batch)                  | grad_t35   = ggml_cross_entropy_loss_back(t35, targets, grad_t36)
-        t36   = ggml_cross_entropy_loss(t35, targets)                         | grad_t36   = 1 (optimizer)
-        tensors marked with * need to be stored until grad computation
-        tensors during grad computation are all temporary
-        */
-    }
-
-    *gb = *gf;
-
-    // t36->grad gets set to one by optimizer, so we need the tensor.
-    // initialize it with 1.0f to make sure.
-    use_buf(-1);
-    t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f));
-
-    use_buf(0);
-    t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad));              assert_shape_3d(t35->grad, n_vocab, N, n_batch);
-    t34->grad = expand(gb, ggml_reshape_2d (ctx0, t35->grad, n_vocab, N*n_batch));                    assert_shape_2d(t34->grad, n_vocab, N*n_batch);
-    t33->grad = expand(gb, ggml_out_prod   (ctx0, model->output, ggml_transpose(ctx0, t34->grad)));   assert_shape_2d(t33->grad, n_embd, N*n_batch);
-    t32->grad = expand(gb, ggml_mul        (ctx0, t33->grad, t31));                                   assert_shape_2d(t32->grad, n_embd, N*n_batch);
-
-    use_buf(-1);
-
-    model->norm->grad   = expand(gb, add_or_set(model->norm->grad,   ggml_repeat_back(ctx0, t32->grad, model->norm))); assert_shape_1d(model->norm->grad, n_embd);
-    model->output->grad = expand(gb, add_or_set(model->output->grad, ggml_out_prod(ctx0, t33, t34->grad)));            assert_shape_2d(model->output->grad, n_embd, n_vocab);
-
-    clr_buf(1);
-    use_buf(1);
-    t31->grad = expand(gb, ggml_mul(ctx0, t33->grad, t32));  assert_shape_2d(t31->grad, n_embd, N*n_batch);
-
-    struct ggml_tensor * back_layer_inp = t31;
-    struct ggml_tensor * grad_layer_inp = NULL;
-
-    for (int k = 0; k < n_layer; ++k) {
-        int il = n_layer-1-k;
-        struct my_llama_layer & layer = model->layers[il];
-
-        struct ggml_tensor * t02 = t02L[il];
-        struct ggml_tensor * t03 = t03L[il];
-        struct ggml_tensor * t04 = t04L[il];
-        struct ggml_tensor * t05 = t05L[il];
-        struct ggml_tensor * t06 = t06L[il];
-        struct ggml_tensor * t07 = t07L[il];
-        struct ggml_tensor * t08 = t08L[il];
-        struct ggml_tensor * t09 = t09L[il];
-        struct ggml_tensor * t10 = t10L[il];
-        struct ggml_tensor * t11 = t11L[il];
-        struct ggml_tensor * t12 = t12L[il];
-        struct ggml_tensor * t13 = t13L[il];
-        struct ggml_tensor * t14 = t14L[il];
-        struct ggml_tensor * t15 = t15L[il];
-        struct ggml_tensor * t16 = t16L[il];
-        struct ggml_tensor * t17 = t17L[il];
-        struct ggml_tensor * t18 = t18L[il];
-        struct ggml_tensor * t19 = t19L[il];
-        struct ggml_tensor * t20 = t20L[il];
-        struct ggml_tensor * t21 = t21L[il];
-        struct ggml_tensor * t22 = t22L[il];
-        struct ggml_tensor * t23 = t23L[il];
-        struct ggml_tensor * t24 = t24L[il];
-        struct ggml_tensor * t25 = t25L[il];
-        struct ggml_tensor * t26 = t26L[il];
-        struct ggml_tensor * t27 = t27L[il];
-        struct ggml_tensor * t28 = t28L[il];
-        struct ggml_tensor * t29 = t29L[il];
-        struct ggml_tensor * t30 = t30L[il];
-
-        clr_buf(0);
-        use_buf(0);
-        t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad, rms_norm_eps)); assert_shape_2d(t30->grad, n_embd, N*n_batch);
-        if (grad_layer_inp) {
-            t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch);
-        }
-        clr_buf(1);
-        t29->grad = t30->grad;                                                                                        assert_shape_2d(t29->grad, n_embd, N*n_batch);
-        t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad)));                       assert_shape_2d(t28->grad, n_ff, N*n_batch);
-        t27->grad = expand(gb, ggml_mul(ctx0, t28->grad, t25));                                                       assert_shape_2d(t27->grad, n_ff, N*n_batch);
-        t26->grad = expand(gb, ggml_silu_back(ctx0, t26, t27->grad));                                                 assert_shape_2d(t26->grad, n_ff, N*n_batch);
-        t25->grad = expand(gb, ggml_mul(ctx0, t28->grad, t27));                                                       assert_shape_2d(t25->grad, n_ff, N*n_batch);
-        t24->grad = expand(gb, ggml_add_inplace(ctx0,
-                        ggml_out_prod(ctx0, layer.w1, ggml_transpose(ctx0, t26->grad)),
-                        ggml_out_prod(ctx0, layer.w3, ggml_transpose(ctx0, t25->grad))));                             assert_shape_2d(t24->grad, n_embd, N*n_batch);
-        t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22));                                                       assert_shape_2d(t23->grad, n_embd, N*n_batch);
-        t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad)));              assert_shape_2d(t22->grad, n_embd, N*n_batch);
-        use_buf(1);
-        t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad, rms_norm_eps)));    assert_shape_2d(t21->grad, n_embd, N*n_batch);
-        grad_layer_inp = t21;
-        use_buf(0);
-        t20->grad = t21->grad;                                                                                        assert_shape_2d(t20->grad, n_embd, N*n_batch);
-        t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad)));                       assert_shape_2d(t19->grad, n_embd, N*n_batch);
-        t18->grad = expand(gb, ggml_reshape_4d(ctx0, t19->grad, n_embd/n_head, n_head, N, n_batch));                  assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch);
-        t17->grad = t18->grad;                                                                                        assert_shape_4d(t17->grad, n_embd/n_head, n_head, N, n_batch);
-        t16->grad = expand(gb, ggml_permute(ctx0, t17->grad, 0, 2, 1, 3));                                            assert_shape_4d(t16->grad, n_embd/n_head, N, n_head, n_batch);
-        struct ggml_tensor * flash_attn = expand(gb, ggml_flash_attn_back(ctx0, t13, t14, t15, t16->grad, true));     assert_shape_4d(flash_attn, n_embd/n_head, N*3, n_head, n_batch);
-        t15->grad = expand(gb, view__v(flash_attn));                                                                  assert_shape_4d(t15->grad, N, n_embd/n_head, n_head, n_batch);
-        t14->grad = expand(gb, view__k(flash_attn));                                                                  assert_shape_4d(t14->grad, n_embd/n_head, N, n_head, n_batch);
-        t13->grad = expand(gb, view__q(flash_attn));                                                                  assert_shape_4d(t13->grad, n_embd/n_head, N, n_head, n_batch);
-        t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1));                                            assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head);
-        t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd));                 assert_shape_2d(t11->grad, N*n_batch, n_embd);
-        t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3));                                            assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch);
-        t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx));                     assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch);
-        t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch));                                  assert_shape_2d(t08->grad, n_embd, N*n_batch);
-        t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3));                                            assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch);
-        t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx));                     assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch);
-        t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch));                                  assert_shape_2d(t05->grad, n_embd, N*n_batch);
-        t04->grad = expand(gb, ggml_add_inplace(ctx0,
-                        ggml_add_inplace(ctx0,
-                            ggml_out_prod(ctx0, layer.wv, t11->grad),
-                            ggml_out_prod(ctx0, layer.wk, ggml_transpose(ctx0, t08->grad))),
-                        ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad))));                             assert_shape_2d(t04->grad, n_embd, N*n_batch);
-        t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02));                                                       assert_shape_2d(t04->grad, n_embd, N*n_batch);
-        use_buf(1);
-        t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, ggml_repeat(ctx0, layer.attention_norm, t02)));              assert_shape_2d(t02->grad, n_embd, N*n_batch);
-        back_layer_inp = t02;
-        // use_buf(0);
-
-        use_buf(-1);
-        layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, layer.attention_norm)));   assert_shape_1d(layer.attention_norm->grad, n_embd);
-        layer.wq->grad             = expand(gb, add_or_set(layer.wq->grad,             ggml_out_prod(ctx0, t04, t05->grad)));                       assert_shape_2d(layer.wq->grad,             n_embd, n_embd);
-        layer.wk->grad             = expand(gb, add_or_set(layer.wk->grad,             ggml_out_prod(ctx0, t04, t08->grad)));                       assert_shape_2d(layer.wk->grad,             n_embd, n_embd);
-        layer.wv->grad             = expand(gb, add_or_set(layer.wv->grad,             ggml_out_prod(ctx0, t04, ggml_transpose(ctx0, t11->grad)))); assert_shape_2d(layer.wv->grad,             n_embd, n_embd);
-        layer.wo->grad             = expand(gb, add_or_set(layer.wo->grad,             ggml_out_prod(ctx0, t19, t20->grad)));                       assert_shape_2d(layer.wo->grad,             n_embd, n_embd);
-        layer.ffn_norm->grad       = expand(gb, add_or_set(layer.ffn_norm->grad,       ggml_repeat_back(ctx0, t23->grad, layer.ffn_norm)));         assert_shape_1d(layer.ffn_norm->grad,       n_embd);
-        layer.w1->grad             = expand(gb, add_or_set(layer.w1->grad,             ggml_out_prod(ctx0, t24, t26->grad)));                       assert_shape_2d(layer.w1->grad,             n_embd, n_ff);
-        layer.w2->grad             = expand(gb, add_or_set(layer.w2->grad,             ggml_out_prod(ctx0, t28, t29->grad)));                       assert_shape_2d(layer.w2->grad,             n_ff, n_embd);
-        layer.w3->grad             = expand(gb, add_or_set(layer.w3->grad,             ggml_out_prod(ctx0, t24, t25->grad)));                       assert_shape_2d(layer.w3->grad,             n_embd, n_ff);
-        // use_buf(0);
-    }
-    clr_buf(0);
-    use_buf(0);
-    t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad, rms_norm_eps)));        assert_shape_2d(t01->grad, n_embd, N*n_batch);
-    use_buf(-1);
-    model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings));                                      assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab);
-    // clr_buf(1);
-    // clr_buf(0);
-
-    *logits = t35;
-
-    clr_buf(0);
-    clr_buf(1);
-
-    if (track_max_mem) {
-        printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]);
-        printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]);
-    }
-
-    // now that all grads are created, set the graph leafs and grads
-    graph_set_leafs_grads(gf);
-    graph_set_leafs_grads(gb);
-
-    return t36;
-}
-
-struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing(
-        struct my_llama_model * model,
-        struct ggml_context   * ctx0,
-        struct ggml_cgraph    * gf,
-        struct ggml_cgraph    * gb,
-        struct ggml_tensor  * * logits,
-        struct ggml_tensor    * tokens_input,
-        struct ggml_tensor    * targets,
-        void                  * compute_buf_0,
-        void                  * compute_buf_1,
-        void                  * compute_buf_2,
-        size_t                  size_buf_0,
-        size_t                  size_buf_1,
-        size_t                  size_buf_2,
-        const  int              n_tokens,
-        const  int              n_batch) {
-
-    // implements gradient-checkpointing as explained in readme of https://github.com/cybertronai/gradient-checkpointing
-
-    ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-
-    const int n_past = 0;
-    const int N = n_tokens;
-
-    gf->n_nodes = 0;
-    gf->n_leafs = 0;
-    gf->perf_runs = 0;
-    gf->perf_cycles = 0;
-    gf->perf_time_us = 0;
-
-    const auto & hparams = model->hparams;
-    const int n_ctx      = hparams.n_ctx;
-    const int n_vocab    = hparams.n_vocab;
-    const int n_embd     = hparams.n_embd;
-    const int n_layer    = hparams.n_layer;
-    const int n_head     = hparams.n_head;
-    const int n_rot      = hparams.n_rot;
-    const int n_ff       = get_n_ff(&hparams);
-    const int rope_mode  = 0;
-
-    bool track_max_mem = true;
-
-    int last_buf = -1;
-    size_t buf_offs[3] = { 0, 0, 0 };
-    size_t buf_size[3] = { size_buf_0,
-                           size_buf_1,
-                           size_buf_2 };
-    void * buf_data[3] = { compute_buf_0,
-                           compute_buf_1,
-                           compute_buf_2 };
-    size_t buf_maxs[3] = { 0, 0, 0 };
-
-    auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs] (int buf) {
-        size_t last_offs = 0;
-        last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-        if (last_buf >= 0) {
-            buf_offs[last_buf] = last_offs;
-            buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]);
-        }
-        if (buf >= 0) {
-            size_t offs = buf_offs[buf];
-            size_t size = buf_size[buf];
-            void * data = buf_data[buf];
-            ggml_set_scratch(ctx0, { offs, size, data, });
-        }
-        last_buf = buf;
-    };
-
-
-    auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] (int buf) {
-        if (buf < 0) return;
-        if (track_max_mem) {
-            size_t last_offs = 0;
-            last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-            if (last_buf >= 0) {
-                buf_offs[last_buf] = last_offs;
-                buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]);
-            }
-        }
-        buf_offs[buf] = 0;
-        if (track_max_mem && last_buf >= 0) {
-            size_t offs = buf_offs[last_buf];
-            size_t size = buf_size[last_buf];
-            void * data = buf_data[last_buf];
-            ggml_set_scratch(ctx0, { offs, size, data, });
-        }
-    };
-
-
-    auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * {
-        int64_t ne0 = n_embd/n_head;
-        int64_t ne1 = N;
-        int64_t ne2 = n_head;
-        int64_t ne3 = n_batch;
-        size_t  nb0 = ggml_element_size(t);
-        size_t  nb1 = nb0*ne0;
-        size_t  nb2 = nb1*ne1;
-        size_t  nb3 = nb2*ne2;
-        size_t offset = 0;
-        return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset);
-    };
-
-    auto view__k = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * {
-        int64_t ne0 = n_embd/n_head;
-        int64_t ne1 = N;
-        int64_t ne2 = n_head;
-        int64_t ne3 = n_batch;
-        size_t  nb0 = ggml_element_size(t);
-        size_t  nb1 = nb0*ne0;
-        size_t  nb2 = nb1*ne1;
-        size_t  nb3 = nb2*ne2;
-        size_t offset = nb3*ne3;
-        return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset);
-    };
-
-    auto view__v = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * {
-        int64_t ne0 = N;
-        int64_t ne1 = n_embd/n_head;
-        int64_t ne2 = n_head;
-        int64_t ne3 = n_batch;
-        size_t  nb0 = ggml_element_size(t);
-        size_t  nb1 = nb0*ne0;
-        size_t  nb2 = nb1*ne1;
-        size_t  nb3 = nb2*ne2;
-        size_t offset = 2*nb3*ne3;
-        return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset);
-    };
-
-    auto add_or_set = [ctx0] (struct ggml_tensor * a, struct ggml_tensor * b) -> struct ggml_tensor * {
-        if (a == NULL) {
-            return b;
-        } else {
-            return ggml_add_inplace(ctx0, a, b);
-        }
-    };
-
-    use_buf(-1);
-
-    model->tok_embeddings->grad    = NULL;
-    model->norm->grad              = NULL;
-    model->output->grad            = NULL;
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct my_llama_layer & layer = model->layers[il];
-        layer.attention_norm->grad = NULL;
-        layer.wq->grad             = NULL;
-        layer.wk->grad             = NULL;
-        layer.wv->grad             = NULL;
-        layer.wo->grad             = NULL;
-        layer.ffn_norm->grad       = NULL;
-        layer.w1->grad             = NULL;
-        layer.w2->grad             = NULL;
-        layer.w3->grad             = NULL;
-    }
-
-    clr_buf(0);
-    clr_buf(1);
-    clr_buf(2);
-
-    use_buf(-1);
-
-    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
-    struct ggml_tensor * t00 = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); assert_shape_1d(t00, N*n_batch);
-
-    use_buf(-1);
-
-    struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch);
-
-
-    {
-        // given: n, u, v
-        // objective: minimize(a*u+b*v) where a*b=n, a>0, b>0
-        // b=n/a
-        // minimize(a*u+v*n/a)
-        // diff(a*u+v*n/a, a) = u - (v*n/a)/a
-        // diff(a*u+v*n/a, a) == 0
-        // u - (v*n/a)/a == 0
-        // u == v*n/(a*a)
-        // u*a*a = v*n
-        // a*a = v*n/u
-        // a = sqrt(n*v/u)
-    }
-
-    float memcost_checkpoint   = n_embd;           // (..)*N*n_batch
-    float memcost_snd_fwd_pass = 14*n_embd+4*n_ff; // (..)*N*n_batch
-
-    int n_checkstep = (int)(sqrtf(n_layer*memcost_checkpoint/memcost_snd_fwd_pass) + 0.5f);
-    if (n_checkstep < 1) {
-        n_checkstep = 1;
-    }
-    std::vector<int> checkpoints;
-    for (int chk = n_checkstep-1; chk+1 < n_layer; chk += n_checkstep) {
-        checkpoints.push_back(chk);
-    }
-    int n_check = checkpoints.size();
-    // printf("%s: n_check = %d n_checkstep = %d\n", __func__, n_check, n_checkstep);
-
-    // for (int i = 0; i < n_check; ++i) {
-    //     printf("%s: checkpoint #%d = %d\n", __func__, i, checkpoints[i]);
-    // }
-
-    // example for 16 layers and memcost_checkpoint=memcost_snd_fwd_pass:
-    // inp  ~    implicit zeroth checkpoint == input
-    // L00 f 4b  [
-    // L01 f 4b    4th second forward pass
-    // L02 f 4b
-    // L03 fc4b  ] first checkpoint
-    // L04 f 3b  [
-    // L05 f 3b   3rd second forward pass
-    // L06 f 3b
-    // L07 fc3b  ] second checkpoint
-    // L08 f 2b  [
-    // L09 f 2b   2nd second forward pass
-    // L10 f 2b
-    // L11 fc2b  ] third checkpoint
-    // L12 f 1b  [
-    // L13 f 1b   1st second forward pass
-    // L14 f 1b
-    // L15 f 1b  ]
-
-    // need to remember these for the backward pass
-    std::vector<struct ggml_tensor *> t02L; t02L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t03L; t03L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t04L; t04L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t05L; t05L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t06L; t06L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t07L; t07L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t08L; t08L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t09L; t09L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t10L; t10L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t11L; t11L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t12L; t12L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t13L; t13L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t14L; t14L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t15L; t15L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t16L; t16L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t17L; t17L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t18L; t18L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t19L; t19L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t20L; t20L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t21L; t21L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t22L; t22L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t23L; t23L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t24L; t24L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t25L; t25L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t26L; t26L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t27L; t27L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t28L; t28L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t29L; t29L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t30L; t30L.resize(n_layer, NULL);
-
-    struct ggml_tensor * cur = t01;
-
-    int chk_idx = 0;
-    for (int il = 0; il < n_layer; ++il) {
-        struct my_llama_layer & layer = model->layers[il];
-        // tensors with values necessary for backward pass are in persistent buf(-1)
-        // other tensors with buf(0), buf(1), etc are only temporary needed, and their memory reused
-        bool is_checkpoint = (chk_idx < n_check && il == checkpoints[chk_idx]);
-        if (is_checkpoint) {
-            // printf("%s: layer %d is_checkpoint\n", __func__, il);
-            chk_idx += 1;
-        }
-        const int prs = 0; // in first forward pass even persistent tensors are only temporary
-        const int tmp = 0; // temporary
-        // nxt is required to compute next layer.
-        // for checkpoints we need to remember this for usage in backward pass,
-        // otherwise temporary until next of this kind
-        const int nxt = is_checkpoint ? -1 : 1;
-        clr_buf(0);
-        use_buf(prs); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm     (ctx0, cur, rms_norm_eps));                      assert_shape_2d(t02, n_embd, N*n_batch);
-        use_buf(tmp); struct ggml_tensor * t03 = expand(gf, ggml_repeat       (ctx0, layer.attention_norm, t02));              assert_shape_2d(t03, n_embd, N*n_batch);
-        use_buf(prs); struct ggml_tensor * t04 = expand(gf, ggml_mul          (ctx0, t02, t03));                               assert_shape_2d(t04, n_embd, N*n_batch);
-        use_buf(prs); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat      (ctx0, layer.wq, t04));                          assert_shape_2d(t05, n_embd, N*n_batch);
-        use_buf(prs); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d   (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
-        use_buf(prs); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx));   assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
-        use_buf(prs); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat      (ctx0, layer.wk, t04));                          assert_shape_2d(t08, n_embd, N*n_batch);
-        use_buf(prs); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d   (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
-        use_buf(prs); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx));   assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
-        use_buf(prs); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat      (ctx0, t04, layer.wv));                          assert_shape_2d(t11, N*n_batch, n_embd);
-        use_buf(prs); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d   (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head);
-        use_buf(prs); struct ggml_tensor * t13 = expand(gf, ggml_permute      (ctx0, t07, 0, 2, 1, 3));                        assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch);
-        use_buf(prs); struct ggml_tensor * t14 = expand(gf, ggml_permute      (ctx0, t10, 0, 2, 1, 3));                        assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch);
-        use_buf(prs); struct ggml_tensor * t15 = expand(gf, ggml_permute      (ctx0, t12, 0, 3, 1, 2));                        assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch);
-        use_buf(prs); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn   (ctx0, t13, t14, t15, true));                    assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
-        use_buf(tmp); struct ggml_tensor * t17 = expand(gf, ggml_permute      (ctx0, t16, 0, 2, 1, 3));                        assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch);
-        use_buf(prs); struct ggml_tensor * t18 = expand(gf, ggml_cont         (ctx0, t17));                                    assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch);
-        use_buf(prs); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d   (ctx0, t18, n_embd, N*n_batch));                 assert_shape_2d(t19, n_embd, N*n_batch);
-        use_buf(tmp); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat      (ctx0, layer.wo, t19));                          assert_shape_2d(t20, n_embd, N*n_batch);
-        use_buf(prs); struct ggml_tensor * t21 = expand(gf, ggml_add          (ctx0, t20, cur));                               assert_shape_2d(t21, n_embd, N*n_batch);
-        use_buf(prs); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm     (ctx0, t21, rms_norm_eps));                      assert_shape_2d(t22, n_embd, N*n_batch);
-        use_buf(tmp); struct ggml_tensor * t23 = expand(gf, ggml_repeat       (ctx0, layer.ffn_norm, t22));                    assert_shape_2d(t23, n_embd, N*n_batch);
-        use_buf(prs); struct ggml_tensor * t24 = expand(gf, ggml_mul          (ctx0, t23, t22));                               assert_shape_2d(t24, n_embd, N*n_batch);
-        use_buf(prs); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat      (ctx0, layer.w3, t24));                          assert_shape_2d(t25, n_ff, N*n_batch);
-        use_buf(prs); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat      (ctx0, layer.w1, t24));                          assert_shape_2d(t26, n_ff, N*n_batch);
-        use_buf(prs); struct ggml_tensor * t27 = expand(gf, ggml_silu         (ctx0, t26));                                    assert_shape_2d(t27, n_ff, N*n_batch);
-        use_buf(prs); struct ggml_tensor * t28 = expand(gf, ggml_mul          (ctx0, t27, t25));                               assert_shape_2d(t28, n_ff, N*n_batch);
-        use_buf(tmp); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat      (ctx0, layer.w2, t28));                          assert_shape_2d(t29, n_embd, N*n_batch);
-        clr_buf( 1);
-        use_buf(nxt); struct ggml_tensor * t30 = expand(gf, ggml_add          (ctx0, t21, t29));                               assert_shape_2d(t30, n_embd, N*n_batch);
-
-        // only t30L is remembered for checkpointing in first forward pass
-        if (is_checkpoint) {
-            t30L[il] = t30;
-        }
-        cur = t30;
-    }
-    clr_buf(0);
-    use_buf(0);
-    struct ggml_tensor * t31   = expand(gf, ggml_rms_norm  (ctx0, cur, rms_norm_eps));         assert_shape_2d(t31, n_embd, N*n_batch);
-    struct ggml_tensor * t32   = expand(gf, ggml_repeat    (ctx0, model->norm, t31));          assert_shape_2d(t32, n_embd, N*n_batch);
-    struct ggml_tensor * t33   = expand(gf, ggml_mul       (ctx0, t32, t31));                  assert_shape_2d(t33, n_embd, N*n_batch);
-    use_buf(-1);
-    struct ggml_tensor * t34   = expand(gf, ggml_mul_mat   (ctx0, model->output, t33));        assert_shape_2d(t34, n_vocab, N*n_batch);
-    struct ggml_tensor * t35   = expand(gf, ggml_reshape_3d(ctx0, t34, n_vocab, N, n_batch));  assert_shape_3d(t35, n_vocab, N, n_batch);
-    struct ggml_tensor * t36   = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets));      assert_shape_1d(t36, 1);
-
-    *gb = *gf;
-
-    // t36->grad gets set to one by optimizer, so we need the tensor.
-    // initialize it with 1.0f to make sure.
-    use_buf(-1);
-    t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f));
-
-    use_buf(0);
-    t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad));              assert_shape_3d(t35->grad, n_vocab, N, n_batch);
-    t34->grad = expand(gb, ggml_reshape_2d (ctx0, t35->grad, n_vocab, N*n_batch));                    assert_shape_2d(t34->grad, n_vocab, N*n_batch);
-    t33->grad = expand(gb, ggml_out_prod   (ctx0, model->output, ggml_transpose(ctx0, t34->grad)));   assert_shape_2d(t33->grad, n_embd, N*n_batch);
-    t32->grad = expand(gb, ggml_mul        (ctx0, t33->grad, t31));                                   assert_shape_2d(t32->grad, n_embd, N*n_batch);
-
-    use_buf(-1);
-
-    model->norm->grad   = expand(gb, add_or_set(model->norm->grad,   ggml_repeat_back(ctx0, t32->grad, model->norm))); assert_shape_1d(model->norm->grad, n_embd);
-    model->output->grad = expand(gb, add_or_set(model->output->grad, ggml_out_prod(ctx0, t33, t34->grad)));            assert_shape_2d(model->output->grad, n_embd, n_vocab);
-
-    clr_buf(1);
-    use_buf(1);
-    t31->grad = expand(gb, ggml_mul(ctx0, t33->grad, t32));  assert_shape_2d(t31->grad, n_embd, N*n_batch);
-
-    struct ggml_tensor * back_layer_inp = t31;
-    struct ggml_tensor * grad_layer_inp = NULL;
-
-    // printf("%s: n_check = %u\n", __func__, n_check);
-    chk_idx = n_check-1;
-    int avail_begin = n_layer;
-    int avail_end = n_layer;
-    // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end);
-    for (int k = 0; k < n_layer; ++k) {
-        // second forward pass for checkpointing
-        int il = n_layer-1-k;
-        if (il < avail_begin) {
-            // make sure, that txxL[il] is available
-            // forward pass from last checkpoint
-            GGML_ASSERT(chk_idx >= -1);
-            int begin = (chk_idx == -1)
-                        ? 0
-                        : checkpoints[chk_idx] + 1; // checkpoint[chk_idx] contains t30 for computing following layers -> +1
-            int end   = (chk_idx+1 < n_check)
-                        ? (checkpoints[chk_idx+1] + 1)
-                        : n_layer;
-            GGML_ASSERT(begin <= il);
-            GGML_ASSERT(il < end);
-            cur = (chk_idx == -1) ? t01 : t30L[checkpoints[chk_idx]];
-            clr_buf(2);
-            // printf("%s: second forward pass chk_idx=%d begin=%d end=%d\n", __func__, chk_idx, begin, end);
-            for (int i = begin; i < end; ++i) {
-                struct my_llama_layer & layer = model->layers[i];
-                const int prs = 2; // persistent until next checkpoint
-                const int tmp = 0; // temporary for this layer
-                const bool is_checkpoint = (i == end-1);
-                clr_buf(0);
-                use_buf(prs); struct ggml_tensor * t02 = expand(gb, ggml_rms_norm     (ctx0, cur, rms_norm_eps));                      assert_shape_2d(t02, n_embd, N*n_batch);
-                use_buf(tmp); struct ggml_tensor * t03 = expand(gb, ggml_repeat       (ctx0, layer.attention_norm, t02));              assert_shape_2d(t03, n_embd, N*n_batch);
-                use_buf(prs); struct ggml_tensor * t04 = expand(gb, ggml_mul          (ctx0, t02, t03));                               assert_shape_2d(t04, n_embd, N*n_batch);
-                use_buf(prs); struct ggml_tensor * t05 = expand(gb, ggml_mul_mat      (ctx0, layer.wq, t04));                          assert_shape_2d(t05, n_embd, N*n_batch);
-                use_buf(prs); struct ggml_tensor * t06 = expand(gb, ggml_reshape_4d   (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
-                use_buf(prs); struct ggml_tensor * t07 = expand(gb, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx));   assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
-                use_buf(prs); struct ggml_tensor * t08 = expand(gb, ggml_mul_mat      (ctx0, layer.wk, t04));                          assert_shape_2d(t08, n_embd, N*n_batch);
-                use_buf(prs); struct ggml_tensor * t09 = expand(gb, ggml_reshape_4d   (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
-                use_buf(prs); struct ggml_tensor * t10 = expand(gb, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx));   assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
-                use_buf(prs); struct ggml_tensor * t11 = expand(gb, ggml_mul_mat      (ctx0, t04, layer.wv));                          assert_shape_2d(t11, N*n_batch, n_embd);
-                use_buf(prs); struct ggml_tensor * t12 = expand(gb, ggml_reshape_4d   (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head);
-                use_buf(prs); struct ggml_tensor * t13 = expand(gb, ggml_permute      (ctx0, t07, 0, 2, 1, 3));                        assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch);
-                use_buf(prs); struct ggml_tensor * t14 = expand(gb, ggml_permute      (ctx0, t10, 0, 2, 1, 3));                        assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch);
-                use_buf(prs); struct ggml_tensor * t15 = expand(gb, ggml_permute      (ctx0, t12, 0, 3, 1, 2));                        assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch);
-                use_buf(prs); struct ggml_tensor * t16 = expand(gb, ggml_flash_attn   (ctx0, t13, t14, t15, true));                    assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
-                use_buf(tmp); struct ggml_tensor * t17 = expand(gb, ggml_permute      (ctx0, t16, 0, 2, 1, 3));                        assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch);
-                use_buf(prs); struct ggml_tensor * t18 = expand(gb, ggml_cont         (ctx0, t17));                                    assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch);
-                use_buf(prs); struct ggml_tensor * t19 = expand(gb, ggml_reshape_2d   (ctx0, t18, n_embd, N*n_batch));                 assert_shape_2d(t19, n_embd, N*n_batch);
-                use_buf(tmp); struct ggml_tensor * t20 = expand(gb, ggml_mul_mat      (ctx0, layer.wo, t19));                          assert_shape_2d(t20, n_embd, N*n_batch);
-                use_buf(prs); struct ggml_tensor * t21 = expand(gb, ggml_add          (ctx0, t20, cur));                               assert_shape_2d(t21, n_embd, N*n_batch);
-                use_buf(prs); struct ggml_tensor * t22 = expand(gb, ggml_rms_norm     (ctx0, t21, rms_norm_eps));                      assert_shape_2d(t22, n_embd, N*n_batch);
-                use_buf(tmp); struct ggml_tensor * t23 = expand(gb, ggml_repeat       (ctx0, layer.ffn_norm, t22));                    assert_shape_2d(t23, n_embd, N*n_batch);
-                use_buf(prs); struct ggml_tensor * t24 = expand(gb, ggml_mul          (ctx0, t23, t22));                               assert_shape_2d(t24, n_embd, N*n_batch);
-                use_buf(prs); struct ggml_tensor * t25 = expand(gb, ggml_mul_mat      (ctx0, layer.w3, t24));                          assert_shape_2d(t25, n_ff, N*n_batch);
-                use_buf(prs); struct ggml_tensor * t26 = expand(gb, ggml_mul_mat      (ctx0, layer.w1, t24));                          assert_shape_2d(t26, n_ff, N*n_batch);
-                use_buf(prs); struct ggml_tensor * t27 = expand(gb, ggml_silu         (ctx0, t26));                                    assert_shape_2d(t27, n_ff, N*n_batch);
-                use_buf(prs); struct ggml_tensor * t28 = expand(gb, ggml_mul          (ctx0, t27, t25));                               assert_shape_2d(t28, n_ff, N*n_batch);
-                use_buf(tmp); struct ggml_tensor * t29 = expand(gb, ggml_mul_mat      (ctx0, layer.w2, t28));                          assert_shape_2d(t29, n_embd, N*n_batch);
-                if (t30L[i] == NULL) {
-                    use_buf(prs); struct ggml_tensor * t30 = expand(gb, ggml_add      (ctx0, t21, t29));                               assert_shape_2d(t30, n_embd, N*n_batch);
-                    t30L[i] = t30;
-                    cur     = t30;
-                }
-                t02L[i] = t02;
-                t03L[i] = t03;
-                t04L[i] = t04;
-                t05L[i] = t05;
-                t06L[i] = t06;
-                t07L[i] = t07;
-                t08L[i] = t08;
-                t09L[i] = t09;
-                t10L[i] = t10;
-                t11L[i] = t11;
-                t12L[i] = t12;
-                t13L[i] = t13;
-                t14L[i] = t14;
-                t15L[i] = t15;
-                t16L[i] = t16;
-                t17L[i] = t17;
-                t18L[i] = t18;
-                t19L[i] = t19;
-                t20L[i] = t20;
-                t21L[i] = t21;
-                t22L[i] = t22;
-                t23L[i] = t23;
-                t24L[i] = t24;
-                t25L[i] = t25;
-                t26L[i] = t26;
-                t27L[i] = t27;
-                t28L[i] = t28;
-                t29L[i] = t29;
-            }
-            --chk_idx;
-            avail_begin = begin;
-            avail_end   = end;
-            // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end);
-        }
-        // printf("%s: backward pass il=%d\n", __func__, il);
-
-        struct my_llama_layer & layer = model->layers[il];
-
-        struct ggml_tensor * t02 = t02L[il];
-        struct ggml_tensor * t03 = t03L[il];
-        struct ggml_tensor * t04 = t04L[il];
-        struct ggml_tensor * t05 = t05L[il];
-        struct ggml_tensor * t06 = t06L[il];
-        struct ggml_tensor * t07 = t07L[il];
-        struct ggml_tensor * t08 = t08L[il];
-        struct ggml_tensor * t09 = t09L[il];
-        struct ggml_tensor * t10 = t10L[il];
-        struct ggml_tensor * t11 = t11L[il];
-        struct ggml_tensor * t12 = t12L[il];
-        struct ggml_tensor * t13 = t13L[il];
-        struct ggml_tensor * t14 = t14L[il];
-        struct ggml_tensor * t15 = t15L[il];
-        struct ggml_tensor * t16 = t16L[il];
-        struct ggml_tensor * t17 = t17L[il];
-        struct ggml_tensor * t18 = t18L[il];
-        struct ggml_tensor * t19 = t19L[il];
-        struct ggml_tensor * t20 = t20L[il];
-        struct ggml_tensor * t21 = t21L[il];
-        struct ggml_tensor * t22 = t22L[il];
-        struct ggml_tensor * t23 = t23L[il];
-        struct ggml_tensor * t24 = t24L[il];
-        struct ggml_tensor * t25 = t25L[il];
-        struct ggml_tensor * t26 = t26L[il];
-        struct ggml_tensor * t27 = t27L[il];
-        struct ggml_tensor * t28 = t28L[il];
-        struct ggml_tensor * t29 = t29L[il];
-        struct ggml_tensor * t30 = t30L[il];
-
-        clr_buf(0);
-        use_buf(0);
-        t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad, rms_norm_eps));                    assert_shape_2d(t30->grad, n_embd, N*n_batch);
-        if (grad_layer_inp) {
-            t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad));                                  assert_shape_2d(t30->grad, n_embd, N*n_batch);
-        }
-        clr_buf(1);
-        t29->grad = t30->grad;                                                                                        assert_shape_2d(t29->grad, n_embd, N*n_batch);
-        t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad)));                       assert_shape_2d(t28->grad, n_ff, N*n_batch);
-        t27->grad = expand(gb, ggml_mul(ctx0, t28->grad, t25));                                                       assert_shape_2d(t27->grad, n_ff, N*n_batch);
-        t26->grad = expand(gb, ggml_silu_back(ctx0, t26, t27->grad));                                                 assert_shape_2d(t26->grad, n_ff, N*n_batch);
-        t25->grad = expand(gb, ggml_mul(ctx0, t28->grad, t27));                                                       assert_shape_2d(t25->grad, n_ff, N*n_batch);
-        t24->grad = expand(gb, ggml_add_inplace(ctx0,
-                        ggml_out_prod(ctx0, layer.w1, ggml_transpose(ctx0, t26->grad)),
-                        ggml_out_prod(ctx0, layer.w3, ggml_transpose(ctx0, t25->grad))));                             assert_shape_2d(t24->grad, n_embd, N*n_batch);
-        t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22));                                                       assert_shape_2d(t23->grad, n_embd, N*n_batch);
-        t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad)));              assert_shape_2d(t22->grad, n_embd, N*n_batch);
-        use_buf(1);
-        t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad, rms_norm_eps)));    assert_shape_2d(t21->grad, n_embd, N*n_batch);
-        grad_layer_inp = t21;
-        use_buf(0);
-        t20->grad = t21->grad;                                                                                        assert_shape_2d(t20->grad, n_embd, N*n_batch);
-        t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad)));                       assert_shape_2d(t19->grad, n_embd, N*n_batch);
-        t18->grad = expand(gb, ggml_reshape_4d(ctx0, t19->grad, n_embd/n_head, n_head, N, n_batch));                  assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch);
-        t17->grad = t18->grad;                                                                                        assert_shape_4d(t17->grad, n_embd/n_head, n_head, N, n_batch);
-        t16->grad = expand(gb, ggml_permute(ctx0, t17->grad, 0, 2, 1, 3));                                            assert_shape_4d(t16->grad, n_embd/n_head, N, n_head, n_batch);
-        struct ggml_tensor * flash_attn = expand(gb, ggml_flash_attn_back(ctx0, t13, t14, t15, t16->grad, true));     assert_shape_4d(flash_attn, n_embd/n_head, N*3, n_head, n_batch);
-        t15->grad = expand(gb, view__v(flash_attn));                                                                  assert_shape_4d(t15->grad, N, n_embd/n_head, n_head, n_batch);
-        t14->grad = expand(gb, view__k(flash_attn));                                                                  assert_shape_4d(t14->grad, n_embd/n_head, N, n_head, n_batch);
-        t13->grad = expand(gb, view__q(flash_attn));                                                                  assert_shape_4d(t13->grad, n_embd/n_head, N, n_head, n_batch);
-        t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1));                                            assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head);
-        t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd));                 assert_shape_2d(t11->grad, N*n_batch, n_embd);
-        t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3));                                            assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch);
-        t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx));                     assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch);
-        t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch));                                  assert_shape_2d(t08->grad, n_embd, N*n_batch);
-        t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3));                                            assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch);
-        t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx));                     assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch);
-        t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch));                                  assert_shape_2d(t05->grad, n_embd, N*n_batch);
-        t04->grad = expand(gb, ggml_add_inplace(ctx0,
-                        ggml_add_inplace(ctx0,
-                            ggml_out_prod(ctx0, layer.wv, t11->grad),
-                            ggml_out_prod(ctx0, layer.wk, ggml_transpose(ctx0, t08->grad))),
-                        ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad))));                             assert_shape_2d(t04->grad, n_embd, N*n_batch);
-        t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02));                                                       assert_shape_2d(t04->grad, n_embd, N*n_batch);
-        use_buf(1);
-        t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, ggml_repeat(ctx0, layer.attention_norm, t02)));              assert_shape_2d(t02->grad, n_embd, N*n_batch);
-        back_layer_inp = t02;
-
-        use_buf(-1);
-        layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, layer.attention_norm)));   assert_shape_1d(layer.attention_norm->grad, n_embd);
-        layer.wq->grad             = expand(gb, add_or_set(layer.wq->grad,             ggml_out_prod(ctx0, t04, t05->grad)));                       assert_shape_2d(layer.wq->grad,             n_embd, n_embd);
-        layer.wk->grad             = expand(gb, add_or_set(layer.wk->grad,             ggml_out_prod(ctx0, t04, t08->grad)));                       assert_shape_2d(layer.wk->grad,             n_embd, n_embd);
-        layer.wv->grad             = expand(gb, add_or_set(layer.wv->grad,             ggml_out_prod(ctx0, t04, ggml_transpose(ctx0, t11->grad)))); assert_shape_2d(layer.wv->grad,             n_embd, n_embd);
-        layer.wo->grad             = expand(gb, add_or_set(layer.wo->grad,             ggml_out_prod(ctx0, t19, t20->grad)));                       assert_shape_2d(layer.wo->grad,             n_embd, n_embd);
-        layer.ffn_norm->grad       = expand(gb, add_or_set(layer.ffn_norm->grad,       ggml_repeat_back(ctx0, t23->grad, layer.ffn_norm)));         assert_shape_1d(layer.ffn_norm->grad,       n_embd);
-        layer.w1->grad             = expand(gb, add_or_set(layer.w1->grad,             ggml_out_prod(ctx0, t24, t26->grad)));                       assert_shape_2d(layer.w1->grad,             n_embd, n_ff);
-        layer.w2->grad             = expand(gb, add_or_set(layer.w2->grad,             ggml_out_prod(ctx0, t28, t29->grad)));                       assert_shape_2d(layer.w2->grad,             n_ff, n_embd);
-        layer.w3->grad             = expand(gb, add_or_set(layer.w3->grad,             ggml_out_prod(ctx0, t24, t25->grad)));                       assert_shape_2d(layer.w3->grad,             n_embd, n_ff);
-    }
-    // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end);
-    GGML_ASSERT(n_check == 0 || chk_idx == -2);
-    GGML_ASSERT(avail_begin == 0);
-    clr_buf(0);
-    use_buf(0);
-    t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad, rms_norm_eps)));        assert_shape_2d(t01->grad, n_embd, N*n_batch);
-    use_buf(-1);
-    model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings));                                      assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab);
-
-    *logits = t35;
-
-    clr_buf(0);
-    clr_buf(1);
-    clr_buf(2);
-
-    if (track_max_mem) {
-        printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]);
-        printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]);
-        printf("%s: max size compute buf2: %zu\n", __func__, buf_maxs[2]);
-    }
-
-    // now that all grads are created, set the graph leafs and grads
-    graph_set_leafs_grads(gf);
-    graph_set_leafs_grads(gb);
-
-    return t36;
-}
-
 void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) {
     float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
     *ptr = value;
@@ -4485,44 +2945,14 @@ int main(int argc, char ** argv) {
         struct ggml_tensor * loss   = NULL;
         struct ggml_tensor * logits = NULL;
 
-        if (params.use_alloc || params.use_unified) {
-            loss = llama_build_train_graphs(
-                &model, alloc, ctx0,
-                gf, gb, gb_tmp,
-                &logits, tokens_input, target_probs,
-                n_tokens, n_batch,
-                params.use_flash, 
-                params.use_checkpointing
-            );
-        } else if (params.use_checkpointing) {
-            loss = forward_batch_wo_cache_flash_attn_train_grad_checkpointing(
-                    &model, ctx0,
-                    gf, gb,
-                    &logits, tokens_input, target_probs,
-                    compute_buf_0, compute_buf_1, compute_buf_2,
-                    size_buf_0, size_buf_1, size_buf_2,
-                    n_tokens, n_batch);
-        } else if (params.use_scratch) {
-            loss = forward_batch_wo_cache_flash_attn_train(
-                    &model, ctx0,
-                    gf, gb,
-                    &logits, tokens_input, target_probs,
-                    compute_buf_0, compute_buf_1,
-                    size_buf_0, size_buf_1,
-                    n_tokens, n_batch);
-        } else if (params.use_flash) {
-            logits = forward_batch_wo_cache_flash_attn(&model, ctx0, gf, tokens_input, n_tokens, n_batch);
-            loss   = cross_entropy_loss(ctx0, logits, target_probs);
-            ggml_build_forward_expand(gf, loss);
-            *gb = *gf;
-            ggml_build_backward_expand(ctx0, gf, gb, true);
-        } else {
-            logits = forward_batch_wo_cache(&model, ctx0, gf, tokens_input, n_tokens, n_batch);
-            loss   = cross_entropy_loss(ctx0, logits, target_probs);
-            ggml_build_forward_expand(gf, loss);
-            *gb = *gf;
-            ggml_build_backward_expand(ctx0, gf, gb, true);
-        }
+        loss = llama_build_train_graphs(
+            &model, alloc, ctx0,
+            gf, gb, gb_tmp,
+            &logits, tokens_input, target_probs,
+            n_tokens, n_batch,
+            params.use_flash, 
+            params.use_checkpointing
+        );
 
         size_t used_mem_before_opt = ggml_used_mem(ctx0);