llama/ggml: add LLM training support

more compact progress bar

refactor: llama_prepare_sbatch/ubatch

llama_save_model_to_file

gqa_mode arg for repeat_back

llama_opt_param_filter

ggml_graph_dup force_grads

refactor ggml_opt, fix test-opt
Johannes Gäßler 2024-11-17 14:58:51 +01:00
parent a5203b4465
commit c25557362a
26 changed files with 1294 additions and 339 deletions
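
Every hunk below applies the same two signature changes from this commit: ggml_set_param() now takes only the tensor (the ggml_context argument was dropped), and ggml_build_backward_expand() now takes one context, the backward graph, and a third argument (nullptr in the updated calls; reading the hunks, this appears to be an optional gradient-accumulator array) in place of the old second context and accumulate flag. Below is a minimal sketch of the updated call pattern, not part of the diff itself, assuming only the signatures visible in the hunks plus the standard ggml graph helpers:

    #include "ggml.h"

    // Sketch only: the ggml_set_param()/ggml_build_backward_expand() signatures are
    // assumed from the updated calls in the hunks below; everything else is standard ggml API.
    int main() {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // Mark a tensor as a trainable parameter before building the graphs.
        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_set_param(a);                            // was: ggml_set_param(ctx, a);
        ggml_set_name(a, "a");

        // A scalar "loss" so the backward graph has something to differentiate.
        struct ggml_tensor * out = ggml_sum(ctx, ggml_sqr(ctx, a));

        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true);
        struct ggml_cgraph * gb = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true);

        ggml_build_forward_expand(gf, out);
        ggml_graph_cpy(gf, gb);
        ggml_build_backward_expand(ctx, gb, nullptr); // was: ggml_build_backward_expand(ctx, ctx, gb, false);

        ggml_free(ctx);
        return 0;
    }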


@@ -810,7 +810,7 @@ struct test_case {
ggml_build_forward_expand(gf, out);
ggml_graph_cpy(gf, gb);
-ggml_build_backward_expand(ctx, ctx, gb, false);
+ggml_build_backward_expand(ctx, gb, nullptr);
if (expect.size() != 1 || expect[0] != 0.0f) {
GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
@@ -996,7 +996,7 @@ struct test_example : public test_case {
// Step 3: return the output tensor.
return out;
}
-// In order to also check the gradients for your op, add calls like ggml_set_param(ctx, a)
+// In order to also check the gradients for your op, add calls like ggml_set_param(a)
// immediately after you create the tensors.
// This is optional and only makes sense if a backward pass has actually been implemented for the new op.
};
@@ -1028,7 +1028,7 @@ struct test_unary : public test_case {
auto ne = ne_a; ne[0] *= 3;
a = ggml_new_tensor(ctx, type, 4, ne.data());
if (grad_supported) {
-ggml_set_param(ctx, a);
+ggml_set_param(a);
}
ggml_set_name(a, "a");
@@ -1037,7 +1037,7 @@ struct test_unary : public test_case {
} else {
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
if (grad_supported) {
-ggml_set_param(ctx, a);
+ggml_set_param(a);
}
ggml_set_name(a, "a");
}
@@ -1103,7 +1103,7 @@ struct test_get_rows : public test_case {
const bool grad_supported = ggml_is_matrix(in) && ggml_is_vector(rows);
if (grad_supported) {
-ggml_set_param(ctx, in);
+ggml_set_param(in);
// rows is a constant input -> no gradients
}
@@ -1292,7 +1292,7 @@ struct test_repeat : public test_case {
ggml_set_name(target, "target");
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
-ggml_set_param(ctx, src);
+ggml_set_param(src);
ggml_set_name(src, "src");
ggml_tensor * out = ggml_repeat(ctx, src, target);
@@ -1376,7 +1376,7 @@ struct test_dup : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
-ggml_set_param(ctx, src);
+ggml_set_param(src);
ggml_set_name(src, "src");
if (_use_permute) {
@@ -1412,7 +1412,7 @@ struct test_set : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
-ggml_set_param(ctx, src);
+ggml_set_param(src);
ggml_set_name(src, "src");
auto ne_dst = ne;
@@ -1420,7 +1420,7 @@ struct test_set : public test_case {
ne_dst[i] *= 2;
}
ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, ne_dst.data());
-ggml_set_param(ctx, dst);
+ggml_set_param(dst);
ggml_set_name(dst, "dst");
size_t offset = 0;
@@ -1464,7 +1464,7 @@ struct test_cpy : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
-ggml_set_param(ctx, src);
+ggml_set_param(src);
ggml_set_name(src, "src");
if (_src_use_permute) {
@@ -1497,7 +1497,7 @@ struct test_cont : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
-ggml_set_param(ctx, src);
+ggml_set_param(src);
ggml_set_name(src, "src");
src = ggml_transpose(ctx, src);
@@ -1543,8 +1543,8 @@ struct test_bin_bcast : public test_case {
// The backward pass supports broadcasting only for GGML_ADD:
const bool grad_supported = op == ggml_add || ggml_are_same_shape(a, b);
if (grad_supported) {
-ggml_set_param(ctx, a);
-ggml_set_param(ctx, b);
+ggml_set_param(a);
+ggml_set_param(b);
}
ggml_tensor * out = op(ctx, a, b);
@@ -1592,11 +1592,11 @@ struct test_add1 : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-ggml_set_param(ctx, a);
+ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * b = ggml_new_tensor_1d(ctx, type, 1);
-// ggml_set_param(ctx, b); // TODO: implement
+// ggml_set_param(b); // TODO: implement
ggml_set_name(b, "b");
ggml_tensor * out = ggml_add1(ctx, a, b);
@@ -1627,7 +1627,7 @@ struct test_scale : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-ggml_set_param(ctx, a);
+ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * out = ggml_scale(ctx, a, scale);
@@ -1713,7 +1713,7 @@ struct test_rms_norm : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-ggml_set_param(ctx, a);
+ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * out = ggml_rms_norm(ctx, a, eps);
@@ -1937,9 +1937,9 @@ struct test_mul_mat : public test_case {
b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]);
if (!ggml_is_quantized(type_a)) {
if (bs[1] == 1 && nr[1] == 1) {
-ggml_set_param(ctx, a);
+ggml_set_param(a);
}
-ggml_set_param(ctx, b);
+ggml_set_param(b);
}
ggml_set_name(a, "a");
ggml_set_name(b, "b");
@@ -1953,9 +1953,9 @@ struct test_mul_mat : public test_case {
b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
if (!ggml_is_quantized(type_a)) {
if (bs[1] == 1 && nr[1] == 1) {
-ggml_set_param(ctx, a);
+ggml_set_param(a);
}
-ggml_set_param(ctx, b);
+ggml_set_param(b);
}
ggml_set_name(a, "a");
ggml_set_name(b, "b");
@@ -2104,7 +2104,7 @@ struct test_sqr : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-ggml_set_param(ctx, a);
+ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * out = ggml_sqr(ctx, a);
@@ -2133,7 +2133,7 @@ struct test_sqrt : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-ggml_set_param(ctx, a);
+ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * out = ggml_sqrt(ctx, a);
@@ -2173,7 +2173,7 @@ struct test_log : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-ggml_set_param(ctx, a);
+ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * out = ggml_log(ctx, a);
@@ -2209,7 +2209,7 @@ struct test_sin : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-ggml_set_param(ctx, a);
+ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * out = ggml_sin(ctx, a);
@@ -2252,7 +2252,7 @@ struct test_cos : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-ggml_set_param(ctx, a);
+ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * out = ggml_cos(ctx, a);
@@ -2332,7 +2332,7 @@ struct test_diag_mask_inf : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-ggml_set_param(ctx, a);
+ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * out = ggml_diag_mask_inf(ctx, a, n_past);
@@ -2369,7 +2369,7 @@ struct test_soft_max : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-ggml_set_param(ctx, a);
+ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * mask = nullptr;
@@ -2451,7 +2451,7 @@ struct test_rope : public test_case {
auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
a = ggml_new_tensor(ctx, type, 4, ne.data());
if (forward) {
-ggml_set_param(ctx, a);
+ggml_set_param(a);
}
ggml_set_name(a, "a");
@@ -2460,7 +2460,7 @@ struct test_rope : public test_case {
} else {
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
if (forward) {
-ggml_set_param(ctx, a);
+ggml_set_param(a);
}
ggml_set_name(a, "a");
}
@@ -2572,7 +2572,7 @@ struct test_pool2d : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
-ggml_set_param(ctx, input);
+ggml_set_param(input);
ggml_set_name(input, "input");
ggml_tensor * out = ggml_pool_2d(ctx, input, pool_type, k0, k1, s0, s1, p0, p1);
@@ -2648,7 +2648,7 @@ struct test_im2col : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
-ggml_set_param(ctx, input);
+ggml_set_param(input);
ggml_set_name(input, "input");
ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
@@ -2783,7 +2783,7 @@ struct test_sum : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-ggml_set_param(ctx, a);
+ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * out = ggml_sum(ctx, a);
@@ -2812,7 +2812,7 @@ struct test_sum_rows : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-ggml_set_param(ctx, a);
+ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * out = ggml_sum_rows(ctx, a);
@@ -2837,7 +2837,7 @@ struct test_mean : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-ggml_set_param(ctx, a);
+ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * out = ggml_mean(ctx, a);
@@ -2954,11 +2954,11 @@ struct test_acc : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
-ggml_set_param(ctx, a);
+ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
-ggml_set_param(ctx, b);
+ggml_set_param(b);
ggml_set_name(b, "b");
ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]);
@@ -3190,7 +3190,7 @@ struct test_cross_entropy_loss : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data());
-ggml_set_param(ctx, logits);
+ggml_set_param(logits);
ggml_set_name(logits, "logits");
ggml_tensor * labels = ggml_new_tensor(ctx, type, 4, ne.data());
@@ -3272,7 +3272,7 @@ struct test_opt_step_adamw : public test_case {
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
-ggml_set_param(ctx, a); // Despite tensor a having gradients the output tensor will not.
+ggml_set_param(a); // Despite tensor a having gradients the output tensor will not.
ggml_set_name(a, "a");
ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);