commit bf56fdecb3
Author: slaren
Date:   2024-04-17 18:37:28 +02:00
Parent: fb168ac5f7

5 changed files with 4 additions and 106 deletions

ggml.c (1 addition, 1 deletion)

@@ -4588,7 +4588,7 @@ void ggml_mul_mat_set_prec(
in b, n_experts_used can be broadcasted to match the n_expert_used of ids

-c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e in ids
+c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
*/

struct ggml_tensor * ggml_mul_mat_id(
        struct ggml_context * ctx,

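As a reading aid, not part of the commit: a minimal sketch of the shapes the documented formula implies, assuming the four-argument ggml_mul_mat_id API that the test code later in this diff calls; the sizes are invented for illustration.

#include "ggml.h"
#include <cstdio>

int main() {
    // illustrative sizes only, not taken from the commit
    const int n_embd = 8, n_ff = 16, n_expert = 4, n_expert_used = 2, n_tokens = 3;

    struct ggml_init_params params = { /*mem_size=*/ 16*1024*1024, /*mem_buffer=*/ nullptr, /*no_alloc=*/ false };
    struct ggml_context * ctx = ggml_init(params);

    // as: one n_embd x n_ff matrix per expert; ids: one expert index per (e, t)
    struct ggml_tensor * as  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_ff, n_expert);
    struct ggml_tensor * b   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1, n_tokens); // dim 1 broadcasts to n_expert_used
    struct ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, n_tokens);

    struct ggml_tensor * c = ggml_mul_mat_id(ctx, as, b, ids);
    std::printf("c: %d x %d x %d\n", (int) c->ne[0], (int) c->ne[1], (int) c->ne[2]); // 16 x 2 x 3

    ggml_free(ctx);
    return 0;
}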
ggml.h (1 deletion)

@@ -1161,7 +1161,6 @@ extern "C" {
enum ggml_prec prec);
// indirect matrix multiplication
-// TODO: document
GGML_API struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx,
struct ggml_tensor * as,

llama.cpp (2 additions)

@@ -7365,6 +7365,7 @@ struct llm_build_context {
                n_expert, n_expert_used,
                LLM_FFN_SILU, true,
                cb, il);
+        cb(cur, "ffn_moe_out", il);

        cur = ggml_add(ctx0, cur, ffn_inp);
        cb(cur, "ffn_out", il);
@@ -8694,6 +8695,7 @@ struct llm_build_context {
                n_expert, n_expert_used,
                LLM_FFN_SILU, false,
                cb, il);
+        cb(cur, "ffn_moe_out", il);

        // FFN shared expert
        {
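The two added cb() calls tag the MoE output tensor with a debug name per layer. Below is a rough, self-contained sketch of what such a naming callback does; the tensor struct and callback type are stand-ins for illustration, not llama.cpp's actual definitions.

#include <cstdio>
#include <functional>
#include <string>

// hypothetical stand-in for ggml_tensor, just to keep the sketch self-contained
struct tensor { std::string name; };

// callback shape: tensor, base name, layer index
using build_cb = std::function<void(tensor *, const char *, int)>;

int main() {
    build_cb cb = [](tensor * t, const char * name, int il) {
        t->name = std::string(name) + "-" + std::to_string(il); // e.g. "ffn_moe_out-7"
        std::printf("named tensor: %s\n", t->name.c_str());
    };

    tensor cur;
    cb(&cur, "ffn_moe_out", 7);
    return 0;
}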

scripts/compare-commits.sh (1 addition, 13 deletions)

@@ -12,19 +12,7 @@ bench_args="${@:3}"
rm -f llama-bench.sqlite
backend="cpu"
if [[ "$OSTYPE" == "darwin"* ]]; then
backend="metal"
elif command -v nvcc &> /dev/null; then
backend="cuda"
fi
make_opts=""
#if [[ "$backend" == "cuda" ]]; then
# make_opts="LLAMA_CUDA=1"
#fi
# to test a backend, call the script with the corresponding environment variable (e.g. LLAMA_CUDA=1 ./scripts/compare-commits.sh ...)
git checkout $1
make clean && make -j32 $make_opts llama-bench
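With the automatic detection gone, the backend is chosen entirely by the caller's environment. A hypothetical invocation comparing two refs with CUDA enabled (the refs and model path are placeholders; arguments from the third onward are forwarded to llama-bench via bench_args):

LLAMA_CUDA=1 ./scripts/compare-commits.sh master my-branch -m models/7B/ggml-model-q4_0.gguf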

tests/test-backend-ops.cpp (91 deletions)

@@ -1613,7 +1613,6 @@ public:
}
};
// Llama
struct test_llama : public test_llm {
static constexpr float freq_base = 10000.0f;
@@ -1860,90 +1859,6 @@ struct test_falcon : public test_llm {
    }
};

-// Mixtral MOE
-struct test_moe : public test_case {
-    const int n_expert;
-    const int n_expert_used;
-    const int n_tokens;
-    const int n_embd;
-    const int n_ff;
-
-    std::string op_desc(ggml_tensor * t) override {
-        return "MOE";
-
-        GGML_UNUSED(t);
-    }
-
-    std::string vars() override {
-        return VARS_TO_STR5(n_expert, n_expert_used, n_tokens, n_embd, n_ff);
-    }
-
-    test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336)
-        : n_expert(n_experts), n_expert_used(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) {
-    }
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_type wtype = GGML_TYPE_F32;
-        ggml_tensor * ffn_gate_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_expert);
-        ggml_tensor * ffn_gate_exps = ggml_new_tensor_3d(ctx, wtype, n_embd, n_ff, n_expert);
-        ggml_tensor * ffn_down_exps = ggml_new_tensor_3d(ctx, wtype, n_ff, n_embd, n_expert);
-        ggml_tensor * ffn_up_exps = ggml_new_tensor_3d(ctx, wtype, n_embd, n_ff, n_expert);
-
-        ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
-
-        ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur); // [n_expert, n_tokens]
-        //ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
-        ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, nullptr, 1.0f/sqrtf(n_embd), 0.0f);
-
-        // select experts
-        ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
-
-        ggml_tensor * weights = ggml_get_rows(ctx,
-            ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
-        weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
-
-        ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
-        weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
-
-        cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
-
-        ggml_tensor * up = ggml_mul_mat_id(ctx, ffn_up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-
-        ggml_tensor * gate = ggml_mul_mat_id(ctx, ffn_gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-        gate = ggml_silu(ctx, gate);
-
-        ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
-
-        ggml_tensor * experts = ggml_mul_mat_id(ctx, ffn_down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
-
-        experts = ggml_mul(ctx, experts,
-            ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens));
-
-        // aggregate experts
-        ggml_tensor * moe_out = nullptr;
-        for (int i = 0; i < n_expert_used; ++i) {
-            ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
-                experts->nb[2], i*experts->nb[1]);
-
-            cur_expert = ggml_cont(ctx, cur_expert);
-
-            if (i == 0) {
-                moe_out = cur_expert;
-            } else {
-                moe_out = ggml_add(ctx, moe_out, cur_expert);
-            }
-        }
-
-        cur = moe_out;
-
-        return cur;
-    }
-};
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
std::vector<std::unique_ptr<test_case>> test_cases;
std::default_random_engine rng(0);
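The removed graph implements the usual Mixtral-style routing: softmax over the expert logits, top-k selection, then renormalizing the selected weights so they sum to 1 per token. Below is a plain C++ sketch of that routing math for a single token, ignoring the 1/sqrt(n_embd) scale applied inside ggml_soft_max_ext above; the logit values are invented for illustration.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const int n_expert = 8, n_expert_used = 2;
    std::vector<float> logits = {0.1f, 2.0f, -1.0f, 0.5f, 1.5f, -0.3f, 0.0f, 0.7f};

    // softmax over the expert logits (ggml_soft_max_ext in the graph above)
    float mx = *std::max_element(logits.begin(), logits.end());
    std::vector<float> probs(n_expert);
    float sum = 0.0f;
    for (int i = 0; i < n_expert; ++i) { probs[i] = std::exp(logits[i] - mx); sum += probs[i]; }
    for (float & p : probs) { p /= sum; }

    // top-k expert selection (ggml_top_k)
    std::vector<int> idx(n_expert);
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });

    // renormalize the selected weights to sum to 1 (the ggml_div by weights_sum)
    float wsum = 0.0f;
    for (int k = 0; k < n_expert_used; ++k) { wsum += probs[idx[k]]; }
    for (int k = 0; k < n_expert_used; ++k) {
        std::printf("expert %d: weight %.4f\n", idx[k], probs[idx[k]] / wsum);
    }
    return 0;
}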
@@ -2031,10 +1946,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
}
};
-// mul: src0: 4096 2 32 1
-// mul: src1: 1 2 32 1
-add_test_bin_bcast(GGML_TYPE_F32, {1, 2, 32, 1}, {4096, 1, 1, 1});
-add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
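The removed cases exercised ggml's binary-op broadcasting, where each dimension of src1 must evenly divide the corresponding dimension of src0. If add_test_bin_bcast builds src0 as ne*nr and src1 as ne, which is what the src0/src1 comments above suggest, the first removed case corresponds to this sketch:

#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = { /*mem_size=*/ 64*1024*1024, /*mem_buffer=*/ nullptr, /*no_alloc=*/ false };
    struct ggml_context * ctx = ggml_init(params);

    // src0: [4096, 2, 32, 1], src1: [1, 2, 32, 1] -> src1's ne0 of 1 is broadcast across 4096
    struct ggml_tensor * src0 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 4096, 2, 32, 1);
    struct ggml_tensor * src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, 2, 32, 1);

    struct ggml_tensor * out = ggml_mul(ctx, src0, src1); // result takes src0's shape
    std::printf("out: %d x %d x %d x %d\n", (int) out->ne[0], (int) out->ne[1], (int) out->ne[2], (int) out->ne[3]);

    ggml_free(ctx);
    return 0;
}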
@@ -2194,8 +2105,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_llama(2));
test_cases.emplace_back(new test_falcon(1));
test_cases.emplace_back(new test_falcon(2));
-test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
-test_cases.emplace_back(new test_moe(8, 2, 32, 4096, 8*1024));
#endif
// run tests