diff --git a/ggml.c b/ggml.c
index f52cf6173..821ce25ed 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4588,7 +4588,7 @@ void ggml_mul_mat_set_prec(
 
     in b, n_experts_used can be broadcasted to match the n_expert_used of ids
 
-    c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e in ids
+    c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
 */
 struct ggml_tensor * ggml_mul_mat_id(
         struct ggml_context * ctx,
diff --git a/ggml.h b/ggml.h
index fcd69d84b..4d1d77fe9 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1161,7 +1161,6 @@ extern "C" {
             enum ggml_prec       prec);
 
     // indirect matrix multiplication
-    // TODO: document
     GGML_API struct ggml_tensor * ggml_mul_mat_id(
             struct ggml_context * ctx,
             struct ggml_tensor  * as,
diff --git a/llama.cpp b/llama.cpp
index 8c7b7d3f6..ed6f44cb3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7365,6 +7365,7 @@ struct llm_build_context {
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
                         cb, il);
+            cb(cur, "ffn_moe_out", il);
 
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
@@ -8694,6 +8695,7 @@
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, false,
                         cb, il);
+            cb(cur, "ffn_moe_out", il);
 
             // FFN shared expert
             {
diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh
index 6d6699f40..fd0ee88b2 100755
--- a/scripts/compare-commits.sh
+++ b/scripts/compare-commits.sh
@@ -12,19 +12,7 @@ bench_args="${@:3}"
 
 rm -f llama-bench.sqlite
 
-backend="cpu"
-
-if [[ "$OSTYPE" == "darwin"* ]]; then
-    backend="metal"
-elif command -v nvcc &> /dev/null; then
-    backend="cuda"
-fi
-
-make_opts=""
-
-#if [[ "$backend" == "cuda" ]]; then
-#    make_opts="LLAMA_CUDA=1"
-#fi
+# to test a backend, call the script with the corresponding environment variable (e.g. LLAMA_CUDA=1 ./scripts/compare-commits.sh ...)
 
 git checkout $1
 make clean && make -j32 $make_opts llama-bench
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 09c287a0a..f89044032 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1613,7 +1613,6 @@ public:
     }
 };
 
-
 // Llama
 struct test_llama : public test_llm {
     static constexpr float freq_base = 10000.0f;
@@ -1860,90 +1859,6 @@ struct test_falcon : public test_llm {
     }
 };
 
-
-// Mixtral MOE
-struct test_moe : public test_case {
-    const int n_expert;
-    const int n_expert_used;
-    const int n_tokens;
-    const int n_embd;
-    const int n_ff;
-
-    std::string op_desc(ggml_tensor * t) override {
-        return "MOE";
-
-        GGML_UNUSED(t);
-    }
-
-    std::string vars() override {
-        return VARS_TO_STR5(n_expert, n_expert_used, n_tokens, n_embd, n_ff);
-    }
-
-    test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336)
-        : n_expert(n_experts), n_expert_used(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) {
-    }
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_type wtype = GGML_TYPE_F32;
-        ggml_tensor * ffn_gate_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_expert);
-
-        ggml_tensor * ffn_gate_exps = ggml_new_tensor_3d(ctx, wtype, n_embd, n_ff, n_expert);
-        ggml_tensor * ffn_down_exps = ggml_new_tensor_3d(ctx, wtype, n_ff, n_embd, n_expert);
-        ggml_tensor * ffn_up_exps = ggml_new_tensor_3d(ctx, wtype, n_embd, n_ff, n_expert);
-
-        ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
-
-        ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur); // [n_expert, n_tokens]
-
-        //ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
-        ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, nullptr, 1.0f/sqrtf(n_embd), 0.0f);
-
-        // select experts
-        ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
-
-        ggml_tensor * weights = ggml_get_rows(ctx,
-                ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
-
-        weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
-
-        ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
-
-        weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
-
-        cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
-        ggml_tensor * up = ggml_mul_mat_id(ctx, ffn_up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-
-        ggml_tensor * gate = ggml_mul_mat_id(ctx, ffn_gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-
-        gate = ggml_silu(ctx, gate);
-
-        ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
-
-        ggml_tensor * experts = ggml_mul_mat_id(ctx, ffn_down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
-
-        experts = ggml_mul(ctx, experts,
-                ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens));
-
-        // aggregate experts
-        ggml_tensor * moe_out = nullptr;
-        for (int i = 0; i < n_expert_used; ++i) {
-            ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
-                    experts->nb[2], i*experts->nb[1]);
-            cur_expert = ggml_cont(ctx, cur_expert);
-            if (i == 0) {
-                moe_out = cur_expert;
-            } else {
-                moe_out = ggml_add(ctx, moe_out, cur_expert);
-            }
-        }
-
-        cur = moe_out;
-
-        return cur;
-    }
-};
-
-
 static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
     std::vector<std::unique_ptr<test_case>> test_cases;
     std::default_random_engine rng(0);
@@ -2031,10 +1946,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         }
     };
 
-    // mul: src0: 4096 2 32 1
-    // mul: src1:    1 2 32 1
-    add_test_bin_bcast(GGML_TYPE_F32, {1, 2, 32, 1}, {4096, 1, 1, 1});
-
     add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
     add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
     add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
@@ -2194,8 +2105,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_llama(2));
     test_cases.emplace_back(new test_falcon(1));
     test_cases.emplace_back(new test_falcon(2));
-    test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
-    test_cases.emplace_back(new test_moe(8, 2, 32, 4096, 8*1024));
 #endif
 
     // run tests
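
For reference, the indexing convention in the updated ggml.c comment can be spelled out as a plain C loop. This is a minimal sketch, not ggml code: it assumes contiguous f32 buffers laid out in ggml's ne-order, and mul_mat_id_ref, neu_b, cols and rows are illustrative names only. The broadcast index is written here as e % neu_b; with neu_b == 1 (the common broadcast case) this selects the same single column as the i%r form in the comment.

// hypothetical reference implementation of the documented semantics:
//   c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
#include <stdint.h>
#include <stddef.h>

// as  -> [cols, rows, n_expert]          (stacked expert weight matrices)
// ids -> [n_expert_used, n_tokens]       (i32 expert index per (e, t))
// b   -> [cols, neu_b, n_tokens]         (neu_b may be 1 to broadcast)
// c   -> [rows, n_expert_used, n_tokens]
static void mul_mat_id_ref(
        const float * as, const float * b, const int32_t * ids, float * c,
        int cols, int rows, int n_expert_used, int neu_b, int n_tokens) {
    for (int t = 0; t < n_tokens; ++t) {
        for (int e = 0; e < n_expert_used; ++e) {
            const int32_t i = ids[t*n_expert_used + e];  // i = ids[e,t]
            const float * w = as + (size_t) i*rows*cols; // as[:,:,i]
            const float * x = b  + ((size_t) t*neu_b + e % neu_b)*cols;
            float       * y = c  + ((size_t) t*n_expert_used + e)*rows;
            for (int r = 0; r < rows; ++r) {             // y = as[:,:,i] @ x
                float sum = 0.0f;
                for (int k = 0; k < cols; ++k) {
                    sum += w[(size_t) r*cols + k]*x[k];
                }
                y[r] = sum;
            }
        }
    }
}

With neu_b == n_expert_used each selected expert gets its own column of b; with neu_b == 1 the single column is shared across all selected experts, which is the broadcast the comment refers to.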