commit bf56fdecb3
Author: slaren
Date:   2024-04-17 18:37:28 +02:00
Parent: fb168ac5f7

5 changed files with 4 additions and 106 deletions

ggml.c (1 addition, 1 deletion)

@@ -4588,7 +4588,7 @@ void ggml_mul_mat_set_prec(
in b, n_experts_used can be broadcasted to match the n_expert_used of ids

-c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e in ids
+c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
*/

struct ggml_tensor * ggml_mul_mat_id(
        struct ggml_context * ctx,

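As a reading aid, not part of the commit: a minimal sketch of the shapes the documented formula implies, assuming the four-argument ggml_mul_mat_id API that the test code later in this diff calls; the sizes are invented for illustration.

#include "ggml.h"
#include <cstdio>

int main() {
    // illustrative sizes only, not taken from the commit
    const int n_embd = 8, n_ff = 16, n_expert = 4, n_expert_used = 2, n_tokens = 3;

    struct ggml_init_params params = { /*mem_size=*/ 16*1024*1024, /*mem_buffer=*/ nullptr, /*no_alloc=*/ false };
    struct ggml_context * ctx = ggml_init(params);

    // as: one n_embd x n_ff matrix per expert; ids: one expert index per (e, t)
    struct ggml_tensor * as  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_ff, n_expert);
    struct ggml_tensor * b   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1, n_tokens); // dim 1 broadcasts to n_expert_used
    struct ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, n_tokens);

    struct ggml_tensor * c = ggml_mul_mat_id(ctx, as, b, ids);
    std::printf("c: %d x %d x %d\n", (int) c->ne[0], (int) c->ne[1], (int) c->ne[2]); // 16 x 2 x 3

    ggml_free(ctx);
    return 0;
}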
ggml.h (1 deletion)

@@ -1161,7 +1161,6 @@ extern "C" {
enum ggml_prec prec);
// indirect matrix multiplication
-// TODO: document
GGML_API struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx,
struct ggml_tensor * as,

llama.cpp (2 additions)

@@ -7365,6 +7365,7 @@ struct llm_build_context {
                n_expert, n_expert_used,
                LLM_FFN_SILU, true,
                cb, il);
+        cb(cur, "ffn_moe_out", il);

        cur = ggml_add(ctx0, cur, ffn_inp);
        cb(cur, "ffn_out", il);
@@ -8694,6 +8695,7 @@ struct llm_build_context {
                n_expert, n_expert_used,
                LLM_FFN_SILU, false,
                cb, il);
+        cb(cur, "ffn_moe_out", il);

        // FFN shared expert
        {
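The two added cb() calls tag the MoE output tensor with a debug name per layer. Below is a rough, self-contained sketch of what such a naming callback does; the tensor struct and callback type are stand-ins for illustration, not llama.cpp's actual definitions.

#include <cstdio>
#include <functional>
#include <string>

// hypothetical stand-in for ggml_tensor, just to keep the sketch self-contained
struct tensor { std::string name; };

// callback shape: tensor, base name, layer index
using build_cb = std::function<void(tensor *, const char *, int)>;

int main() {
    build_cb cb = [](tensor * t, const char * name, int il) {
        t->name = std::string(name) + "-" + std::to_string(il); // e.g. "ffn_moe_out-7"
        std::printf("named tensor: %s\n", t->name.c_str());
    };

    tensor cur;
    cb(&cur, "ffn_moe_out", 7);
    return 0;
}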

scripts/compare-commits.sh (1 addition, 13 deletions)

@@ -12,19 +12,7 @@ bench_args="${@:3}"
rm -f llama-bench.sqlite
backend="cpu"
if [[ "$OSTYPE" == "darwin"* ]]; then
backend="metal"
elif command -v nvcc &> /dev/null; then
backend="cuda"
fi
make_opts=""
#if [[ "$backend" == "cuda" ]]; then
# make_opts="LLAMA_CUDA=1"
#fi
# to test a backend, call the script with the corresponding environment variable (e.g. LLAMA_CUDA=1 ./scripts/compare-commits.sh ...)
git checkout $1
make clean && make -j32 $make_opts llama-bench
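With the automatic detection gone, the backend is chosen entirely by the caller's environment. A hypothetical invocation comparing two refs with CUDA enabled (the refs and model path are placeholders; arguments from the third onward are forwarded to llama-bench via bench_args):

LLAMA_CUDA=1 ./scripts/compare-commits.sh master my-branch -m models/7B/ggml-model-q4_0.gguf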

tests/test-backend-ops.cpp (91 deletions)

@@ -1613,7 +1613,6 @@ public:
}
};
// Llama
struct test_llama : public test_llm {
static constexpr float freq_base = 10000.0f;
@@ -1860,90 +1859,6 @@ struct test_falcon : public test_llm {
    }
};

-// Mixtral MOE
-struct test_moe : public test_case {
-    const int n_expert;
-    const int n_expert_used;
-    const int n_tokens;
-    const int n_embd;
-    const int n_ff;
-
-    std::string op_desc(ggml_tensor * t) override {
-        return "MOE";
-
-        GGML_UNUSED(t);
-    }
-
-    std::string vars() override {
-        return VARS_TO_STR5(n_expert, n_expert_used, n_tokens, n_embd, n_ff);
-    }
-
-    test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336)
-        : n_expert(n_experts), n_expert_used(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) {
-    }
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_type wtype = GGML_TYPE_F32;
-        ggml_tensor * ffn_gate_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_expert);
-        ggml_tensor * ffn_gate_exps = ggml_new_tensor_3d(ctx, wtype, n_embd, n_ff, n_expert);
-        ggml_tensor * ffn_down_exps = ggml_new_tensor_3d(ctx, wtype, n_ff, n_embd, n_expert);
-        ggml_tensor * ffn_up_exps = ggml_new_tensor_3d(ctx, wtype, n_embd, n_ff, n_expert);
-
-        ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
-
-        ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur); // [n_expert, n_tokens]
-        //ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
-        ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, nullptr, 1.0f/sqrtf(n_embd), 0.0f);
-
-        // select experts
-        ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
-
-        ggml_tensor * weights = ggml_get_rows(ctx,
-            ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
-        weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
-
-        ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
-        weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
-
-        cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
-
-        ggml_tensor * up = ggml_mul_mat_id(ctx, ffn_up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-
-        ggml_tensor * gate = ggml_mul_mat_id(ctx, ffn_gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-        gate = ggml_silu(ctx, gate);
-
-        ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
-
-        ggml_tensor * experts = ggml_mul_mat_id(ctx, ffn_down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
-
-        experts = ggml_mul(ctx, experts,
-            ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens));
-
-        // aggregate experts
-        ggml_tensor * moe_out = nullptr;
-        for (int i = 0; i < n_expert_used; ++i) {
-            ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
-                experts->nb[2], i*experts->nb[1]);
-
-            cur_expert = ggml_cont(ctx, cur_expert);
-
-            if (i == 0) {
-                moe_out = cur_expert;
-            } else {
-                moe_out = ggml_add(ctx, moe_out, cur_expert);
-            }
-        }
-
-        cur = moe_out;
-
-        return cur;
-    }
-};
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
std::vector<std::unique_ptr<test_case>> test_cases;
std::default_random_engine rng(0);
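The removed graph implements the usual Mixtral-style routing: softmax over the expert logits, top-k selection, then renormalizing the selected weights so they sum to 1 per token. Below is a plain C++ sketch of that routing math for a single token, ignoring the 1/sqrt(n_embd) scale applied inside ggml_soft_max_ext above; the logit values are invented for illustration.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const int n_expert = 8, n_expert_used = 2;
    std::vector<float> logits = {0.1f, 2.0f, -1.0f, 0.5f, 1.5f, -0.3f, 0.0f, 0.7f};

    // softmax over the expert logits (ggml_soft_max_ext in the graph above)
    float mx = *std::max_element(logits.begin(), logits.end());
    std::vector<float> probs(n_expert);
    float sum = 0.0f;
    for (int i = 0; i < n_expert; ++i) { probs[i] = std::exp(logits[i] - mx); sum += probs[i]; }
    for (float & p : probs) { p /= sum; }

    // top-k expert selection (ggml_top_k)
    std::vector<int> idx(n_expert);
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });

    // renormalize the selected weights to sum to 1 (the ggml_div by weights_sum)
    float wsum = 0.0f;
    for (int k = 0; k < n_expert_used; ++k) { wsum += probs[idx[k]]; }
    for (int k = 0; k < n_expert_used; ++k) {
        std::printf("expert %d: weight %.4f\n", idx[k], probs[idx[k]] / wsum);
    }
    return 0;
}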
@@ -2031,10 +1946,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
}
};
-// mul: src0: 4096 2 32 1
-// mul: src1: 1 2 32 1
-add_test_bin_bcast(GGML_TYPE_F32, {1, 2, 32, 1}, {4096, 1, 1, 1});
-add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
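The removed cases exercised ggml's binary-op broadcasting, where each dimension of src1 must evenly divide the corresponding dimension of src0. If add_test_bin_bcast builds src0 as ne*nr and src1 as ne, which is what the src0/src1 comments above suggest, the first removed case corresponds to this sketch:

#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = { /*mem_size=*/ 64*1024*1024, /*mem_buffer=*/ nullptr, /*no_alloc=*/ false };
    struct ggml_context * ctx = ggml_init(params);

    // src0: [4096, 2, 32, 1], src1: [1, 2, 32, 1] -> src1's ne0 of 1 is broadcast across 4096
    struct ggml_tensor * src0 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 4096, 2, 32, 1);
    struct ggml_tensor * src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, 2, 32, 1);

    struct ggml_tensor * out = ggml_mul(ctx, src0, src1); // result takes src0's shape
    std::printf("out: %d x %d x %d x %d\n", (int) out->ne[0], (int) out->ne[1], (int) out->ne[2], (int) out->ne[3]);

    ggml_free(ctx);
    return 0;
}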
@@ -2194,8 +2105,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_llama(2));
test_cases.emplace_back(new test_falcon(1));
test_cases.emplace_back(new test_falcon(2));
-test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
-test_cases.emplace_back(new test_moe(8, 2, 32, 4096, 8*1024));
#endif
// run tests