Merge branch 'master' into gg/flash-attn
This commit is contained in:
commit
89961dea87
71 changed files with 30093 additions and 6878 deletions
|
@ -989,17 +989,13 @@ struct test_mul_mat_id : public test_case {
|
|||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
// C^T = A * B^T: (k, m) * (k, n) => (m, n)
|
||||
std::vector<ggml_tensor *> mats;
|
||||
for (int i = 0; i < n_mats; i++) {
|
||||
ggml_tensor * a = ggml_new_tensor_2d(ctx, type_a, k, m);
|
||||
mats.push_back(a);
|
||||
}
|
||||
ggml_tensor * mats = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
|
||||
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
|
||||
if (v) {
|
||||
ids = ggml_view_2d(ctx, ids, n_mats/2, ids->ne[1], ids->nb[1], 0);
|
||||
}
|
||||
ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n);
|
||||
ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, v ? id/2 : id, b);
|
||||
ggml_tensor * out = ggml_mul_mat_id(ctx, mats, ids, v ? id/2 : id, b);
|
||||
return out;
|
||||
}
|
||||
|
||||
|
@ -1515,7 +1511,8 @@ struct test_flash_attn_ext : public test_case {
|
|||
}
|
||||
};
|
||||
|
||||
// Attention
|
||||
// ATTN
|
||||
// TODO: this is temporary until the FA branch is merged
|
||||
struct test_attn : public test_case {
|
||||
const int64_t hs; // head size
|
||||
const int64_t nh; // num heads
|
||||
|
@ -1557,91 +1554,6 @@ struct test_attn : public test_case {
|
|||
}
|
||||
};
|
||||
|
||||
// Mixtral MOE
|
||||
struct test_moe : public test_case {
|
||||
const int n_experts;
|
||||
const int n_experts_per_tok;
|
||||
const int n_tokens;
|
||||
const int n_embd;
|
||||
const int n_ff;
|
||||
|
||||
std::string op_desc(ggml_tensor * t) override {
|
||||
return "MOE";
|
||||
|
||||
GGML_UNUSED(t);
|
||||
}
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR5(n_experts, n_experts_per_tok, n_tokens, n_embd, n_ff);
|
||||
}
|
||||
|
||||
test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336)
|
||||
: n_experts(n_experts), n_experts_per_tok(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) {
|
||||
}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * ffn_gate_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_experts);
|
||||
|
||||
std::vector<ggml_tensor *> ffn_up_exp(n_experts);
|
||||
std::vector<ggml_tensor *> ffn_gate_exp(n_experts);
|
||||
std::vector<ggml_tensor *> ffn_down_exp(n_experts);
|
||||
|
||||
for (int i = 0; i < n_experts; ++i) {
|
||||
ffn_up_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
|
||||
ffn_gate_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
|
||||
ffn_down_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
|
||||
}
|
||||
|
||||
ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
|
||||
|
||||
ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur);
|
||||
ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, nullptr, 1.0f/sqrtf(n_embd), 0.0f);
|
||||
|
||||
// select experts
|
||||
ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok);
|
||||
|
||||
ggml_tensor * weights = ggml_get_rows(ctx,
|
||||
ggml_reshape_3d(ctx, probs, 1, n_experts, n_tokens), selected_experts);
|
||||
|
||||
weights = ggml_reshape_2d(ctx, weights, n_experts_per_tok, n_tokens);
|
||||
|
||||
ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights);
|
||||
|
||||
weights = ggml_div(ctx, weights, weights_sum);
|
||||
|
||||
// compute expert outputs
|
||||
ggml_tensor * moe_out = nullptr;
|
||||
|
||||
for (int i = 0; i < n_experts_per_tok; ++i) {
|
||||
ggml_tensor * cur_expert;
|
||||
|
||||
ggml_tensor * cur_up = ggml_mul_mat_id(ctx, ffn_up_exp.data(), n_experts, selected_experts, i, cur);
|
||||
|
||||
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx, ffn_gate_exp.data(), n_experts, selected_experts, i, cur);
|
||||
|
||||
cur_gate = ggml_silu(ctx, cur_gate);
|
||||
|
||||
cur_expert = ggml_mul(ctx, cur_up, cur_gate);
|
||||
|
||||
cur_expert = ggml_mul_mat_id(ctx, ffn_down_exp.data(), n_experts, selected_experts, i, cur_expert);
|
||||
|
||||
cur_expert = ggml_mul(ctx, cur_expert,
|
||||
ggml_view_2d(ctx, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
|
||||
|
||||
if (i == 0) {
|
||||
moe_out = cur_expert;
|
||||
} else {
|
||||
moe_out = ggml_add(ctx, moe_out, cur_expert);
|
||||
}
|
||||
}
|
||||
|
||||
cur = moe_out;
|
||||
|
||||
return cur;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
enum llm_norm_type {
|
||||
LLM_NORM,
|
||||
LLM_NORM_RMS,
|
||||
|
@ -2249,6 +2161,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_sum_rows());
|
||||
|
@ -2286,11 +2199,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
|
||||
// these tests are disabled to save execution time, but they can be handy for debugging
|
||||
#if 0
|
||||
#if !defined(__SANITIZE_THREAD__)
|
||||
// FIXME: these tests use too much memory with thread sanitizer
|
||||
test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
|
||||
//test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
|
||||
#endif
|
||||
test_cases.emplace_back(new test_llama(1));
|
||||
test_cases.emplace_back(new test_llama(2));
|
||||
test_cases.emplace_back(new test_falcon(1));
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue