Merge branch 'master' into gg/flash-attn
commit 31109ca00a
87 changed files with 5115 additions and 1531 deletions
@@ -1095,24 +1095,32 @@ struct test_diag_mask_inf : public test_case {
 struct test_soft_max : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
-    const float scale;
     const bool mask;
+    const float scale;
+    const float max_bias;
 
     std::string vars() override {
-        return VARS_TO_STR4(type, ne, scale, mask);
+        return VARS_TO_STR5(type, ne, mask, scale, max_bias);
     }
 
     test_soft_max(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {10, 10, 10, 10},
+            bool mask = false,
             float scale = 1.0f,
-            bool mask = false)
-        : type(type), ne(ne), scale(scale), mask(mask) {}
+            float max_bias = 0.0f)
+        : type(type), ne(ne), mask(mask), scale(scale), max_bias(max_bias) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * b = nullptr;
-        if (mask) { b = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, ne[0], ne[1]); }
-        ggml_tensor * out = ggml_soft_max_ext(ctx, a, b, scale);
+        ggml_tensor * mask = nullptr;
+        if (this->mask) {
+            mask = ggml_new_tensor_2d(ctx, type, ne[0], ne[1]);
+        }
+        ggml_tensor * pos = nullptr;
+        if (max_bias > 0.0f) {
+            pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne[0]);
+        }
+        ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, pos, scale, max_bias);
         return out;
     }
 };
@@ -1157,30 +1165,6 @@ struct test_rope : public test_case {
     }
 };
 
-// GGML_OP_ALIBI
-struct test_alibi : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    int n_past;
-    int n_head;
-    float bias_max;
-
-    std::string vars() override {
-        return VARS_TO_STR5(type, ne, n_past, n_head, bias_max);
-    }
-
-    test_alibi(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            int n_past = 512, int n_head = 10, float bias_max = 0.5f)
-        : type(type), ne(ne), n_past(n_past), n_head(n_head), bias_max(bias_max) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_alibi(ctx, a, n_past, n_head, bias_max);
-        return out;
-    }
-};
-
 // GGML_OP_POOL2D
 struct test_pool2d : public test_case {
     enum ggml_op_pool pool_type;
@@ -1568,7 +1552,7 @@ struct test_moe : public test_case {
         ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
 
         ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur);
-        ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(n_embd));
+        ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, nullptr, 1.0f/sqrtf(n_embd), 0.0f);
 
         // select experts
         ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok);
@@ -1697,7 +1681,6 @@ public:
         ggml_cpy(ctx, v_cur_t, v_cache_view);
     }
 
-    // if max_alibi_bias > 0 then apply ALiBi
     struct ggml_tensor * llm_build_kqv(
             struct ggml_context * ctx,
             struct ggml_tensor * k_l,
@@ -1716,7 +1699,7 @@ public:
 
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
 
-        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, nullptr, kq_scale, 0.0f);
 
         // split cached v into n_head heads
         struct ggml_tensor * v =
@@ -2014,7 +1997,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
         GGML_TYPE_Q6_K,
         GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
-        GGML_TYPE_IQ3_XXS,
+        GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S,
     };
 
     // unary ops
@@ -2163,6 +2146,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 1}, 5));
     test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5));
 
+#if 0
     std::uniform_int_distribution<> dist_ne1(1, 50);
     int exponent = 1;
     while (exponent < (1 << 17)) {
@@ -2171,14 +2155,29 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         for (int n = 0; n < 10; ++n) {
             int64_t ne0 = dist_ne0(rng);
             int64_t ne1 = dist_ne1(rng);
-            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}));
+            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
         }
 
         exponent <<= 1;
     }
+#endif
+    for (bool mask : {false, true}) {
+        for (float max_bias : {0.0f, 8.0f}) {
+            for (float scale : {1.0f, 0.1f}) {
+                for (int64_t ne0 : {16, 1024}) {
+                    for (int64_t ne1 : {16, 1024}) {
+                        test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, scale, max_bias));
+                        test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, scale, max_bias));
+                    }
+                }
+            }
+        }
+    }
 
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, 0.1f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, 0.1f, true));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 8.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
 
     for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
         test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512)); // llama 7B
@@ -2193,7 +2192,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512)); // neox (phi-2)
     }
 
-    test_cases.emplace_back(new test_alibi());
     test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
     test_cases.emplace_back(new test_concat(GGML_TYPE_I32));
 
@@ -2233,14 +2231,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     }
 #endif
 
-    // these tests are disabled to save execution time, but they can be handy for debugging
-#if 0
 #if !defined(__SANITIZE_THREAD__)
     // FIXME: these tests use too much memory with thread sanitizer
     test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
     //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
 #endif
 
+    // these tests are disabled to save execution time, but they can be handy for debugging
+#if 0
     test_cases.emplace_back(new test_llama(1));
     test_cases.emplace_back(new test_llama(2));
     test_cases.emplace_back(new test_falcon(1));
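
Context for the hunks above: ggml_soft_max_ext now takes a positions tensor and a max_bias argument so ALiBi can be applied inside the softmax, which is why the standalone GGML_OP_ALIBI test is removed. A minimal calling sketch based only on the signature visible in this diff (the kq_pos and max_alibi_bias names below are illustrative placeholders, not identifiers introduced by this change):

    // attention scores
    struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
    // no ALiBi: pass nullptr for the positions tensor and 0.0f for max_bias
    kq = ggml_soft_max_ext(ctx, kq, kq_mask, nullptr, kq_scale, 0.0f);
    // with ALiBi: pass a positions tensor and a positive max_bias (hypothetical names)
    //kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, max_alibi_bias);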