ggml : add ggml_soft_max_ext (#4256)
* metal : implement soft_max_ext
* cuda : implement soft_max_ext
* ggml : implement soft_max_ext (CPU)
* batched-bench : print threads

ggml-ci

* metal : simplify soft_max encoding

ggml-ci

* cuda : use 512 threads for soft_max instead of 32
* ggml : update soft max cpu
* cuda : do warp-based block reduce
* cuda : increase max block size to 1024
* cuda : fix warp reduction initialization of shared mem
* metal : warp-based reduction for soft max kernel
* metal : warp-based reduce for rms_norm
* metal : simplify soft max kernel

ggml-ci

* alloc : fix build with debug
parent 1d144112c0
commit ef47ec18da
8 changed files with 311 additions and 196 deletions
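Several bullets in the commit message above concern the reduction strategy inside the soft_max kernels: warp-based block reduce, block sizes up to 1024, and a fix to the shared-memory initialization of the warp reduction. The sketch below shows the general CUDA pattern those bullets refer to; it is illustrative only, not the actual ggml-cuda code, and the function name and the choice of a max-reduction are assumptions.

    #include <math.h>

    // Sketch of a warp-based block max-reduction. Each warp reduces its 32
    // lanes with butterfly shuffles, lane 0 of every warp writes its partial
    // to shared memory, and the first warp reduces those partials. Filling
    // lanes past the warp count with -INFINITY (instead of reading stale
    // shared memory) is the kind of initialization issue the "fix warp
    // reduction initialization of shared mem" bullet alludes to.
    __device__ float block_reduce_max(float val) {
        static __shared__ float shared[32];   // one partial per warp (<= 1024 threads)

        const int lane    = threadIdx.x % 32;
        const int warp_id = threadIdx.x / 32;

        // intra-warp reduction over the 32 lanes
        #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1) {
            val = fmaxf(val, __shfl_xor_sync(0xffffffff, val, offset));
        }
        if (lane == 0) {
            shared[warp_id] = val;            // one partial max per warp
        }
        __syncthreads();

        // first warp reduces the per-warp partials; unused lanes start at -inf
        val = (lane < (blockDim.x + 31)/32) ? shared[lane] : -INFINITY;
        if (warp_id == 0) {
            #pragma unroll
            for (int offset = 16; offset > 0; offset >>= 1) {
                val = fmaxf(val, __shfl_xor_sync(0xffffffff, val, offset));
            }
        }
        return val;                           // result valid in warp 0, lane 0
    }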
llama.cpp (35 changes)
@@ -3704,23 +3704,29 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
     cb(kq, "kq", il);

-    kq = ggml_scale(ctx, kq, kq_scale);
-    cb(kq, "kq_scaled", il);
-
     if (max_alibi_bias > 0.0f) {
-        // TODO: n_head or n_head_kv
-        // TODO: K-shift is likely not working
-        // TODO: change to ggml_add
-        kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
-        cb(kq, "kq_scaled_alibi", il);
-    }
+        // temporary branch until we figure out how to handle ggml_alibi through ggml_add
+        kq = ggml_scale(ctx, kq, kq_scale);
+        cb(kq, "kq_scaled", il);
+
+        if (max_alibi_bias > 0.0f) {
+            // TODO: n_head or n_head_kv
+            // TODO: K-shift is likely not working
+            // TODO: change to ggml_add
+            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
+            cb(kq, "kq_scaled_alibi", il);
+        }

-    kq = ggml_add(ctx, kq, kq_mask);
-    cb(kq, "kq_masked", il);
+        kq = ggml_add(ctx, kq, kq_mask);
+        cb(kq, "kq_masked", il);

-    kq = ggml_soft_max(ctx, kq);
-    cb(kq, "kq_soft_max", il);
+        kq = ggml_soft_max(ctx, kq);
+        cb(kq, "kq_soft_max", il);
+    } else {
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
+        cb(kq, "kq_soft_max_ext", il);
+    }

     // split cached v into n_head heads
     struct ggml_tensor * v =
         ggml_view_3d(ctx, kv.v,
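In the hunk above, the non-ALiBi path collapses the previous ggml_scale, ggml_add(kq_mask), ggml_soft_max chain into a single ggml_soft_max_ext(ctx, kq, kq_mask, scale) node, with the 1/sqrt(n_embd_head) attention scale folded into the call. Below is a minimal scalar sketch of the row-wise semantics implied by that call site; the helper name and flat-buffer layout are assumptions, not the actual ggml CPU kernel, which operates on ggml tensors.

    #include <math.h>

    // y = softmax(x*scale + mask) over one row of n values: the operation the
    // fused ggml_soft_max_ext node computes per row (illustrative sketch only)
    static void soft_max_ext_row(float * y, const float * x, const float * mask,
                                 float scale, int n) {
        float max_val = -INFINITY;
        for (int i = 0; i < n; i++) {
            y[i] = x[i]*scale + (mask ? mask[i] : 0.0f);  // fused scale + mask
            if (y[i] > max_val) {
                max_val = y[i];
            }
        }
        float sum = 0.0f;
        for (int i = 0; i < n; i++) {
            y[i] = expf(y[i] - max_val);                  // subtract row max for stability
            sum += y[i];
        }
        for (int i = 0; i < n; i++) {
            y[i] /= sum;                                  // normalize; row sums to 1
        }
    }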
@@ -5041,6 +5047,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     { "kq_scaled_alibi", OFFLOAD_FUNC_KQ },
     { "kq_masked",       OFFLOAD_FUNC_KQ },
     { "kq_soft_max",     OFFLOAD_FUNC_V  },
+    { "kq_soft_max_ext", OFFLOAD_FUNC_V  },
     { "v",               OFFLOAD_FUNC_V  },
     { "kqv",             OFFLOAD_FUNC_V  },
     { "kqv_merged",      OFFLOAD_FUNC_V  },
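The second hunk registers the new callback name so that graph offloading treats "kq_soft_max_ext" tensors the same way as "kq_soft_max". A simplified illustration of the lookup this enables follows; the std::string keys and the helper function are stand-ins, since the real k_offload_map keys on const char * and lives inside llama.cpp's offload logic.

    #include <string>
    #include <unordered_map>

    // Simplified stand-in for llama.cpp's name -> offload-function mapping.
    enum llm_offload_func_e { OFFLOAD_FUNC_NOP, OFFLOAD_FUNC_KQ, OFFLOAD_FUNC_V };

    static const std::unordered_map<std::string, llm_offload_func_e> offload_map = {
        { "kq_soft_max",     OFFLOAD_FUNC_V },
        { "kq_soft_max_ext", OFFLOAD_FUNC_V },  // new entry: without it, tensors
                                                // named by the new cb(...) call
                                                // would not be offloaded
    };

    static llm_offload_func_e offload_func_for(const std::string & name) {
        auto it = offload_map.find(name);
        return it == offload_map.end() ? OFFLOAD_FUNC_NOP : it->second;
    }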