llama : fix llm_build_k_shift to use correct n_rot (#4889)
* llama : fix llm_build_k_shift to use correct n_rot ggml-ci * llama : always use hparams.n_rot for ggml_rope_custom ggml-ci * convert : fix persimmon conversion to write correct n_rot
This commit is contained in:
parent
326b418b59
commit
f445c0e68c
4 changed files with 51 additions and 33 deletions
65
llama.cpp
65
llama.cpp
|
@ -4104,7 +4104,6 @@ static void llm_build_k_shift(
|
|||
struct ggml_cgraph * graph,
|
||||
llm_rope_type type,
|
||||
int64_t n_ctx,
|
||||
int n_rot,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
const llm_build_cb & cb) {
|
||||
|
@ -4112,14 +4111,13 @@ static void llm_build_k_shift(
|
|||
const int64_t n_head_kv = hparams.n_head_kv;
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const int32_t n_rot = hparams.n_rot;
|
||||
const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
|
||||
const float ext_factor = cparams.yarn_ext_factor;
|
||||
const float attn_factor = cparams.yarn_attn_factor;
|
||||
const float beta_fast = cparams.yarn_beta_fast;
|
||||
const float beta_slow = cparams.yarn_beta_slow;
|
||||
|
||||
GGML_ASSERT(n_embd_head_k % n_rot == 0);
|
||||
|
||||
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
|
||||
cb(K_shift, "K_shift", -1);
|
||||
|
||||
|
@ -4523,7 +4521,7 @@ struct llm_build_context {
|
|||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
@ -4561,14 +4559,14 @@ struct llm_build_context {
|
|||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
@ -4691,6 +4689,7 @@ struct llm_build_context {
|
|||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
@ -4708,7 +4707,7 @@ struct llm_build_context {
|
|||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
@ -4734,12 +4733,12 @@ struct llm_build_context {
|
|||
case MODEL_7B:
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
break;
|
||||
|
@ -4812,6 +4811,7 @@ struct llm_build_context {
|
|||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
@ -4829,7 +4829,7 @@ struct llm_build_context {
|
|||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
@ -4870,13 +4870,13 @@ struct llm_build_context {
|
|||
|
||||
// using mode = 2 for neox mode
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
|
||||
ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
|
||||
ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
@ -5033,9 +5033,8 @@ struct llm_build_context {
|
|||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
||||
const int64_t n_rot = n_embd_head_k / 2;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
@ -5052,7 +5051,7 @@ struct llm_build_context {
|
|||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
@ -5112,7 +5111,7 @@ struct llm_build_context {
|
|||
|
||||
// RoPE the first n_rot of q/k, pass the other half, and concat.
|
||||
struct ggml_tensor * qrot = ggml_view_3d(
|
||||
ctx0, tmpq, n_rot, n_head, n_tokens,
|
||||
ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
|
||||
ggml_element_size(tmpq) * n_embd_head,
|
||||
ggml_element_size(tmpq) * n_embd_head * n_head,
|
||||
0
|
||||
|
@ -5120,7 +5119,7 @@ struct llm_build_context {
|
|||
cb(qrot, "qrot", il);
|
||||
|
||||
struct ggml_tensor * krot = ggml_view_3d(
|
||||
ctx0, tmpk, n_rot, n_head, n_tokens,
|
||||
ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
|
||||
ggml_element_size(tmpk) * n_embd_head,
|
||||
ggml_element_size(tmpk) * n_embd_head * n_head,
|
||||
0
|
||||
|
@ -5129,29 +5128,29 @@ struct llm_build_context {
|
|||
|
||||
// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
|
||||
struct ggml_tensor * qpass = ggml_view_3d(
|
||||
ctx0, tmpq, n_rot, n_head, n_tokens,
|
||||
ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
|
||||
ggml_element_size(tmpq) * n_embd_head,
|
||||
ggml_element_size(tmpq) * n_embd_head * n_head,
|
||||
ggml_element_size(tmpq) * n_rot
|
||||
ggml_element_size(tmpq) * hparams.n_rot
|
||||
);
|
||||
cb(qpass, "qpass", il);
|
||||
|
||||
struct ggml_tensor * kpass = ggml_view_3d(
|
||||
ctx0, tmpk, n_rot, n_head, n_tokens,
|
||||
ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
|
||||
ggml_element_size(tmpk) * n_embd_head,
|
||||
ggml_element_size(tmpk) * n_embd_head * n_head,
|
||||
ggml_element_size(tmpk) * n_rot
|
||||
ggml_element_size(tmpk) * hparams.n_rot
|
||||
);
|
||||
cb(kpass, "kpass", il);
|
||||
|
||||
struct ggml_tensor * qrotated = ggml_rope_custom(
|
||||
ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx,
|
||||
ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(qrotated, "qrotated", il);
|
||||
|
||||
struct ggml_tensor * krotated = ggml_rope_custom(
|
||||
ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx,
|
||||
ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(krotated, "krotated", il);
|
||||
|
@ -5531,6 +5530,7 @@ struct llm_build_context {
|
|||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
@ -5548,7 +5548,7 @@ struct llm_build_context {
|
|||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
@ -5661,7 +5661,7 @@ struct llm_build_context {
|
|||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
@ -5693,13 +5693,13 @@ struct llm_build_context {
|
|||
|
||||
// using mode = 2 for neox mode
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
|
||||
ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
|
||||
ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
@ -5778,7 +5778,7 @@ struct llm_build_context {
|
|||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
@ -5874,6 +5874,7 @@ struct llm_build_context {
|
|||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
@ -5891,7 +5892,7 @@ struct llm_build_context {
|
|||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
@ -5917,13 +5918,13 @@ struct llm_build_context {
|
|||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos,
|
||||
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos,
|
||||
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue