llama : always use hparams.n_rot for ggml_rope_custom
ggml-ci
This commit is contained in:
parent
ff0899c9b3
commit
0cb764e4ab
1 changed file with 24 additions and 21 deletions
45
llama.cpp
45
llama.cpp
|
@ -4559,14 +4559,14 @@ struct llm_build_context {
|
||||||
|
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||||
n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||||
n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
@ -4689,6 +4689,7 @@ struct llm_build_context {
|
||||||
|
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||||
|
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
@ -4732,12 +4733,12 @@ struct llm_build_context {
|
||||||
case MODEL_7B:
|
case MODEL_7B:
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||||
n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||||
n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
break;
|
break;
|
||||||
|
@ -4810,6 +4811,7 @@ struct llm_build_context {
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||||
|
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
@ -4868,13 +4870,13 @@ struct llm_build_context {
|
||||||
|
|
||||||
// using mode = 2 for neox mode
|
// using mode = 2 for neox mode
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
|
ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
|
ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
@ -5031,9 +5033,8 @@ struct llm_build_context {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
|
GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
|
||||||
const int64_t n_rot = n_embd_head_k / 2;
|
|
||||||
|
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
@ -5110,7 +5111,7 @@ struct llm_build_context {
|
||||||
|
|
||||||
// RoPE the first n_rot of q/k, pass the other half, and concat.
|
// RoPE the first n_rot of q/k, pass the other half, and concat.
|
||||||
struct ggml_tensor * qrot = ggml_view_3d(
|
struct ggml_tensor * qrot = ggml_view_3d(
|
||||||
ctx0, tmpq, n_rot, n_head, n_tokens,
|
ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
|
||||||
ggml_element_size(tmpq) * n_embd_head,
|
ggml_element_size(tmpq) * n_embd_head,
|
||||||
ggml_element_size(tmpq) * n_embd_head * n_head,
|
ggml_element_size(tmpq) * n_embd_head * n_head,
|
||||||
0
|
0
|
||||||
|
@ -5118,7 +5119,7 @@ struct llm_build_context {
|
||||||
cb(qrot, "qrot", il);
|
cb(qrot, "qrot", il);
|
||||||
|
|
||||||
struct ggml_tensor * krot = ggml_view_3d(
|
struct ggml_tensor * krot = ggml_view_3d(
|
||||||
ctx0, tmpk, n_rot, n_head, n_tokens,
|
ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
|
||||||
ggml_element_size(tmpk) * n_embd_head,
|
ggml_element_size(tmpk) * n_embd_head,
|
||||||
ggml_element_size(tmpk) * n_embd_head * n_head,
|
ggml_element_size(tmpk) * n_embd_head * n_head,
|
||||||
0
|
0
|
||||||
|
@ -5127,29 +5128,29 @@ struct llm_build_context {
|
||||||
|
|
||||||
// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
|
// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
|
||||||
struct ggml_tensor * qpass = ggml_view_3d(
|
struct ggml_tensor * qpass = ggml_view_3d(
|
||||||
ctx0, tmpq, n_rot, n_head, n_tokens,
|
ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
|
||||||
ggml_element_size(tmpq) * n_embd_head,
|
ggml_element_size(tmpq) * n_embd_head,
|
||||||
ggml_element_size(tmpq) * n_embd_head * n_head,
|
ggml_element_size(tmpq) * n_embd_head * n_head,
|
||||||
ggml_element_size(tmpq) * n_rot
|
ggml_element_size(tmpq) * hparams.n_rot
|
||||||
);
|
);
|
||||||
cb(qpass, "qpass", il);
|
cb(qpass, "qpass", il);
|
||||||
|
|
||||||
struct ggml_tensor * kpass = ggml_view_3d(
|
struct ggml_tensor * kpass = ggml_view_3d(
|
||||||
ctx0, tmpk, n_rot, n_head, n_tokens,
|
ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
|
||||||
ggml_element_size(tmpk) * n_embd_head,
|
ggml_element_size(tmpk) * n_embd_head,
|
||||||
ggml_element_size(tmpk) * n_embd_head * n_head,
|
ggml_element_size(tmpk) * n_embd_head * n_head,
|
||||||
ggml_element_size(tmpk) * n_rot
|
ggml_element_size(tmpk) * hparams.n_rot
|
||||||
);
|
);
|
||||||
cb(kpass, "kpass", il);
|
cb(kpass, "kpass", il);
|
||||||
|
|
||||||
struct ggml_tensor * qrotated = ggml_rope_custom(
|
struct ggml_tensor * qrotated = ggml_rope_custom(
|
||||||
ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx,
|
ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(qrotated, "qrotated", il);
|
cb(qrotated, "qrotated", il);
|
||||||
|
|
||||||
struct ggml_tensor * krotated = ggml_rope_custom(
|
struct ggml_tensor * krotated = ggml_rope_custom(
|
||||||
ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx,
|
ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(krotated, "krotated", il);
|
cb(krotated, "krotated", il);
|
||||||
|
@ -5529,6 +5530,7 @@ struct llm_build_context {
|
||||||
|
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||||
|
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
@ -5691,13 +5693,13 @@ struct llm_build_context {
|
||||||
|
|
||||||
// using mode = 2 for neox mode
|
// using mode = 2 for neox mode
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
|
ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
|
ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
@ -5872,6 +5874,7 @@ struct llm_build_context {
|
||||||
|
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||||
|
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
@ -5915,13 +5918,13 @@ struct llm_build_context {
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos,
|
||||||
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos,
|
||||||
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue