llama : always use hparams.n_rot for ggml_rope_custom

ggml-ci
This commit is contained in:
Georgi Gerganov 2024-01-12 11:21:45 +02:00
parent ff0899c9b3
commit 0cb764e4ab
No known key found for this signature in database
GPG key ID: 449E073F9DC10735

View file

@ -4559,14 +4559,14 @@ struct llm_build_context {
Qcur = ggml_rope_custom( Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
@ -4689,6 +4689,7 @@ struct llm_build_context {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
@ -4732,12 +4733,12 @@ struct llm_build_context {
case MODEL_7B: case MODEL_7B:
Qcur = ggml_rope_custom( Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
Kcur = ggml_rope_custom( Kcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
break; break;
@ -4810,6 +4811,7 @@ struct llm_build_context {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
@ -4868,13 +4870,13 @@ struct llm_build_context {
// using mode = 2 for neox mode // using mode = 2 for neox mode
Qcur = ggml_rope_custom( Qcur = ggml_rope_custom(
ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_custom(
ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
@ -5031,9 +5033,8 @@ struct llm_build_context {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
const int64_t n_rot = n_embd_head_k / 2;
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
@ -5110,7 +5111,7 @@ struct llm_build_context {
// RoPE the first n_rot of q/k, pass the other half, and concat. // RoPE the first n_rot of q/k, pass the other half, and concat.
struct ggml_tensor * qrot = ggml_view_3d( struct ggml_tensor * qrot = ggml_view_3d(
ctx0, tmpq, n_rot, n_head, n_tokens, ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
ggml_element_size(tmpq) * n_embd_head, ggml_element_size(tmpq) * n_embd_head,
ggml_element_size(tmpq) * n_embd_head * n_head, ggml_element_size(tmpq) * n_embd_head * n_head,
0 0
@ -5118,7 +5119,7 @@ struct llm_build_context {
cb(qrot, "qrot", il); cb(qrot, "qrot", il);
struct ggml_tensor * krot = ggml_view_3d( struct ggml_tensor * krot = ggml_view_3d(
ctx0, tmpk, n_rot, n_head, n_tokens, ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head,
ggml_element_size(tmpk) * n_embd_head * n_head, ggml_element_size(tmpk) * n_embd_head * n_head,
0 0
@ -5127,29 +5128,29 @@ struct llm_build_context {
// get the second half of tmpq, e.g tmpq[n_rot:, :, :] // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
struct ggml_tensor * qpass = ggml_view_3d( struct ggml_tensor * qpass = ggml_view_3d(
ctx0, tmpq, n_rot, n_head, n_tokens, ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
ggml_element_size(tmpq) * n_embd_head, ggml_element_size(tmpq) * n_embd_head,
ggml_element_size(tmpq) * n_embd_head * n_head, ggml_element_size(tmpq) * n_embd_head * n_head,
ggml_element_size(tmpq) * n_rot ggml_element_size(tmpq) * hparams.n_rot
); );
cb(qpass, "qpass", il); cb(qpass, "qpass", il);
struct ggml_tensor * kpass = ggml_view_3d( struct ggml_tensor * kpass = ggml_view_3d(
ctx0, tmpk, n_rot, n_head, n_tokens, ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head,
ggml_element_size(tmpk) * n_embd_head * n_head, ggml_element_size(tmpk) * n_embd_head * n_head,
ggml_element_size(tmpk) * n_rot ggml_element_size(tmpk) * hparams.n_rot
); );
cb(kpass, "kpass", il); cb(kpass, "kpass", il);
struct ggml_tensor * qrotated = ggml_rope_custom( struct ggml_tensor * qrotated = ggml_rope_custom(
ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx, ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(qrotated, "qrotated", il); cb(qrotated, "qrotated", il);
struct ggml_tensor * krotated = ggml_rope_custom( struct ggml_tensor * krotated = ggml_rope_custom(
ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx, ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(krotated, "krotated", il); cb(krotated, "krotated", il);
@ -5529,6 +5530,7 @@ struct llm_build_context {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
@ -5691,13 +5693,13 @@ struct llm_build_context {
// using mode = 2 for neox mode // using mode = 2 for neox mode
Qcur = ggml_rope_custom( Qcur = ggml_rope_custom(
ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_custom(
ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
@ -5872,6 +5874,7 @@ struct llm_build_context {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
@ -5915,13 +5918,13 @@ struct llm_build_context {
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
Qcur = ggml_rope_custom( Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos,
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos,
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);