ggml : update ggml_rope_ext API to support freq. factors

This commit is contained in:
Georgi Gerganov 2024-05-16 13:23:04 +03:00
parent 2d473a4a9a
commit 471d8170bc
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
6 changed files with 225 additions and 176 deletions

View file

@ -563,8 +563,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
// not capturing these, to silcence warnings // not capturing these, to silcence warnings
const int rope_mode = 0; const int rope_mode = 0;
return ggml_rope_custom(ctx, return ggml_rope_ext(ctx,
t, KQ_pos, n_rot, rope_mode, n_ctx, 0, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
); );
}; };

View file

@ -301,8 +301,8 @@ static struct ggml_tensor * llama_build_train_graphs(
// not capturing these, to silcence warnings // not capturing these, to silcence warnings
const int rope_mode = 0; const int rope_mode = 0;
return ggml_rope_custom( return ggml_rope_ext(
ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
); );
}; };

View file

@ -288,15 +288,9 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const bool is_glm = mode & 4; const bool is_glm = mode & 4;
if (is_neox) { if (is_neox) {
// TODO: move these asserts to ggml.c
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(src1->ne[0] == ne2);
pos = (const int32_t *) src1_d; pos = (const int32_t *) src1_d;
if (src2 != nullptr) { if (src2 != nullptr) {
// TODO: move these asserts to ggml.c
GGML_ASSERT(src2->type == GGML_TYPE_F32);
GGML_ASSERT(src2->ne[0] >= n_dims / 2);
freq_factors = (const float *) src2->data; freq_factors = (const float *) src2->data;
} }
} else { } else {

84
ggml.c
View file

@ -6231,6 +6231,7 @@ static struct ggml_tensor * ggml_rope_impl(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx, int n_ctx,
@ -6248,6 +6249,11 @@ static struct ggml_tensor * ggml_rope_impl(
GGML_ASSERT(b->type == GGML_TYPE_I32); GGML_ASSERT(b->type == GGML_TYPE_I32);
GGML_ASSERT(a->ne[2] == b->ne[0]); GGML_ASSERT(a->ne[2] == b->ne[0]);
if (c) {
GGML_ASSERT(c->type == GGML_TYPE_F32);
GGML_ASSERT(c->ne[0] >= n_dims / 2);
}
bool is_node = false; bool is_node = false;
if (a->grad) { if (a->grad) {
@ -6271,17 +6277,11 @@ static struct ggml_tensor * ggml_rope_impl(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a; result->src[0] = a;
result->src[1] = b; result->src[1] = b;
result->src[2] = c;
return result; return result;
} }
struct ggml_tensor * ggml_rope_with_freq_factors(
struct ggml_tensor* rope_tensor,
struct ggml_tensor* freq_factors) {
rope_tensor->src[2] = freq_factors;
return rope_tensor;
}
struct ggml_tensor * ggml_rope( struct ggml_tensor * ggml_rope(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -6290,7 +6290,7 @@ struct ggml_tensor * ggml_rope(
int mode, int mode,
int n_ctx) { int n_ctx) {
return ggml_rope_impl( return ggml_rope_impl(
ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false
); );
} }
@ -6302,7 +6302,49 @@ struct ggml_tensor * ggml_rope_inplace(
int mode, int mode,
int n_ctx) { int n_ctx) {
return ggml_rope_impl( return ggml_rope_impl(
ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true
);
}
struct ggml_tensor * ggml_rope_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int mode,
int n_ctx,
int n_orig_ctx,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow) {
return ggml_rope_impl(
ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
);
}
struct ggml_tensor * ggml_rope_ext_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int mode,
int n_ctx,
int n_orig_ctx,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow) {
return ggml_rope_impl(
ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
); );
} }
@ -6321,7 +6363,7 @@ struct ggml_tensor * ggml_rope_custom(
float beta_fast, float beta_fast,
float beta_slow) { float beta_slow) {
return ggml_rope_impl( return ggml_rope_impl(
ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
); );
} }
@ -6341,27 +6383,18 @@ struct ggml_tensor * ggml_rope_custom_inplace(
float beta_fast, float beta_fast,
float beta_slow) { float beta_slow) {
return ggml_rope_impl( return ggml_rope_impl(
ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
); );
} }
struct ggml_tensor * ggml_rope_xpos_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int n_dims,
float base,
bool down) {
return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
}
// ggml_rope_back // ggml_rope_back
struct ggml_tensor * ggml_rope_back( struct ggml_tensor * ggml_rope_back(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx, int n_ctx,
@ -6377,6 +6410,7 @@ struct ggml_tensor * ggml_rope_back(
GGML_ASSERT(ggml_is_vector(b)); GGML_ASSERT(ggml_is_vector(b));
GGML_ASSERT(b->type == GGML_TYPE_I32); GGML_ASSERT(b->type == GGML_TYPE_I32);
GGML_ASSERT(a->ne[2] == b->ne[0]); GGML_ASSERT(a->ne[2] == b->ne[0]);
GGML_ASSERT(c == NULL && "freq factors not implemented yet");
GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet"); GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
@ -18407,6 +18441,7 @@ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct gg
static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) { static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
struct ggml_tensor * src0 = tensor->src[0]; struct ggml_tensor * src0 = tensor->src[0];
struct ggml_tensor * src1 = tensor->src[1]; struct ggml_tensor * src1 = tensor->src[1];
struct ggml_tensor * src2 = tensor->src[2];
switch (tensor->op) { switch (tensor->op) {
case GGML_OP_DUP: case GGML_OP_DUP:
@ -18935,10 +18970,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
src0->grad = ggml_add_or_set(ctx, src0->grad = ggml_add_or_set(ctx,
src0->grad, src0->grad,
ggml_rope_with_freq_factors(
ggml_rope_back(ctx, ggml_rope_back(ctx,
tensor->grad, tensor->grad,
src1, src1,
src2,
n_dims, n_dims,
mode, mode,
n_ctx, n_ctx,
@ -18951,7 +18986,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
beta_slow, beta_slow,
xpos_base, xpos_base,
xpos_down), xpos_down),
tensor->src[2]),
zero_table); zero_table);
} }
} break; } break;
@ -18976,10 +19010,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
src0->grad = ggml_add_or_set(ctx, src0->grad = ggml_add_or_set(ctx,
src0->grad, src0->grad,
ggml_rope_with_freq_factors(
ggml_rope_impl(ctx, ggml_rope_impl(ctx,
tensor->grad, tensor->grad,
src1, src1,
src2,
n_dims, n_dims,
mode, mode,
n_ctx, n_ctx,
@ -18993,7 +19027,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
xpos_base, xpos_base,
xpos_down, xpos_down,
false), false),
tensor->src[2]),
zero_table); zero_table);
} }
} break; } break;
@ -19062,7 +19095,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
masked); masked);
} }
struct ggml_tensor * src2 = tensor->src[2];
const int64_t elem_q = ggml_nelements(src0); const int64_t elem_q = ggml_nelements(src0);
const int64_t elem_k = ggml_nelements(src1); const int64_t elem_k = ggml_nelements(src1);
const int64_t elem_v = ggml_nelements(src2); const int64_t elem_v = ggml_nelements(src2);

49
ggml.h
View file

@ -1465,6 +1465,7 @@ extern "C" {
// if mode & 4 == 1, ChatGLM style // if mode & 4 == 1, ChatGLM style
// //
// b is an int32 vector with size a->ne[2], it contains the positions // b is an int32 vector with size a->ne[2], it contains the positions
// c is freq factors (e.g. phi3-128k), (optional)
GGML_API struct ggml_tensor * ggml_rope( GGML_API struct ggml_tensor * ggml_rope(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -1483,10 +1484,11 @@ extern "C" {
int n_ctx); int n_ctx);
// custom RoPE // custom RoPE
GGML_API struct ggml_tensor * ggml_rope_custom( GGML_API struct ggml_tensor * ggml_rope_ext(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx, int n_ctx,
@ -1499,7 +1501,23 @@ extern "C" {
float beta_slow); float beta_slow);
// in-place, returns view(a) // in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_rope_custom_inplace( GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int mode,
int n_ctx,
int n_orig_ctx,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow);
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
@ -1512,20 +1530,28 @@ extern "C" {
float ext_factor, float ext_factor,
float attn_factor, float attn_factor,
float beta_fast, float beta_fast,
float beta_slow); float beta_slow),
"use ggml_rope_ext instead");
// compute correction dims for YaRN RoPE scaling GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
GGML_CALL void ggml_rope_yarn_corr_dims(
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
// xPos RoPE, in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
int n_dims, int n_dims,
float base, int mode,
bool down); int n_ctx,
int n_orig_ctx,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow),
"use ggml_rope_ext_inplace instead");
// compute correction dims for YaRN RoPE scaling
GGML_CALL void ggml_rope_yarn_corr_dims(
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
// rotary position embedding backward, i.e compute dx from dy // rotary position embedding backward, i.e compute dx from dy
// a - dy // a - dy
@ -1533,6 +1559,7 @@ extern "C" {
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx, int n_ctx,

184
llama.cpp
View file

@ -6883,17 +6883,15 @@ struct llm_build_context {
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * tmp = struct ggml_tensor * tmp =
// we rotate only the first n_rot dimensions // we rotate only the first n_rot dimensions
ggml_rope_custom_inplace(ctx0, ggml_rope_ext_inplace(ctx0,
ggml_view_3d(ctx0, kv_self.k_l[il], ggml_view_3d(ctx0, kv_self.k_l[il],
n_embd_head_k, n_head_kv, n_ctx, n_embd_head_k, n_head_kv, n_ctx,
ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
0), 0),
lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, lctx.inp_K_shift, lctx.freq_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
tmp = ggml_rope_with_freq_factors(tmp, lctx.freq_factors);
cb(tmp, "K_shifted", il); cb(tmp, "K_shifted", il);
ggml_build_forward_expand(gf, tmp); ggml_build_forward_expand(gf, tmp);
} }
@ -7117,15 +7115,15 @@ struct llm_build_context {
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
} }
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
@ -7247,13 +7245,13 @@ struct llm_build_context {
switch (model.type) { switch (model.type) {
case MODEL_7B: case MODEL_7B:
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
@ -7359,15 +7357,15 @@ struct llm_build_context {
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
@ -7480,14 +7478,14 @@ struct llm_build_context {
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
// using mode = 2 for neox mode // using mode = 2 for neox mode
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
@ -7603,15 +7601,15 @@ struct llm_build_context {
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
} }
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
@ -7755,15 +7753,15 @@ struct llm_build_context {
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
@ -8108,15 +8106,15 @@ struct llm_build_context {
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
@ -8548,15 +8546,15 @@ struct llm_build_context {
} }
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
@ -8668,14 +8666,14 @@ struct llm_build_context {
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
// using mode = 2 for neox mode // using mode = 2 for neox mode
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
@ -8779,15 +8777,15 @@ struct llm_build_context {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
@ -8893,15 +8891,15 @@ struct llm_build_context {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
@ -9045,8 +9043,8 @@ struct llm_build_context {
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
@ -9056,8 +9054,8 @@ struct llm_build_context {
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head))); Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
@ -9167,21 +9165,19 @@ struct llm_build_context {
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, ctx0, Qcur, inp_pos, freq_factors, n_rot, rope_type, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
); );
Qcur = ggml_rope_with_freq_factors(Qcur, freq_factors);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, ctx0, Kcur, inp_pos, freq_factors, n_rot, rope_type, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
); );
Kcur = ggml_rope_with_freq_factors(Kcur, freq_factors);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
@ -9285,14 +9281,14 @@ struct llm_build_context {
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
@ -9493,15 +9489,15 @@ struct llm_build_context {
cb(tmpk, "tmpk", il); cb(tmpk, "tmpk", il);
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
struct ggml_tensor * Qcur = ggml_rope_custom( struct ggml_tensor * Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
struct ggml_tensor * Kcur = ggml_rope_custom( struct ggml_tensor * Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
@ -9609,15 +9605,15 @@ struct llm_build_context {
// cb(Vcur, "Vcur", il); // cb(Vcur, "Vcur", il);
// } // }
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
@ -9726,15 +9722,15 @@ struct llm_build_context {
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
} }
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
@ -9856,15 +9852,15 @@ struct llm_build_context {
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
} }
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
@ -9976,8 +9972,8 @@ struct llm_build_context {
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
@ -9985,8 +9981,8 @@ struct llm_build_context {
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
cb(Qcur, "Qcur_scaled", il); cb(Qcur, "Qcur_scaled", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
@ -10096,15 +10092,15 @@ struct llm_build_context {
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
} }
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
@ -10386,15 +10382,15 @@ struct llm_build_context {
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
} }
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
@ -10517,15 +10513,15 @@ struct llm_build_context {
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
} }
Qcur = ggml_rope_custom( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow ext_factor, attn_factor, beta_fast, beta_slow
); );