llama : refactor k-shift implementation
ggml-ci
This commit is contained in:
parent
15499eb942
commit
1a999819a2
3 changed files with 173 additions and 159 deletions
|
@ -126,7 +126,7 @@ int main(int argc, char ** argv) {
|
||||||
const int n_batch = ctx_params.n_batch;
|
const int n_batch = ctx_params.n_batch;
|
||||||
const int n_batch_grp = ctx_params.n_batch/n_grp;
|
const int n_batch_grp = ctx_params.n_batch/n_grp;
|
||||||
|
|
||||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch);
|
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
|
||||||
|
|
||||||
// print the prompt token-by-token
|
// print the prompt token-by-token
|
||||||
|
|
||||||
|
|
322
llama.cpp
322
llama.cpp
|
@ -1552,6 +1552,7 @@ static const size_t GiB = 1024*MiB;
|
||||||
struct llama_hparams {
|
struct llama_hparams {
|
||||||
bool vocab_only;
|
bool vocab_only;
|
||||||
bool rope_finetuned;
|
bool rope_finetuned;
|
||||||
|
|
||||||
uint32_t n_vocab;
|
uint32_t n_vocab;
|
||||||
uint32_t n_ctx_train; // context size the model was trained on
|
uint32_t n_ctx_train; // context size the model was trained on
|
||||||
uint32_t n_embd;
|
uint32_t n_embd;
|
||||||
|
@ -4595,10 +4596,11 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||||
|
|
||||||
using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
|
using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
|
||||||
|
|
||||||
enum llm_rope_type {
|
enum llm_rope_type : int {
|
||||||
LLM_ROPE,
|
LLM_ROPE_NONE = -1,
|
||||||
LLM_ROPE_NEOX,
|
LLM_ROPE = 0,
|
||||||
LLM_ROPE_GLM,
|
LLM_ROPE_NEOX = 2,
|
||||||
|
LLM_ROPE_GLM = 4,
|
||||||
};
|
};
|
||||||
|
|
||||||
enum llm_ffn_op_type {
|
enum llm_ffn_op_type {
|
||||||
|
@ -4655,7 +4657,7 @@ static void llm_build_k_shift(
|
||||||
const llama_kv_cache & kv,
|
const llama_kv_cache & kv,
|
||||||
struct ggml_cgraph * graph,
|
struct ggml_cgraph * graph,
|
||||||
struct ggml_tensor * K_shift,
|
struct ggml_tensor * K_shift,
|
||||||
llm_rope_type type,
|
llm_rope_type rope_type,
|
||||||
int64_t n_ctx,
|
int64_t n_ctx,
|
||||||
float freq_base,
|
float freq_base,
|
||||||
float freq_scale,
|
float freq_scale,
|
||||||
|
@ -4671,14 +4673,6 @@ static void llm_build_k_shift(
|
||||||
const float beta_fast = cparams.yarn_beta_fast;
|
const float beta_fast = cparams.yarn_beta_fast;
|
||||||
const float beta_slow = cparams.yarn_beta_slow;
|
const float beta_slow = cparams.yarn_beta_slow;
|
||||||
|
|
||||||
int rope_type = 0;
|
|
||||||
|
|
||||||
switch (type) {
|
|
||||||
case LLM_ROPE: rope_type = 0; break;
|
|
||||||
case LLM_ROPE_NEOX: rope_type = 2; break;
|
|
||||||
case LLM_ROPE_GLM: rope_type = 4; break;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
struct ggml_tensor * tmp =
|
struct ggml_tensor * tmp =
|
||||||
// we rotate only the first n_rot dimensions
|
// we rotate only the first n_rot dimensions
|
||||||
|
@ -4988,6 +4982,38 @@ static struct ggml_tensor * llm_build_kv(
|
||||||
return cur;
|
return cur;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static llm_rope_type llm_get_rope_type(llm_arch arch) {
|
||||||
|
switch (arch) {
|
||||||
|
case LLM_ARCH_LLAMA: return LLM_ROPE;
|
||||||
|
case LLM_ARCH_FALCON: return LLM_ROPE_NEOX;
|
||||||
|
case LLM_ARCH_BAICHUAN: return LLM_ROPE;
|
||||||
|
case LLM_ARCH_GPT2: return LLM_ROPE_NONE;
|
||||||
|
case LLM_ARCH_GPTJ: return LLM_ROPE_NONE;
|
||||||
|
case LLM_ARCH_GPTNEOX: return LLM_ROPE_NONE;
|
||||||
|
case LLM_ARCH_MPT: return LLM_ROPE_NONE;
|
||||||
|
case LLM_ARCH_STARCODER: return LLM_ROPE;
|
||||||
|
case LLM_ARCH_PERSIMMON: return LLM_ROPE_NEOX;
|
||||||
|
case LLM_ARCH_REFACT: return LLM_ROPE_NONE;
|
||||||
|
case LLM_ARCH_BERT: return LLM_ROPE_NEOX;
|
||||||
|
case LLM_ARCH_NOMIC_BERT: return LLM_ROPE_NEOX;
|
||||||
|
case LLM_ARCH_BLOOM: return LLM_ROPE_NONE;
|
||||||
|
case LLM_ARCH_STABLELM: return LLM_ROPE_NEOX;
|
||||||
|
case LLM_ARCH_QWEN: return LLM_ROPE_NEOX;
|
||||||
|
case LLM_ARCH_QWEN2: return LLM_ROPE_NEOX;
|
||||||
|
case LLM_ARCH_PHI2: return LLM_ROPE_NEOX;
|
||||||
|
case LLM_ARCH_PLAMO: return LLM_ROPE;
|
||||||
|
case LLM_ARCH_CODESHELL: return LLM_ROPE;
|
||||||
|
case LLM_ARCH_ORION: return LLM_ROPE;
|
||||||
|
case LLM_ARCH_INTERNLM2: return LLM_ROPE;
|
||||||
|
case LLM_ARCH_MINICPM: return LLM_ROPE;
|
||||||
|
case LLM_ARCH_GEMMA: return LLM_ROPE;
|
||||||
|
case LLM_ARCH_UNKNOWN:
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false && "unknown architecture");
|
||||||
|
return LLM_ROPE_NONE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct llm_build_context {
|
struct llm_build_context {
|
||||||
const llama_model & model;
|
const llama_model & model;
|
||||||
const llama_context & lctx;
|
const llama_context & lctx;
|
||||||
|
@ -5022,9 +5048,10 @@ struct llm_build_context {
|
||||||
const int32_t kv_head; // index of where we store new KV data in the cache
|
const int32_t kv_head; // index of where we store new KV data in the cache
|
||||||
const int32_t n_orig_ctx;
|
const int32_t n_orig_ctx;
|
||||||
|
|
||||||
const bool do_rope_shift;
|
|
||||||
const uint32_t pooling_type;
|
const uint32_t pooling_type;
|
||||||
|
|
||||||
|
const llm_rope_type rope_type;
|
||||||
|
|
||||||
const llm_build_cb & cb;
|
const llm_build_cb & cb;
|
||||||
|
|
||||||
std::vector<uint8_t> & buf_compute_meta;
|
std::vector<uint8_t> & buf_compute_meta;
|
||||||
|
@ -5066,8 +5093,8 @@ struct llm_build_context {
|
||||||
n_kv (worst_case ? n_ctx : kv_self.n),
|
n_kv (worst_case ? n_ctx : kv_self.n),
|
||||||
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
||||||
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
||||||
do_rope_shift (worst_case || kv_self.has_shift),
|
|
||||||
pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
|
pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
|
||||||
|
rope_type (llm_get_rope_type(model.arch)),
|
||||||
cb (cb),
|
cb (cb),
|
||||||
buf_compute_meta (lctx.buf_compute_meta) {
|
buf_compute_meta (lctx.buf_compute_meta) {
|
||||||
// all initializations should be done in init()
|
// all initializations should be done in init()
|
||||||
|
@ -5090,6 +5117,14 @@ struct llm_build_context {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph * build_k_shift() {
|
||||||
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
|
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, rope_type, n_ctx, freq_base, freq_scale, cb);
|
||||||
|
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_llama() {
|
struct ggml_cgraph * build_llama() {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
|
@ -5111,11 +5146,6 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
|
||||||
if (do_rope_shift) {
|
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
struct ggml_tensor * inpSA = inpL;
|
struct ggml_tensor * inpSA = inpL;
|
||||||
|
|
||||||
|
@ -5151,14 +5181,14 @@ struct llm_build_context {
|
||||||
|
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
@ -5299,11 +5329,6 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
|
struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
|
||||||
cb(KQ_pos, "KQ_pos", -1);
|
cb(KQ_pos, "KQ_pos", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
|
||||||
if (do_rope_shift) {
|
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
struct ggml_tensor * inpSA = inpL;
|
struct ggml_tensor * inpSA = inpL;
|
||||||
|
|
||||||
|
@ -5327,12 +5352,12 @@ struct llm_build_context {
|
||||||
case MODEL_7B:
|
case MODEL_7B:
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
break;
|
break;
|
||||||
|
@ -5417,11 +5442,6 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
|
||||||
if (do_rope_shift) {
|
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
struct ggml_tensor * attn_norm;
|
struct ggml_tensor * attn_norm;
|
||||||
|
|
||||||
|
@ -5460,13 +5480,13 @@ struct llm_build_context {
|
||||||
|
|
||||||
// using mode = 2 for neox mode
|
// using mode = 2 for neox mode
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx,
|
||||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx,
|
||||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
@ -5636,10 +5656,6 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
if (do_rope_shift) {
|
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
struct ggml_tensor * residual = inpL;
|
struct ggml_tensor * residual = inpL;
|
||||||
|
|
||||||
|
@ -5730,13 +5746,13 @@ struct llm_build_context {
|
||||||
cb(kpass, "kpass", il);
|
cb(kpass, "kpass", il);
|
||||||
|
|
||||||
struct ggml_tensor * qrotated = ggml_rope_custom(
|
struct ggml_tensor * qrotated = ggml_rope_custom(
|
||||||
ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
ctx0, qrot, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx,
|
||||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(qrotated, "qrotated", il);
|
cb(qrotated, "qrotated", il);
|
||||||
|
|
||||||
struct ggml_tensor * krotated = ggml_rope_custom(
|
struct ggml_tensor * krotated = ggml_rope_custom(
|
||||||
ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
ctx0, krot, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx,
|
||||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(krotated, "krotated", il);
|
cb(krotated, "krotated", il);
|
||||||
|
@ -5988,14 +6004,14 @@ struct llm_build_context {
|
||||||
|
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
@ -6284,11 +6300,6 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
|
||||||
if (do_rope_shift) {
|
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
struct ggml_tensor * inpSA = inpL;
|
struct ggml_tensor * inpSA = inpL;
|
||||||
|
|
||||||
|
@ -6325,14 +6336,14 @@ struct llm_build_context {
|
||||||
|
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
@ -6407,11 +6418,6 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
|
||||||
if (do_rope_shift) {
|
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
struct ggml_tensor * inpSA = inpL;
|
struct ggml_tensor * inpSA = inpL;
|
||||||
|
|
||||||
|
@ -6441,13 +6447,13 @@ struct llm_build_context {
|
||||||
|
|
||||||
// using mode = 2 for neox mode
|
// using mode = 2 for neox mode
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx,
|
||||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx,
|
||||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
@ -6521,11 +6527,6 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
|
||||||
if (do_rope_shift) {
|
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
struct ggml_tensor * inpSA = inpL;
|
struct ggml_tensor * inpSA = inpL;
|
||||||
|
|
||||||
|
@ -6561,14 +6562,14 @@ struct llm_build_context {
|
||||||
|
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
@ -6642,11 +6643,6 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
|
||||||
if (do_rope_shift) {
|
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
||||||
model.layers[il].attn_norm,
|
model.layers[il].attn_norm,
|
||||||
|
@ -6684,7 +6680,7 @@ struct llm_build_context {
|
||||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||||
|
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx,
|
||||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
@ -6695,7 +6691,7 @@ struct llm_build_context {
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx,
|
||||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
@ -6764,11 +6760,6 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
|
||||||
if (do_rope_shift) {
|
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
|
||||||
// norm
|
// norm
|
||||||
|
@ -6793,13 +6784,13 @@ struct llm_build_context {
|
||||||
|
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos,
|
||||||
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos,
|
||||||
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
|
@ -6969,11 +6960,6 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
|
||||||
if (do_rope_shift) {
|
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||||
model.layers[il].attn_norm,
|
model.layers[il].attn_norm,
|
||||||
|
@ -6999,14 +6985,14 @@ struct llm_build_context {
|
||||||
|
|
||||||
struct ggml_tensor * Qcur = ggml_rope_custom(
|
struct ggml_tensor * Qcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
struct ggml_tensor * Kcur = ggml_rope_custom(
|
struct ggml_tensor * Kcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
@ -7077,11 +7063,6 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
|
||||||
if (do_rope_shift) {
|
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
struct ggml_tensor * inpSA = inpL;
|
struct ggml_tensor * inpSA = inpL;
|
||||||
|
|
||||||
|
@ -7117,14 +7098,14 @@ struct llm_build_context {
|
||||||
|
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
@ -7196,11 +7177,6 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
|
||||||
if (do_rope_shift) {
|
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
struct ggml_tensor * inpSA = inpL;
|
struct ggml_tensor * inpSA = inpL;
|
||||||
|
|
||||||
|
@ -7236,14 +7212,14 @@ struct llm_build_context {
|
||||||
|
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
@ -7328,11 +7304,6 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
|
||||||
if (do_rope_shift) {
|
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
struct ggml_tensor * inpSA = inpL;
|
struct ggml_tensor * inpSA = inpL;
|
||||||
|
|
||||||
|
@ -7368,14 +7339,14 @@ struct llm_build_context {
|
||||||
|
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
);
|
);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
@ -7464,11 +7435,6 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||||
cb(KQ_mask, "KQ_mask", -1);
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
// shift the entire K-cache if needed
|
|
||||||
if (do_rope_shift) {
|
|
||||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
|
||||||
// norm
|
// norm
|
||||||
|
@ -7491,7 +7457,7 @@ struct llm_build_context {
|
||||||
|
|
||||||
Qcur = ggml_rope_custom(
|
Qcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
|
||||||
n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
|
@ -7500,7 +7466,7 @@ struct llm_build_context {
|
||||||
|
|
||||||
Kcur = ggml_rope_custom(
|
Kcur = ggml_rope_custom(
|
||||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
|
||||||
n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
|
@ -7553,6 +7519,22 @@ struct llm_build_context {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
|
||||||
|
llama_batch dummy;
|
||||||
|
|
||||||
|
llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
|
||||||
|
|
||||||
|
struct llm_build_context llm(lctx, dummy, cb, false);
|
||||||
|
|
||||||
|
llm.init();
|
||||||
|
|
||||||
|
struct ggml_cgraph * result = llm.build_k_shift();
|
||||||
|
|
||||||
|
llm.free();
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
static struct ggml_cgraph * llama_build_graph(
|
static struct ggml_cgraph * llama_build_graph(
|
||||||
llama_context & lctx,
|
llama_context & lctx,
|
||||||
const llama_batch & batch,
|
const llama_batch & batch,
|
||||||
|
@ -7672,6 +7654,20 @@ static struct ggml_cgraph * llama_build_graph(
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void llama_set_k_shift(llama_context & lctx) {
|
||||||
|
const auto & cparams = lctx.cparams;
|
||||||
|
|
||||||
|
const int64_t n_ctx = cparams.n_ctx;
|
||||||
|
|
||||||
|
assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
||||||
|
|
||||||
|
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
||||||
|
|
||||||
|
for (int i = 0; i < n_ctx; ++i) {
|
||||||
|
data[i] = lctx.kv_self.cells[i].delta;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
||||||
//
|
//
|
||||||
// set input data
|
// set input data
|
||||||
|
@ -7739,18 +7735,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (kv_self.has_shift) {
|
|
||||||
const int64_t n_ctx = cparams.n_ctx;
|
|
||||||
|
|
||||||
assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
|
||||||
|
|
||||||
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
|
||||||
|
|
||||||
for (int i = 0; i < n_ctx; ++i) {
|
|
||||||
data[i] = lctx.kv_self.cells[i].delta;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
|
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
|
||||||
const int64_t n_tokens = batch.n_tokens;
|
const int64_t n_tokens = batch.n_tokens;
|
||||||
|
|
||||||
|
@ -7795,6 +7779,34 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void llama_graph_compute(
|
||||||
|
llama_context & lctx,
|
||||||
|
ggml_cgraph * gf,
|
||||||
|
int n_threads) {
|
||||||
|
#ifdef GGML_USE_MPI
|
||||||
|
const int64_t n_layer = hparams.n_layer;
|
||||||
|
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
||||||
|
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (lctx.backend_cpu != nullptr) {
|
||||||
|
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_sched_graph_compute(lctx.sched, gf);
|
||||||
|
|
||||||
|
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
||||||
|
|
||||||
|
#ifdef GGML_USE_MPI
|
||||||
|
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
// decode a batch of tokens by evaluating the transformer
|
// decode a batch of tokens by evaluating the transformer
|
||||||
//
|
//
|
||||||
// - lctx: llama context
|
// - lctx: llama context
|
||||||
|
@ -7890,6 +7902,10 @@ static int llama_decode_internal(
|
||||||
|
|
||||||
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
|
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
|
||||||
|
|
||||||
|
if (kv_self.has_shift) {
|
||||||
|
llama_kv_cache_apply_k_shift(&lctx);
|
||||||
|
}
|
||||||
|
|
||||||
ggml_backend_sched_reset(lctx.sched);
|
ggml_backend_sched_reset(lctx.sched);
|
||||||
ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
|
ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
|
||||||
|
|
||||||
|
@ -7898,6 +7914,7 @@ static int llama_decode_internal(
|
||||||
// the output is always the last tensor in the graph
|
// the output is always the last tensor in the graph
|
||||||
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
||||||
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
||||||
|
|
||||||
if (strcmp(res->name, "result_output") == 0) {
|
if (strcmp(res->name, "result_output") == 0) {
|
||||||
// the embeddings could be the second to last tensor, or the third to last tensor
|
// the embeddings could be the second to last tensor, or the third to last tensor
|
||||||
if (strcmp(embeddings->name, "result_norm") != 0) {
|
if (strcmp(embeddings->name, "result_norm") != 0) {
|
||||||
|
@ -7924,40 +7941,12 @@ static int llama_decode_internal(
|
||||||
n_threads = std::min(4, n_threads);
|
n_threads = std::min(4, n_threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_USE_MPI
|
|
||||||
const int64_t n_layer = hparams.n_layer;
|
|
||||||
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
|
||||||
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
|
||||||
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (lctx.backend_cpu != nullptr) {
|
|
||||||
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_set_inputs(lctx, batch);
|
llama_set_inputs(lctx, batch);
|
||||||
|
|
||||||
ggml_backend_sched_graph_compute(lctx.sched, gf);
|
llama_graph_compute(lctx, gf, n_threads);
|
||||||
|
|
||||||
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
|
||||||
|
|
||||||
#ifdef GGML_USE_MPI
|
|
||||||
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// update the kv ring buffer
|
// update the kv ring buffer
|
||||||
{
|
{
|
||||||
if (kv_self.has_shift) {
|
|
||||||
kv_self.has_shift = false;
|
|
||||||
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
|
||||||
kv_self.cells[i].delta = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
kv_self.head += n_tokens;
|
kv_self.head += n_tokens;
|
||||||
|
|
||||||
// Ensure kv cache head points to a valid index.
|
// Ensure kv cache head points to a valid index.
|
||||||
|
@ -8053,6 +8042,28 @@ static int llama_decode_internal(
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void llama_kv_cache_apply_k_shift(struct llama_context * ctx) {
|
||||||
|
struct llama_context & lctx = *ctx;
|
||||||
|
|
||||||
|
llama_set_k_shift(lctx);
|
||||||
|
|
||||||
|
{
|
||||||
|
ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
|
||||||
|
|
||||||
|
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
auto & kv_self = ctx->kv_self;
|
||||||
|
|
||||||
|
kv_self.has_shift = false;
|
||||||
|
|
||||||
|
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
||||||
|
kv_self.cells[i].delta = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// tokenizer
|
// tokenizer
|
||||||
//
|
//
|
||||||
|
@ -12054,6 +12065,7 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla
|
||||||
llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
|
llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Returns the *maximum* size of the state
|
// Returns the *maximum* size of the state
|
||||||
size_t llama_get_state_size(const struct llama_context * ctx) {
|
size_t llama_get_state_size(const struct llama_context * ctx) {
|
||||||
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
|
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
|
||||||
|
|
2
llama.h
2
llama.h
|
@ -533,6 +533,8 @@ extern "C" {
|
||||||
llama_pos p1,
|
llama_pos p1,
|
||||||
int d);
|
int d);
|
||||||
|
|
||||||
|
LLAMA_API void llama_kv_cache_apply_k_shift(struct llama_context * ctx);
|
||||||
|
|
||||||
//
|
//
|
||||||
// State / sessions
|
// State / sessions
|
||||||
//
|
//
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue