llama : refactor k-shift implementation
ggml-ci
This commit is contained in:
parent
15499eb942
commit
1a999819a2
3 changed files with 173 additions and 159 deletions
|
@ -126,7 +126,7 @@ int main(int argc, char ** argv) {
|
|||
const int n_batch = ctx_params.n_batch;
|
||||
const int n_batch_grp = ctx_params.n_batch/n_grp;
|
||||
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch);
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
|
||||
|
||||
// print the prompt token-by-token
|
||||
|
||||
|
|
328
llama.cpp
328
llama.cpp
|
@ -1550,8 +1550,9 @@ static const size_t MiB = 1024*kiB;
|
|||
static const size_t GiB = 1024*MiB;
|
||||
|
||||
struct llama_hparams {
|
||||
bool vocab_only;
|
||||
bool rope_finetuned;
|
||||
bool vocab_only;
|
||||
bool rope_finetuned;
|
||||
|
||||
uint32_t n_vocab;
|
||||
uint32_t n_ctx_train; // context size the model was trained on
|
||||
uint32_t n_embd;
|
||||
|
@ -4595,10 +4596,11 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
|||
|
||||
using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
|
||||
|
||||
enum llm_rope_type {
|
||||
LLM_ROPE,
|
||||
LLM_ROPE_NEOX,
|
||||
LLM_ROPE_GLM,
|
||||
enum llm_rope_type : int {
|
||||
LLM_ROPE_NONE = -1,
|
||||
LLM_ROPE = 0,
|
||||
LLM_ROPE_NEOX = 2,
|
||||
LLM_ROPE_GLM = 4,
|
||||
};
|
||||
|
||||
enum llm_ffn_op_type {
|
||||
|
@ -4655,7 +4657,7 @@ static void llm_build_k_shift(
|
|||
const llama_kv_cache & kv,
|
||||
struct ggml_cgraph * graph,
|
||||
struct ggml_tensor * K_shift,
|
||||
llm_rope_type type,
|
||||
llm_rope_type rope_type,
|
||||
int64_t n_ctx,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
|
@ -4671,14 +4673,6 @@ static void llm_build_k_shift(
|
|||
const float beta_fast = cparams.yarn_beta_fast;
|
||||
const float beta_slow = cparams.yarn_beta_slow;
|
||||
|
||||
int rope_type = 0;
|
||||
|
||||
switch (type) {
|
||||
case LLM_ROPE: rope_type = 0; break;
|
||||
case LLM_ROPE_NEOX: rope_type = 2; break;
|
||||
case LLM_ROPE_GLM: rope_type = 4; break;
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * tmp =
|
||||
// we rotate only the first n_rot dimensions
|
||||
|
@ -4988,6 +4982,38 @@ static struct ggml_tensor * llm_build_kv(
|
|||
return cur;
|
||||
}
|
||||
|
||||
static llm_rope_type llm_get_rope_type(llm_arch arch) {
|
||||
switch (arch) {
|
||||
case LLM_ARCH_LLAMA: return LLM_ROPE;
|
||||
case LLM_ARCH_FALCON: return LLM_ROPE_NEOX;
|
||||
case LLM_ARCH_BAICHUAN: return LLM_ROPE;
|
||||
case LLM_ARCH_GPT2: return LLM_ROPE_NONE;
|
||||
case LLM_ARCH_GPTJ: return LLM_ROPE_NONE;
|
||||
case LLM_ARCH_GPTNEOX: return LLM_ROPE_NONE;
|
||||
case LLM_ARCH_MPT: return LLM_ROPE_NONE;
|
||||
case LLM_ARCH_STARCODER: return LLM_ROPE;
|
||||
case LLM_ARCH_PERSIMMON: return LLM_ROPE_NEOX;
|
||||
case LLM_ARCH_REFACT: return LLM_ROPE_NONE;
|
||||
case LLM_ARCH_BERT: return LLM_ROPE_NEOX;
|
||||
case LLM_ARCH_NOMIC_BERT: return LLM_ROPE_NEOX;
|
||||
case LLM_ARCH_BLOOM: return LLM_ROPE_NONE;
|
||||
case LLM_ARCH_STABLELM: return LLM_ROPE_NEOX;
|
||||
case LLM_ARCH_QWEN: return LLM_ROPE_NEOX;
|
||||
case LLM_ARCH_QWEN2: return LLM_ROPE_NEOX;
|
||||
case LLM_ARCH_PHI2: return LLM_ROPE_NEOX;
|
||||
case LLM_ARCH_PLAMO: return LLM_ROPE;
|
||||
case LLM_ARCH_CODESHELL: return LLM_ROPE;
|
||||
case LLM_ARCH_ORION: return LLM_ROPE;
|
||||
case LLM_ARCH_INTERNLM2: return LLM_ROPE;
|
||||
case LLM_ARCH_MINICPM: return LLM_ROPE;
|
||||
case LLM_ARCH_GEMMA: return LLM_ROPE;
|
||||
case LLM_ARCH_UNKNOWN:
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown architecture");
|
||||
return LLM_ROPE_NONE;
|
||||
}
|
||||
}
|
||||
|
||||
struct llm_build_context {
|
||||
const llama_model & model;
|
||||
const llama_context & lctx;
|
||||
|
@ -5022,9 +5048,10 @@ struct llm_build_context {
|
|||
const int32_t kv_head; // index of where we store new KV data in the cache
|
||||
const int32_t n_orig_ctx;
|
||||
|
||||
const bool do_rope_shift;
|
||||
const uint32_t pooling_type;
|
||||
|
||||
const llm_rope_type rope_type;
|
||||
|
||||
const llm_build_cb & cb;
|
||||
|
||||
std::vector<uint8_t> & buf_compute_meta;
|
||||
|
@ -5066,8 +5093,8 @@ struct llm_build_context {
|
|||
n_kv (worst_case ? n_ctx : kv_self.n),
|
||||
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
||||
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
||||
do_rope_shift (worst_case || kv_self.has_shift),
|
||||
pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
|
||||
rope_type (llm_get_rope_type(model.arch)),
|
||||
cb (cb),
|
||||
buf_compute_meta (lctx.buf_compute_meta) {
|
||||
// all initializations should be done in init()
|
||||
|
@ -5090,6 +5117,14 @@ struct llm_build_context {
|
|||
}
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_k_shift() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, rope_type, n_ctx, freq_base, freq_scale, cb);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_llama() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
|
@ -5111,11 +5146,6 @@ struct llm_build_context {
|
|||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
|
@ -5151,14 +5181,14 @@ struct llm_build_context {
|
|||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
@ -5299,11 +5329,6 @@ struct llm_build_context {
|
|||
struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
|
||||
cb(KQ_pos, "KQ_pos", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
|
@ -5327,12 +5352,12 @@ struct llm_build_context {
|
|||
case MODEL_7B:
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
break;
|
||||
|
@ -5417,11 +5442,6 @@ struct llm_build_context {
|
|||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * attn_norm;
|
||||
|
||||
|
@ -5460,13 +5480,13 @@ struct llm_build_context {
|
|||
|
||||
// using mode = 2 for neox mode
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||
ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||
ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
@ -5636,10 +5656,6 @@ struct llm_build_context {
|
|||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * residual = inpL;
|
||||
|
||||
|
@ -5730,13 +5746,13 @@ struct llm_build_context {
|
|||
cb(kpass, "kpass", il);
|
||||
|
||||
struct ggml_tensor * qrotated = ggml_rope_custom(
|
||||
ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||
ctx0, qrot, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(qrotated, "qrotated", il);
|
||||
|
||||
struct ggml_tensor * krotated = ggml_rope_custom(
|
||||
ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||
ctx0, krot, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(krotated, "krotated", il);
|
||||
|
@ -5988,14 +6004,14 @@ struct llm_build_context {
|
|||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
@ -6284,11 +6300,6 @@ struct llm_build_context {
|
|||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
|
@ -6325,14 +6336,14 @@ struct llm_build_context {
|
|||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
@ -6407,11 +6418,6 @@ struct llm_build_context {
|
|||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
|
@ -6441,13 +6447,13 @@ struct llm_build_context {
|
|||
|
||||
// using mode = 2 for neox mode
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||
ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||
ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
@ -6521,11 +6527,6 @@ struct llm_build_context {
|
|||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
|
@ -6561,14 +6562,14 @@ struct llm_build_context {
|
|||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
@ -6642,11 +6643,6 @@ struct llm_build_context {
|
|||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm,
|
||||
|
@ -6684,7 +6680,7 @@ struct llm_build_context {
|
|||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||
ctx0, Qcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
@ -6695,7 +6691,7 @@ struct llm_build_context {
|
|||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
||||
ctx0, Kcur, inp_pos, hparams.n_rot, rope_type, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
@ -6764,11 +6760,6 @@ struct llm_build_context {
|
|||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
||||
// norm
|
||||
|
@ -6793,13 +6784,13 @@ struct llm_build_context {
|
|||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos,
|
||||
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos,
|
||||
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
|
@ -6969,11 +6960,6 @@ struct llm_build_context {
|
|||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm,
|
||||
|
@ -6999,14 +6985,14 @@ struct llm_build_context {
|
|||
|
||||
struct ggml_tensor * Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
@ -7077,11 +7063,6 @@ struct llm_build_context {
|
|||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
|
@ -7117,14 +7098,14 @@ struct llm_build_context {
|
|||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
@ -7196,11 +7177,6 @@ struct llm_build_context {
|
|||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
|
@ -7236,14 +7212,14 @@ struct llm_build_context {
|
|||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
@ -7328,11 +7304,6 @@ struct llm_build_context {
|
|||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
|
@ -7368,14 +7339,14 @@ struct llm_build_context {
|
|||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
hparams.n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
@ -7464,11 +7435,6 @@ struct llm_build_context {
|
|||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
||||
// norm
|
||||
|
@ -7491,7 +7457,7 @@ struct llm_build_context {
|
|||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
|
||||
n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
|
@ -7500,7 +7466,7 @@ struct llm_build_context {
|
|||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
|
||||
n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
|
@ -7553,6 +7519,22 @@ struct llm_build_context {
|
|||
}
|
||||
};
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
|
||||
llama_batch dummy;
|
||||
|
||||
llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
|
||||
|
||||
struct llm_build_context llm(lctx, dummy, cb, false);
|
||||
|
||||
llm.init();
|
||||
|
||||
struct ggml_cgraph * result = llm.build_k_shift();
|
||||
|
||||
llm.free();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph(
|
||||
llama_context & lctx,
|
||||
const llama_batch & batch,
|
||||
|
@ -7672,6 +7654,20 @@ static struct ggml_cgraph * llama_build_graph(
|
|||
return result;
|
||||
}
|
||||
|
||||
static void llama_set_k_shift(llama_context & lctx) {
|
||||
const auto & cparams = lctx.cparams;
|
||||
|
||||
const int64_t n_ctx = cparams.n_ctx;
|
||||
|
||||
assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
||||
|
||||
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
||||
|
||||
for (int i = 0; i < n_ctx; ++i) {
|
||||
data[i] = lctx.kv_self.cells[i].delta;
|
||||
}
|
||||
}
|
||||
|
||||
static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
||||
//
|
||||
// set input data
|
||||
|
@ -7739,18 +7735,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|||
}
|
||||
}
|
||||
|
||||
if (kv_self.has_shift) {
|
||||
const int64_t n_ctx = cparams.n_ctx;
|
||||
|
||||
assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
||||
|
||||
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
||||
|
||||
for (int i = 0; i < n_ctx; ++i) {
|
||||
data[i] = lctx.kv_self.cells[i].delta;
|
||||
}
|
||||
}
|
||||
|
||||
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
|
||||
const int64_t n_tokens = batch.n_tokens;
|
||||
|
||||
|
@ -7795,6 +7779,34 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|||
}
|
||||
}
|
||||
|
||||
static void llama_graph_compute(
|
||||
llama_context & lctx,
|
||||
ggml_cgraph * gf,
|
||||
int n_threads) {
|
||||
#ifdef GGML_USE_MPI
|
||||
const int64_t n_layer = hparams.n_layer;
|
||||
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
||||
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (lctx.backend_cpu != nullptr) {
|
||||
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
||||
}
|
||||
|
||||
ggml_backend_sched_graph_compute(lctx.sched, gf);
|
||||
|
||||
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
||||
#endif
|
||||
}
|
||||
|
||||
// decode a batch of tokens by evaluating the transformer
|
||||
//
|
||||
// - lctx: llama context
|
||||
|
@ -7890,14 +7902,19 @@ static int llama_decode_internal(
|
|||
|
||||
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
|
||||
|
||||
if (kv_self.has_shift) {
|
||||
llama_kv_cache_apply_k_shift(&lctx);
|
||||
}
|
||||
|
||||
ggml_backend_sched_reset(lctx.sched);
|
||||
ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
|
||||
|
||||
ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
|
||||
|
||||
// the output is always the last tensor in the graph
|
||||
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
||||
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
||||
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
||||
|
||||
if (strcmp(res->name, "result_output") == 0) {
|
||||
// the embeddings could be the second to last tensor, or the third to last tensor
|
||||
if (strcmp(embeddings->name, "result_norm") != 0) {
|
||||
|
@ -7924,40 +7941,12 @@ static int llama_decode_internal(
|
|||
n_threads = std::min(4, n_threads);
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
const int64_t n_layer = hparams.n_layer;
|
||||
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
||||
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (lctx.backend_cpu != nullptr) {
|
||||
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
||||
}
|
||||
|
||||
llama_set_inputs(lctx, batch);
|
||||
|
||||
ggml_backend_sched_graph_compute(lctx.sched, gf);
|
||||
|
||||
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
||||
#endif
|
||||
llama_graph_compute(lctx, gf, n_threads);
|
||||
|
||||
// update the kv ring buffer
|
||||
{
|
||||
if (kv_self.has_shift) {
|
||||
kv_self.has_shift = false;
|
||||
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
||||
kv_self.cells[i].delta = 0;
|
||||
}
|
||||
}
|
||||
|
||||
kv_self.head += n_tokens;
|
||||
|
||||
// Ensure kv cache head points to a valid index.
|
||||
|
@ -8053,6 +8042,28 @@ static int llama_decode_internal(
|
|||
return 0;
|
||||
}
|
||||
|
||||
void llama_kv_cache_apply_k_shift(struct llama_context * ctx) {
|
||||
struct llama_context & lctx = *ctx;
|
||||
|
||||
llama_set_k_shift(lctx);
|
||||
|
||||
{
|
||||
ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
|
||||
|
||||
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
|
||||
}
|
||||
|
||||
{
|
||||
auto & kv_self = ctx->kv_self;
|
||||
|
||||
kv_self.has_shift = false;
|
||||
|
||||
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
||||
kv_self.cells[i].delta = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// tokenizer
|
||||
//
|
||||
|
@ -12054,6 +12065,7 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla
|
|||
llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
|
||||
}
|
||||
|
||||
|
||||
// Returns the *maximum* size of the state
|
||||
size_t llama_get_state_size(const struct llama_context * ctx) {
|
||||
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
|
||||
|
|
2
llama.h
2
llama.h
|
@ -533,6 +533,8 @@ extern "C" {
|
|||
llama_pos p1,
|
||||
int d);
|
||||
|
||||
LLAMA_API void llama_kv_cache_apply_k_shift(struct llama_context * ctx);
|
||||
|
||||
//
|
||||
// State / sessions
|
||||
//
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue