adapt to new lora implementation

This commit is contained in:
nopperl 2024-07-18 09:59:30 +02:00
parent fa568f6a82
commit f40cd2073a

View file

@@ -13848,13 +13848,13 @@ struct llm_build_context {
         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
             cb(Qcur, "Qcur", il);
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
             cb(Kcur, "Kcur", il);
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+            struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
             cb(Vcur, "Vcur", il);
            if (model.layers[il].attn_q_norm) {
@@ -13962,7 +13962,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);
         // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output_with_img_logits", -1);
         // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.