diff --git a/src/llama.cpp b/src/llama.cpp index be6dbf88a..ef47b9534 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -13178,13 +13178,13 @@ struct llm_build_context { // self-attention { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_enc, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_enc, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_enc, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur); cb(Vcur, "Vcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); @@ -13218,7 +13218,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); - cur = ggml_mul_mat(ctx0, model.layers[il].wo_enc, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur); cb(cur, "kqv_out", il); } @@ -13292,13 +13292,13 @@ struct llm_build_context { // self-attention { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); @@ -13345,7 +13345,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); cb(cur, "kqv_out", il); } @@ -13362,13 +13362,13 @@ struct llm_build_context { // cross-attention { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_cross, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_cross, embd_enc); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_cross, embd_enc); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc); cb(Vcur, "Vcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); @@ -13397,7 +13397,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); - cur = ggml_mul_mat(ctx0, model.layers[il].wo_cross, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur); cb(cur, "kqv_out", il); } @@ -13454,7 +13454,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); }