use lowvram flag for offload qkv

Concedo 2023-12-08 18:16:14 +08:00
parent ec21fa7712
commit 7469f202ea


@@ -895,7 +895,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     //llama_ctx_paran_parts = -1;
     llama_ctx_params.seed = -1;
     //llama_ctx_params.f16_kv = true;
-    //llama_ctx_params.low_vram = inputs.low_vram;
+    llama_ctx_params.offload_kqv = !inputs.low_vram;
     llama_ctx_params.mul_mat_q = inputs.use_mmq;
     llama_ctx_params.logits_all = false;
     model_params.use_mmap = inputs.use_mmap;
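
For reference, a minimal C++ sketch of the pattern this change relies on: in the llama.cpp API of this era, the old low_vram context flag was replaced by offload_kqv, so the low-VRAM option now maps to disabling KV-cache offload. The helper function and its arguments below are hypothetical; only the llama_context_params fields and the llama.cpp calls (llama_context_default_params, llama_new_context_with_model) are taken from the API as it existed around this commit.

    #include "llama.h"

    // Hypothetical wrapper: build a context where the user's low-VRAM
    // preference decides whether the KV cache is offloaded to the GPU.
    llama_context * make_context(llama_model * model, bool low_vram, bool use_mmq)
    {
        llama_context_params ctx_params = llama_context_default_params();
        ctx_params.seed        = -1;        // -1 => random seed
        ctx_params.offload_kqv = !low_vram; // low VRAM: keep KV cache on host
        ctx_params.mul_mat_q   = use_mmq;   // quantized matmul kernels
        ctx_params.logits_all  = false;     // logits for the last token only
        return llama_new_context_with_model(model, ctx_params);
    }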