From 150d6e923281536c7db9cbe6cbb088a017691b2f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 29 Nov 2024 19:33:49 +0200 Subject: [PATCH] server : force F16 KV cache for the draft model ggml-ci --- examples/server/server.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9c86407c2..92c463dce 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -697,6 +697,10 @@ struct server_context { params_dft.n_ctx = params_base.speculative.n_ctx; params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; + // force F16 KV cache for the draft model for extra performance + params_dft.cache_type_k = "f16"; + params_dft.cache_type_v = "f16"; + common_init_result llama_init_dft = common_init_from_params(params_dft); model_dft = llama_init_dft.model;