server : various params fixes

ggml-ci
2024-12-03 11:18:01 +02:00 · 2024-12-03 11:18:01 +02:00 · 11b4d582bc
commit 11b4d582bc
parent f325205574
1 changed files with 13 additions and 6 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -694,12 +694,9 @@ struct server_context {

            params_dft.devices      = params_base.speculative.devices;
            params_dft.model        = params_base.speculative.model;
-            params_dft.n_ctx        = params_base.speculative.n_ctx;
+            params_dft.n_ctx        = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
            params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
-
-            // force F16 KV cache for the draft model for extra performance
-            params_dft.cache_type_k = "f16";
-            params_dft.cache_type_v = "f16";
+            params_dft.n_parallel   = 1;

            common_init_result llama_init_dft = common_init_from_params(params_dft);

@@ -719,8 +716,14 @@ struct server_context {
                return false;
            }

+            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
+
            cparams_dft = common_context_params_to_llama(params_dft);
-            cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
+            cparams_dft.n_batch = n_ctx_dft;
+
+            // force F16 KV cache for the draft model for extra performance
+            cparams_dft.type_k = GGML_TYPE_F16;
+            cparams_dft.type_v = GGML_TYPE_F16;

            // the context is not needed - we will create one for each slot
            llama_free(llama_init_dft.context);
@@ -2312,6 +2315,10 @@ struct server_context {
                    continue;
                }

+                if (slot.state != SLOT_STATE_GENERATING) {
+                    continue;
+                }
+
                llama_token id = slot.sampled;

                struct common_speculative_params params_spec;