server : various params fixes

ggml-ci
This commit is contained in:
Georgi Gerganov 2024-12-03 11:18:01 +02:00
parent f325205574
commit 11b4d582bc
No known key found for this signature in database
GPG key ID: 449E073F9DC10735

View file

@@ -694,12 +694,9 @@ struct server_context {
params_dft.devices = params_base.speculative.devices; params_dft.devices = params_base.speculative.devices;
params_dft.model = params_base.speculative.model; params_dft.model = params_base.speculative.model;
params_dft.n_ctx = params_base.speculative.n_ctx; params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
params_dft.n_parallel = 1;
// force F16 KV cache for the draft model for extra performance
params_dft.cache_type_k = "f16";
params_dft.cache_type_v = "f16";
common_init_result llama_init_dft = common_init_from_params(params_dft); common_init_result llama_init_dft = common_init_from_params(params_dft);
@@ -719,8 +716,14 @@ struct server_context {
return false; return false;
} }
const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
cparams_dft = common_context_params_to_llama(params_dft); cparams_dft = common_context_params_to_llama(params_dft);
cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context); cparams_dft.n_batch = n_ctx_dft;
// force F16 KV cache for the draft model for extra performance
cparams_dft.type_k = GGML_TYPE_F16;
cparams_dft.type_v = GGML_TYPE_F16;
// the context is not needed - we will create one for each slot // the context is not needed - we will create one for each slot
llama_free(llama_init_dft.context); llama_free(llama_init_dft.context);
@@ -2312,6 +2315,10 @@ struct server_context {
continue; continue;
} }
if (slot.state != SLOT_STATE_GENERATING) {
continue;
}
llama_token id = slot.sampled; llama_token id = slot.sampled;
struct common_speculative_params params_spec; struct common_speculative_params params_spec;