server : fix speculative decoding with context shift
ggml-ci
parent cc98896db8
commit a5a915b51e

1 changed file with 1 addition and 1 deletion
@@ -2325,7 +2325,7 @@ struct server_context {
                 llama_token id = slot.sampled;
 
                 struct common_speculative_params params_spec;
-                params_spec.n_draft = slot.params.speculative.n_max;
+                params_spec.n_draft = std::min(slot.params.speculative.n_max, slot.n_ctx - slot.n_past - 1);
                 params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
                 params_spec.p_min = slot.params.speculative.p_min;
 
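
Note: with context shift, slot.n_past can run close to slot.n_ctx, so drafting the full speculative.n_max tokens could overflow the slot's context; the patched line clamps n_draft to the room that is left. Below is a minimal standalone sketch of the same clamp, using made-up values for n_ctx, n_past and n_max (illustrative assumptions, not taken from the commit):

#include <algorithm>
#include <cstdio>

int main() {
    // Assumed example values (not from the commit):
    const int n_ctx  = 4096; // slot context size
    const int n_past = 4090; // tokens already in the context, e.g. after a context shift
    const int n_max  = 16;   // configured speculative.n_max

    // Same clamp as the patched line: never draft more tokens than the
    // remaining context can hold (the extra -1 presumably leaves room
    // for the token sampled this step).
    const int n_draft = std::min(n_max, n_ctx - n_past - 1);

    std::printf("n_draft = %d\n", n_draft); // prints 5 instead of 16
    return 0;
}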