server : handle speculative decoding llama_decode failures
Since this is the draft model, decode failures can be safely ignored. Without this check, sampling segfaults attempting to look up logits. Fixes #10547
This commit is contained in:
parent
8648c52101
commit
d2beb5ae07
1 changed file with 4 additions and 1 deletion
|
@@ -2344,7 +2344,10 @@ struct server_context {
|
||||||
common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
|
common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_decode(ctx, slot.batch_spec);
|
int ret = llama_decode(ctx, slot.batch_spec);
|
||||||
|
if (ret != 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// the accepted tokens from the speculation
|
// the accepted tokens from the speculation
|
||||||
const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft);
|
const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue