server : handle speculative decoding llama_decode failures
Since this is the draft model, decode failures can be safely ignored. Without this check, sampling segfaults when it attempts to look up logits for the failed decode. Fixes #10547
This commit is contained in:
parent
8648c52101
commit
d2beb5ae07
1 changed file with 4 additions and 1 deletion
|
@ -2344,7 +2344,10 @@ struct server_context {
|
|||
common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
|
||||
}
|
||||
|
||||
llama_decode(ctx, slot.batch_spec);
|
||||
int ret = llama_decode(ctx, slot.batch_spec);
|
||||
if (ret != 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// the accepted tokens from the speculation
|
||||
const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue