server : handle speculative decoding llama_decode failures

Since this is the draft model, a decode failure can be safely ignored.
Without this check, sampling segfaults when it attempts to look up logits
that the failed decode never produced.

Fixes #10547
Josh Bleecher Snyder 2024-12-02 12:23:26 -08:00
parent 8648c52101
commit d2beb5ae07


@@ -2344,7 +2344,10 @@ struct server_context {
                 common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
             }
 
-            llama_decode(ctx, slot.batch_spec);
+            int ret = llama_decode(ctx, slot.batch_spec);
+            if (ret != 0) {
+                continue;
+            }
 
             // the accepted tokens from the speculation
             const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft);
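
For reference, llama_decode signals failure through its return value rather than aborting: 0 means success, a positive value (such as 1 when no KV-cache slot can be found for the batch) is a recoverable condition, and a negative value is a hard error; logits are only valid after a return of 0. Below is a minimal, illustrative sketch of that pattern for a draft batch. The helper name decode_draft_batch and the surrounding setup are assumptions for the example, not part of the server code.

#include "llama.h"

// Illustrative helper (an assumption, not the server's API): run the draft
// batch and report whether its logits may be sampled from.
static bool decode_draft_batch(llama_context * ctx, llama_batch batch_spec) {
    const int ret = llama_decode(ctx, batch_spec);

    if (ret == 0) {
        // success: logits for the tokens flagged in the batch are now available
        return true;
    }

    if (ret > 0) {
        // recoverable (e.g. no free KV-cache slot); the draft is dropped and
        // the server falls back to normal decoding for this iteration
        return false;
    }

    // ret < 0: backend error; since the draft model's output is optional,
    // this is also treated as "skip the speculation" rather than fatal
    return false;
}

In the server loop above, the same idea is expressed with continue: when the draft decode does not return 0, the sampling and token-acceptance steps for that slot are skipped instead of reading logits that were never written.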