From d2beb5ae07ac4b4c9f8cff49d5a94fce1f649403 Mon Sep 17 00:00:00 2001 From: Josh Bleecher Snyder Date: Mon, 2 Dec 2024 12:23:26 -0800 Subject: [PATCH] server : handle speculative decoding llama_decode failures Since this is the draft model, decode failures can be safely ignored. Without this check, sampling segfaults when attempting to look up logits. Fixes #10547 --- examples/server/server.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8eca14b86..f69a417eb 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2344,7 +2344,10 @@ struct server_context { common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true); } - llama_decode(ctx, slot.batch_spec); + int ret = llama_decode(ctx, slot.batch_spec); + if (ret != 0) { + continue; + } // the accepted tokens from the speculation const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft);