From d2beb5ae07ac4b4c9f8cff49d5a94fce1f649403 Mon Sep 17 00:00:00 2001 From: Josh Bleecher Snyder Date: Mon, 2 Dec 2024 12:23:26 -0800 Subject: [PATCH] server : handle speculative decoding llama_decode failures Since this is the draft model, decode failures can be safely ignored. Without this check, sampling segfaults when attempting to look up logits. Fixes #10547 --- examples/server/server.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8eca14b86..f69a417eb 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2344,7 +2344,10 @@ struct server_context { common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true); } - llama_decode(ctx, slot.batch_spec); + int ret = llama_decode(ctx, slot.batch_spec); + if (ret != 0) { + continue; + } // the accepted tokens from the speculation const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft);