diff --git a/examples/server-parallel/frontend.h b/examples/server-parallel/frontend.h
index 7c2b8ca7c..122d58b44 100644
--- a/examples/server-parallel/frontend.h
+++ b/examples/server-parallel/frontend.h
@@ -131,7 +131,6 @@ function updateView() {
 async function call_llama(options) {
   try {
     controller = new AbortController();
-    signal = controller.signal;
     const response = await fetch("/completion", {
       method: "POST",
       body: JSON.stringify(options),
@@ -139,7 +138,7 @@ async function call_llama(options) {
         "Content-Type": "application/json",
         Accept: "text/event-stream",
       },
-      signal: signal
+      signal: controller.signal
     });
     const reader = response.body.getReader();
     const decoder = new TextDecoder();
diff --git a/examples/server-parallel/server.cpp b/examples/server-parallel/server.cpp
index 13dd1fcc0..0d98db4de 100644
--- a/examples/server-parallel/server.cpp
+++ b/examples/server-parallel/server.cpp
@@ -310,6 +310,7 @@ struct server_parallel_context {
                 slot.command = NONE;
                 slot.n_prompt = 0;
                 slot.n_tokens_predicted = 0;
+                slot.sampled_tokens.clear();
                 continue;
             }
@@ -346,8 +347,6 @@ struct server_parallel_context {
                 // do not prepend BOS because we have a system prompt!
                 std::vector<llama_token> tokens_prompt;
                 tokens_prompt = ::llama_tokenize(ctx, slot.prompt, false);
-                slot.n_tokens_predicted = 0;
-                slot.sampled_tokens.clear();

                 for (size_t i = 0; i < tokens_prompt.size(); ++i) {
                     batch.token [batch.n_tokens] = tokens_prompt[i];