diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 67d704f1b..d6e32084a 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2588,6 +2588,13 @@ struct server_context {
         // next, batch any pending prompts without exceeding n_batch
         if (params_base.cont_batching || batch.n_tokens == 0) {
             for (auto & slot : slots) {
+                // check if we can batch this slot with the previous one
+                if (!slot_batched) {
+                    slot_batched = &slot;
+                } else if (slot_batched && !slot_batched->can_batch_with(slot)) {
+                    continue;
+                }
+
                 // this slot still has a prompt to be processed
                 if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
                     auto & prompt_tokens = slot.prompt_tokens;
@@ -2748,13 +2755,6 @@ struct server_context {
                         }
                     }
 
-                    // check if we can batch this slot with the previous one
-                    if (!slot_batched) {
-                        slot_batched = &slot;
-                    } else if (slot_batched && !slot_batched->can_batch_with(slot)) {
-                        continue;
-                    }
-
                     // keep only the common part
                     if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                         // could not partially delete (likely using a non-Transformer model)
diff --git a/examples/server/tests/unit/test_lora.py b/examples/server/tests/unit/test_lora.py
index ea5092733..0751f156b 100644
--- a/examples/server/tests/unit/test_lora.py
+++ b/examples/server/tests/unit/test_lora.py
@@ -68,10 +68,9 @@ def test_lora_per_request():
             "temperature": 0.0,
             "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
         })
-    ) for lora, re_test in lora_config]
+    ) for lora, _ in lora_config]
     results = parallel_function_calls(tasks)
-    print(results)
 
     assert all([res.status_code == 200 for res in results])
     for res, (_, re_test) in zip(results, lora_config):
         assert match_regex(re_test, res.body["content"])