From 076346db8a85ceabe485a852d577d05bdfb2f308 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 28 Dec 2024 16:16:57 +0100 Subject: [PATCH] fix condition --- examples/server/server.cpp | 23 ++++++++++++++++------- examples/server/tests/README.md | 6 ++++++ examples/server/tests/unit/test_lora.py | 5 ++--- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d6e32084a..a5caf6ac9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2558,12 +2558,22 @@ struct server_context { // start populating the batch for this iteration common_batch_clear(batch); + // track if given slot can be batched with slots already in the batch + server_slot * slot_batched = nullptr; + // first, add sampled tokens from any ongoing sequences for (auto & slot : slots) { if (slot.state != SLOT_STATE_GENERATING) { continue; } + // check if we can batch this slot with the previous one + if (!slot_batched) { + slot_batched = &slot; + } else if (slot_batched && !slot_batched->can_batch_with(slot)) { + continue; + } + slot.i_batch = batch.n_tokens; common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true); @@ -2582,17 +2592,16 @@ struct server_context { int32_t n_batch = llama_n_batch(ctx); int32_t n_ubatch = llama_n_ubatch(ctx); - // track if given slot can be batched with slots already in the batch - server_slot * slot_batched = nullptr; - // next, batch any pending prompts without exceeding n_batch if (params_base.cont_batching || batch.n_tokens == 0) { for (auto & slot : slots) { // check if we can batch this slot with the previous one - if (!slot_batched) { - slot_batched = &slot; - } else if (slot_batched && !slot_batched->can_batch_with(slot)) { - continue; + if (slot.is_processing()) { + if (!slot_batched) { + slot_batched = &slot; + } else if (slot_batched && !slot_batched->can_batch_with(slot)) { + continue; + } } // this slot still has a prompt to be processed diff --git 
a/examples/server/tests/README.md b/examples/server/tests/README.md index fa3d0a2f5..5787276ab 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -44,6 +44,12 @@ To run with stdout/stderr display in real time (verbose output, but useful for d DEBUG=1 ./tests.sh -s -v -x ``` +To run a single test unit: + +```shell +./tests.sh unit/test_{name of test case here}.py -v -x +``` + Hint: You can compile and run test in single command, useful for local developement: ```shell diff --git a/examples/server/tests/unit/test_lora.py b/examples/server/tests/unit/test_lora.py index 0751f156b..68a7be17e 100644 --- a/examples/server/tests/unit/test_lora.py +++ b/examples/server/tests/unit/test_lora.py @@ -52,12 +52,11 @@ def test_lora_per_request(): lora_config = [ ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ), ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ), - ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ), - ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ), + ( [{"id": 0, "scale": 0.3}], "(special|thing|gifted)+" ), + ( [{"id": 0, "scale": 0.7}], "(far|from|home|away)+" ), ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ), ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ), ] - # FIXME: tesing with scale between 0.0 and 1.0 (i.e. 0.2, 0.5, 0.7) produces unreliable results tasks = [( server.make_request,