From ab1c46a7bfa1196e1e5cb669ad13c1be7cc2ce21 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Wed, 27 Mar 2024 19:11:47 +0800 Subject: [PATCH] respond error in case there's no space in the kv cache --- examples/server/server.cpp | 4 +++ llama.cpp | 51 ++++++++++++++++++++++---------------- 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 4e9a0e9e3..227bb3c6b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1689,6 +1689,10 @@ struct server_context { infile.close(); size_t nread = llama_set_seq_data(ctx, state_data.data(), slot->id + 1); + if (nread == 0) { + send_error(task, "Unable to restore slot, no available space in KV cache", ERROR_TYPE_INVALID_REQUEST); + break; + } GGML_ASSERT(nread <= state_data.size()); // restore cached token values diff --git a/llama.cpp b/llama.cpp index e4ee7b3e6..0faaac013 100644 --- a/llama.cpp +++ b/llama.cpp @@ -15111,12 +15111,13 @@ size_t llama_get_seq_size(struct llama_context* ctx, llama_seq_id seq_id) { size_t llama_copy_seq_data(struct llama_context * ctx, uint8_t * dst, llama_seq_id seq_id) { llama_data_buffer_context data_ctx(dst); + const auto& kv_self = ctx->kv_self; + GGML_ASSERT(!kv_self.recurrent); // not implemented // Save the size of size_t as a uint32_t for safety check const uint32_t size_t_size = sizeof(size_t); data_ctx.write(&size_t_size, sizeof(size_t_size)); - const auto& kv_self = ctx->kv_self; std::vector> cell_ranges; // ranges, from inclusive, to exclusive uint32_t cell_count = 0; @@ -15215,6 +15216,7 @@ size_t llama_copy_seq_data(struct llama_context * ctx, uint8_t * dst, llama_seq_ size_t llama_set_seq_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) { auto & kv_self = ctx->kv_self; + GGML_ASSERT(!kv_self.recurrent); // not implemented // Wipe the slot llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); @@ -15243,26 +15245,34 @@ size_t llama_set_seq_data(struct llama_context * ctx, const uint8_t * src, llama inp += sizeof(n_embd_v_gqa_ref); // Allocate the new cells for the slot - llama_batch batch = llama_batch_init(cell_count, 0, 1); - batch.n_tokens = cell_count; - for (uint32_t i = 0; i < cell_count; ++i) { - llama_pos pos; - memcpy(&pos, inp, sizeof(pos)); - inp += sizeof(pos); + { + llama_batch batch = llama_batch_init(cell_count, 0, 1); + batch.n_tokens = cell_count; + for (uint32_t i = 0; i < cell_count; ++i) { + llama_pos pos; + memcpy(&pos, inp, sizeof(pos)); + inp += sizeof(pos); - batch.pos[i] = pos; - batch.n_seq_id[i] = 1; - batch.seq_id[i][0] = dest_seq_id; + batch.pos[i] = pos; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = dest_seq_id; + } + if (!llama_kv_cache_find_slot(kv_self, batch)) { + llama_batch_free(batch); + return 0; + } + + // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // Assume that this is one contiguous block of cells + GGML_ASSERT(kv_self.head + cell_count <= kv_self.size); + GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]); + GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]); + GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id)); + GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id)); + + // Cleanup + llama_batch_free(batch); } - llama_kv_cache_find_slot(kv_self, batch); - - // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values) - // Assume that this is one contiguous block of cells - GGML_ASSERT(kv_self.head + cell_count <= kv_self.size); - GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]); - GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]); - GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id)); - GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id)); const auto& hparams = ctx->model.hparams; const uint32_t n_layer = hparams.n_layer; @@ -15305,9 +15315,6 @@ size_t llama_set_seq_data(struct llama_context * ctx, const uint8_t * src, llama } } - // Cleanup - llama_batch_free(batch); - const size_t nread = inp - src; return nread; }