diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index fdd6ff01f..adcfa79f9 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1745,10 +1745,7 @@ struct server_context {
 
                     // Erase token cache
                     const size_t n_erased = slot->cache_tokens.size();
-                    if (!llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1)) {
-                        send_error(task, "Failed to erase slot KV cache", ERROR_TYPE_INVALID_REQUEST);
-                        break;
-                    }
+                    llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1);
                     slot->cache_tokens.clear();
 
                     server_task_result result;
diff --git a/llama.cpp b/llama.cpp
index ac8703ca2..145942078 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -15243,9 +15243,7 @@ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src,
     GGML_ASSERT(!kv_self.recurrent); // not implemented
 
     // Wipe the slot
-    if (!llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1)) {
-        return 0;
-    }
+    llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
 
     const uint8_t * inp = src;
 
diff --git a/llama.h b/llama.h
index 3c313b884..0473f726a 100644
--- a/llama.h
+++ b/llama.h
@@ -523,6 +523,7 @@ extern "C" {
             struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
     // seq_id < 0 : match any sequence
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)