llama : allow exporting a view of the KV cache (#4180)

* Allow exporting a view of the KV cache

* Allow dumping the sequences per cell in common

* Track max contiguous cells value and position as well

* Fix max contiguous empty cells index calculation

Make dump functions deal with lengths or sequences counts > 10 better

* Fix off by one error in dump_kv_cache_view

* Add doc comments for KV cache view functions

Eliminate cell sequence struct; use llama_seq_id directly

Minor cleanups
This commit is contained in:
Kerfuffle 2023-11-23 09:31:20 -07:00 committed by GitHub
parent 671f639c59
commit 5df7d06c42
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 227 additions and 0 deletions

View file

@ -172,6 +172,8 @@ int main(int argc, char ** argv) {
int32_t n_total_gen = 0;
int32_t n_cache_miss = 0;
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);
const auto t_main_start = ggml_time_us();
LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
@ -201,6 +203,9 @@ int main(int argc, char ** argv) {
LOG_TEE("Processing requests ...\n\n");
while (true) {
llama_kv_cache_view_update(ctx, &kvc_view);
dump_kv_cache_view_seqs(kvc_view, 40);
llama_batch_clear(batch);
// decode any currently ongoing sequences