diff --git a/common/common.cpp b/common/common.cpp
index e9c338028..1704b08a2 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1393,8 +1393,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 //
 
 void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d\n",
-        view.n_cells, view.n_max_seq, view.used_cells, view.token_count);
+    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, max contiguous cells=%d @ %d\n",
+        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx);
     llama_kv_cache_view_cell * c_curr = view.cells;
     struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences;
     for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
@@ -1405,14 +1405,14 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
         for (int j = 0; j < view.n_max_seq; j++) {
             if (cs_curr[j].seq_id >= 0) { seq_count++; }
         }
-        putchar(int('0' + (std::min(9, seq_count))));
+        putchar(seq_count == 0 ? '.' : ('0' + (std::min(9, seq_count))));
     }
 
     printf("\n=== Done dumping\n");
 }
 
 void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d\n",
-        view.n_cells, view.n_max_seq, view.used_cells, view.token_count);
+    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, max contiguous cells=%d @ %d\n",
+        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx);
     std::unordered_map<llama_seq_id, size_t> seqs;
     llama_kv_cache_view_cell * c_curr = view.cells;
diff --git a/llama.cpp b/llama.cpp
index e23d820ea..c564d636d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8807,12 +8807,14 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
 
 struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
     struct llama_kv_cache_view result = {
-        /*.n_cells*/ 0,
-        /*.n_max_seq*/ n_max_seq,
-        /*.token_count*/ 0,
-        /*.used_cells*/ llama_get_kv_cache_used_cells(ctx),
-        /*.cells*/ nullptr,
-        /*.cells_sequences*/ nullptr,
+        /*.n_cells*/            0,
+        /*.n_max_seq*/          n_max_seq,
+        /*.token_count*/        0,
+        /*.used_cells*/         llama_get_kv_cache_used_cells(ctx),
+        /*max_contiguous*/      0,
+        /*max_contiguous_idx*/  -1,
+        /*.cells*/              nullptr,
+        /*.cells_sequences*/    nullptr,
     };
     return result;
 }
@@ -8844,11 +8846,25 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
     struct llama_kv_cache_view_cell_sequence * cs_curr = view->cells_sequences;
     int32_t used_cells = 0;
     int32_t token_count = 0;
+    int32_t curr_contig_idx = -1;
+    uint32_t max_contig = 0;
+    int32_t max_contig_idx = -1;
 
-    for (uint32_t i = 0; i < ctx->kv_self.size; i++, c_curr++, cs_curr += view->n_max_seq) {
-        token_count += ctx->kv_self.cells[i].seq_id.size();
+    for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
+        const size_t curr_size = kv_cells[i].seq_id.size();
+        token_count += curr_size;
         c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
 
+        if (curr_size > 0) {
+            if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
+                max_contig_idx = curr_contig_idx;
+                max_contig = i - curr_contig_idx;
+            }
+            curr_contig_idx = -1;
+        } else if (curr_contig_idx < 0) {
+            curr_contig_idx = i;
+        }
+
         int seq_idx = 0;
         for (const llama_seq_id it : kv_cells[i].seq_id) {
             if (seq_idx >= view->n_max_seq) {
@@ -8864,6 +8880,12 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
             cs_curr[seq_idx].seq_id = -1;
         }
     }
+    if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
+        max_contig_idx = curr_contig_idx;
+        max_contig = kv_cells.size() - curr_contig_idx;
+    }
+    view->max_contiguous_cells = max_contig;
+    view->max_contiguous_cells_idx = max_contig_idx;
     view->token_count = token_count;
     view->used_cells = used_cells;
     if (uint32_t(used_cells) != ctx->kv_self.used) {
diff --git a/llama.h b/llama.h
index 72a156ac5..3d3b38c12 100644
--- a/llama.h
+++ b/llama.h
@@ -366,6 +366,7 @@ extern "C" {
     };
 
     struct llama_kv_cache_view_cell_sequence {
+        // Would like to have token_id here as well.
        llama_seq_id seq_id;
     };
 
@@ -374,7 +375,9 @@ extern "C" {
         int32_t n_max_seq;
         int32_t token_count;
         int32_t used_cells;
-        struct llama_kv_cache_view_cell *cells;
+        int32_t max_contiguous_cells;
+        int32_t max_contiguous_cells_idx;
+        struct llama_kv_cache_view_cell * cells;
         struct llama_kv_cache_view_cell_sequence * cells_sequences;
     };
 
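
For context, a minimal sketch of how an example program might consume the extended view. This is not part of the patch: it assumes a `llama_context * ctx` created elsewhere, that `dump_kv_cache_view()` is declared in `common.h`, and that `llama_kv_cache_view_free()` exists in `llama.h` alongside the two functions touched here; `show_kv_cache_state()` is just an illustrative helper name.

```cpp
// Sketch (assumptions noted above): refresh a KV cache view and report the new
// max_contiguous_cells / max_contiguous_cells_idx fields.
#include "common.h"
#include "llama.h"

#include <cstdio>

static void show_kv_cache_state(const llama_context * ctx, int32_t n_max_seq) {
    llama_kv_cache_view view = llama_kv_cache_view_init(ctx, n_max_seq);

    // Refresh the snapshot; this is where max_contiguous_cells[_idx] get filled in.
    llama_kv_cache_view_update(ctx, &view);

    // With this patch, empty cells print as '.' and populated cells as a digit
    // giving the number of sequences in the cell; 40 cells per output row.
    dump_kv_cache_view(view, 40);

    printf("largest free span: %d cells starting at cell %d\n",
        view.max_contiguous_cells, view.max_contiguous_cells_idx);

    llama_kv_cache_view_free(&view);  // assumed to exist in llama.h, not shown in this diff
}
```

The `row_size` argument only controls how many cells are printed per row of the dump.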
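A standalone illustration (also not part of the patch) of what the new fields report: the update loop keeps the start index of the current run of empty cells and closes the run whenever an occupied cell, or the end of the cache, is reached. The occupancy array below is made-up toy data.

```cpp
// Toy re-implementation of the contiguous-free-run scan from llama_kv_cache_view_update.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // 1 = cell holds at least one sequence, 0 = empty (toy data, not a real cache).
    const std::vector<int> occupied = {1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0};

    int32_t curr_contig_idx = -1;   // start of the empty run currently being scanned
    uint32_t max_contig = 0;        // longest empty run seen so far
    int32_t max_contig_idx = -1;    // where that run starts

    for (int32_t i = 0; i < int32_t(occupied.size()); i++) {
        if (occupied[i]) {
            // An occupied cell ends the current empty run; keep it if it is the longest.
            if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
                max_contig_idx = curr_contig_idx;
                max_contig = i - curr_contig_idx;
            }
            curr_contig_idx = -1;
        } else if (curr_contig_idx < 0) {
            curr_contig_idx = i;    // a new empty run starts here
        }
    }
    // A run that extends to the end of the cache is only closed out after the loop.
    if (curr_contig_idx >= 0 && occupied.size() - curr_contig_idx > max_contig) {
        max_contig_idx = curr_contig_idx;
        max_contig = uint32_t(occupied.size() - curr_contig_idx);
    }

    // Prints: max contiguous empty cells = 4 starting at cell 10
    printf("max contiguous empty cells = %u starting at cell %d\n", max_contig, max_contig_idx);
    return 0;
}
```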