Add doc comments for KV cache view functions
Eliminate cell sequence struct; use llama_seq_id directly. Minor cleanups.
This commit is contained in:
parent
bc1c346ae8
commit
aa21e6dbc2
4 changed files with 59 additions and 34 deletions
|
@ -1396,10 +1396,10 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
|
||||||
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
|
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
|
||||||
|
|
||||||
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
|
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
|
||||||
view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx);
|
view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
|
||||||
|
|
||||||
llama_kv_cache_view_cell * c_curr = view.cells;
|
llama_kv_cache_view_cell * c_curr = view.cells;
|
||||||
struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences;
|
llama_seq_id * cs_curr = view.cells_sequences;
|
||||||
|
|
||||||
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
|
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
|
||||||
if (i % row_size == 0) {
|
if (i % row_size == 0) {
|
||||||
|
@ -1407,7 +1407,7 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
|
||||||
}
|
}
|
||||||
int seq_count = 0;
|
int seq_count = 0;
|
||||||
for (int j = 0; j < view.n_max_seq; j++) {
|
for (int j = 0; j < view.n_max_seq; j++) {
|
||||||
if (cs_curr[j].seq_id >= 0) { seq_count++; }
|
if (cs_curr[j] >= 0) { seq_count++; }
|
||||||
}
|
}
|
||||||
putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
|
putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
|
||||||
}
|
}
|
||||||
|
@ -1419,18 +1419,18 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
|
||||||
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
|
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
|
||||||
|
|
||||||
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
|
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
|
||||||
view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous_cells, view.max_contiguous_cells_idx);
|
view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
|
||||||
|
|
||||||
std::unordered_map<llama_seq_id, size_t> seqs;
|
std::unordered_map<llama_seq_id, size_t> seqs;
|
||||||
llama_kv_cache_view_cell * c_curr = view.cells;
|
llama_kv_cache_view_cell * c_curr = view.cells;
|
||||||
struct llama_kv_cache_view_cell_sequence * cs_curr = view.cells_sequences;
|
llama_seq_id * cs_curr = view.cells_sequences;
|
||||||
|
|
||||||
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
|
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
|
||||||
for (int j = 0; j < view.n_max_seq; j++) {
|
for (int j = 0; j < view.n_max_seq; j++) {
|
||||||
if (cs_curr[j].seq_id < 0) { continue; }
|
if (cs_curr[j] < 0) { continue; }
|
||||||
if (seqs.find(cs_curr[j].seq_id) == seqs.end()) {
|
if (seqs.find(cs_curr[j]) == seqs.end()) {
|
||||||
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
|
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
|
||||||
seqs[cs_curr[j].seq_id] = seqs.size();
|
seqs[cs_curr[j]] = seqs.size();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
|
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
|
||||||
|
@ -1449,8 +1449,8 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
|
||||||
printf("\n%5d: ", i);
|
printf("\n%5d: ", i);
|
||||||
}
|
}
|
||||||
for (int j = 0; j < view.n_max_seq; j++) {
|
for (int j = 0; j < view.n_max_seq; j++) {
|
||||||
if (cs_curr[j].seq_id >= 0) {
|
if (cs_curr[j] >= 0) {
|
||||||
const auto & it = seqs.find(cs_curr[j].seq_id);
|
const auto & it = seqs.find(cs_curr[j]);
|
||||||
putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
|
putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
|
||||||
} else {
|
} else {
|
||||||
putchar('.');
|
putchar('.');
|
||||||
|
|
|
@ -223,5 +223,8 @@ void dump_non_result_info_yaml(
|
||||||
// KV cache utils
|
// KV cache utils
|
||||||
//
|
//
|
||||||
|
|
||||||
|
// Dump the KV cache view with the number of sequences per cell.
|
||||||
void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
|
void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
|
||||||
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 80);
|
|
||||||
|
// Dump the KV cache view showing individual sequences in each cell (long output).
|
||||||
|
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
|
||||||
|
|
30
llama.cpp
30
llama.cpp
|
@ -8807,14 +8807,14 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
|
||||||
|
|
||||||
struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
|
struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
|
||||||
struct llama_kv_cache_view result = {
|
struct llama_kv_cache_view result = {
|
||||||
/*.n_cells*/ 0,
|
/*.n_cells = */ 0,
|
||||||
/*.n_max_seq*/ n_max_seq,
|
/*.n_max_seq = */ n_max_seq,
|
||||||
/*.token_count*/ 0,
|
/*.token_count = */ 0,
|
||||||
/*.used_cells*/ llama_get_kv_cache_used_cells(ctx),
|
/*.used_cells = */ llama_get_kv_cache_used_cells(ctx),
|
||||||
/*max_contiguous*/ 0,
|
/*.max_contiguous = */ 0,
|
||||||
/*max_contiguous_idx*/ -1,
|
/*.max_contiguous_idx = */ -1,
|
||||||
/*.cells*/ nullptr,
|
/*.cells = */ nullptr,
|
||||||
/*.cells_sequences*/ nullptr,
|
/*.cells_sequences = */ nullptr,
|
||||||
};
|
};
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
@ -8836,14 +8836,14 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
|
||||||
void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
|
void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
|
||||||
GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
|
GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
|
||||||
view->cells = (struct llama_kv_cache_view_cell *)p;
|
view->cells = (struct llama_kv_cache_view_cell *)p;
|
||||||
p = realloc(view->cells_sequences, sizeof(struct llama_kv_cache_view_cell_sequence) * view->n_max_seq * view->n_cells);
|
p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
|
||||||
GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
|
GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
|
||||||
view->cells_sequences = (struct llama_kv_cache_view_cell_sequence *)p;
|
view->cells_sequences = (llama_seq_id *)p;
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
|
const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
|
||||||
llama_kv_cache_view_cell * c_curr = view->cells;
|
llama_kv_cache_view_cell * c_curr = view->cells;
|
||||||
struct llama_kv_cache_view_cell_sequence * cs_curr = view->cells_sequences;
|
llama_seq_id * cs_curr = view->cells_sequences;
|
||||||
int32_t used_cells = 0;
|
int32_t used_cells = 0;
|
||||||
int32_t token_count = 0;
|
int32_t token_count = 0;
|
||||||
int32_t curr_contig_idx = -1;
|
int32_t curr_contig_idx = -1;
|
||||||
|
@ -8870,22 +8870,22 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
|
||||||
if (seq_idx >= view->n_max_seq) {
|
if (seq_idx >= view->n_max_seq) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
cs_curr[seq_idx].seq_id = it;
|
cs_curr[seq_idx] = it;
|
||||||
seq_idx++;
|
seq_idx++;
|
||||||
}
|
}
|
||||||
if (seq_idx != 0) {
|
if (seq_idx != 0) {
|
||||||
used_cells++;
|
used_cells++;
|
||||||
}
|
}
|
||||||
for (; seq_idx < view->n_max_seq; seq_idx++) {
|
for (; seq_idx < view->n_max_seq; seq_idx++) {
|
||||||
cs_curr[seq_idx].seq_id = -1;
|
cs_curr[seq_idx] = -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
|
if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
|
||||||
max_contig_idx = curr_contig_idx;
|
max_contig_idx = curr_contig_idx;
|
||||||
max_contig = kv_cells.size() - curr_contig_idx;
|
max_contig = kv_cells.size() - curr_contig_idx;
|
||||||
}
|
}
|
||||||
view->max_contiguous_cells = max_contig;
|
view->max_contiguous = max_contig;
|
||||||
view->max_contiguous_cells_idx = max_contig_idx;
|
view->max_contiguous_idx = max_contig_idx;
|
||||||
view->token_count = token_count;
|
view->token_count = token_count;
|
||||||
view->used_cells = used_cells;
|
view->used_cells = used_cells;
|
||||||
if (uint32_t(used_cells) != ctx->kv_self.used) {
|
if (uint32_t(used_cells) != ctx->kv_self.used) {
|
||||||
|
|
38
llama.h
38
llama.h
|
@ -361,30 +361,52 @@ extern "C" {
|
||||||
// KV cache
|
// KV cache
|
||||||
//
|
//
|
||||||
|
|
||||||
|
// Information associated with an individual cell in the KV cache view.
|
||||||
struct llama_kv_cache_view_cell {
|
struct llama_kv_cache_view_cell {
|
||||||
|
// The position for this cell. Takes KV cache shifts into account.
|
||||||
|
// May be negative if the cell is not populated.
|
||||||
llama_pos pos;
|
llama_pos pos;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_kv_cache_view_cell_sequence {
|
// An updateable view of the KV cache.
|
||||||
// Would like to have token_id here as well.
|
|
||||||
llama_seq_id seq_id;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct llama_kv_cache_view {
|
struct llama_kv_cache_view {
|
||||||
|
// Number of KV cache cells. This will be the same as the context size.
|
||||||
int32_t n_cells;
|
int32_t n_cells;
|
||||||
|
|
||||||
|
// Maximum number of sequences that can exist in a cell. It's not an error
|
||||||
|
// if there are more sequences in a cell than this value, however they will
|
||||||
|
// not be visible in the view cells_sequences.
|
||||||
int32_t n_max_seq;
|
int32_t n_max_seq;
|
||||||
|
|
||||||
|
// Number of tokens in the cache. For example, if there are two populated
|
||||||
|
// cells, the first with 1 sequence id in it and the second with 2 sequence
|
||||||
|
// ids then you'll have 3 tokens.
|
||||||
int32_t token_count;
|
int32_t token_count;
|
||||||
|
|
||||||
|
// Number of populated cache cells.
|
||||||
int32_t used_cells;
|
int32_t used_cells;
|
||||||
int32_t max_contiguous_cells;
|
|
||||||
int32_t max_contiguous_cells_idx;
|
// Maximum contiguous empty slots in the cache.
|
||||||
|
int32_t max_contiguous;
|
||||||
|
|
||||||
|
// Index to the start of the max_contiguous slot range. Can be negative
|
||||||
|
// when cache is full.
|
||||||
|
int32_t max_contiguous_idx;
|
||||||
|
|
||||||
|
// Information for an individual cell.
|
||||||
struct llama_kv_cache_view_cell * cells;
|
struct llama_kv_cache_view_cell * cells;
|
||||||
struct llama_kv_cache_view_cell_sequence * cells_sequences;
|
|
||||||
|
// The sequences for each cell. There will be n_max_seq items per cell.
|
||||||
|
llama_seq_id * cells_sequences;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Create an empty KV cache view.
|
||||||
LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
|
LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
|
||||||
|
|
||||||
|
// Free a KV cache view.
|
||||||
LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
|
LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
|
||||||
|
|
||||||
|
// Update the KV cache view structure with the current state of the KV cache.
|
||||||
LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
|
LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
|
||||||
|
|
||||||
// Returns the number of tokens in the KV cache (slow, use only for debug)
|
// Returns the number of tokens in the KV cache (slow, use only for debug)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue