diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 47de67a93..2cbc9e1fa 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -182,7 +182,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - llama_kv_cache_defrag (ctx); + //llama_kv_cache_defrag (ctx); llama_kv_cache_update (ctx); n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; @@ -213,7 +213,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - llama_kv_cache_defrag (ctx); + //llama_kv_cache_defrag (ctx); llama_kv_cache_update (ctx); n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; diff --git a/llama.cpp b/llama.cpp index 28430254f..1e375af47 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5114,16 +5114,16 @@ struct llm_build_context { struct ggml_cgraph * build_defrag(const std::vector & ids) { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - for (int i = 0; i < n_kv; ++i) { - const int id = ids[i]; + for (uint32_t i = 0; i < ids.size(); ++i) { + const uint32_t id = ids[i]; - if (i == id || id == n_kv) { + if (i == id || id == ids.size()) { continue; } - int nm = 1; + uint32_t nm = 1; - while (i + nm < n_kv && (int) ids[i + nm] == id + nm) { + while (i + nm < ids.size() && ids[i + nm] == id + nm) { nm++; } @@ -5155,6 +5155,8 @@ struct llm_build_context { i += nm - 1; } + //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); + return gf; } @@ -7935,6 +7937,8 @@ static int llama_decode_internal( batch.seq_id = seq_id_arr.data(); } + llama_kv_cache_update(&lctx); + // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it if (kv_self.head > kv_self.used + 2*n_tokens) { @@ -7953,8 +7957,6 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - llama_kv_cache_update(&lctx); - ggml_backend_sched_reset(lctx.sched); ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -8004,6 +8006,19 @@ static int llama_decode_internal( } } + // decide if we need to defrag the kv cache + // TODO: should become configurable + { + const float fragmentation = kv_self.n >= 512 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f; + + // queue defragmentation for next llama_kv_cache_update + if (fragmentation > 0.1f) { + LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); + + llama_kv_cache_defrag(kv_self); + } + } + #ifdef GGML_PERF // print timing information per ggml operation (for debugging purposes) // requires GGML_PERF to be defined @@ -8095,12 +8110,16 @@ static int llama_decode_internal( static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { auto & kv_self = lctx.kv_self; + const auto & hparams = lctx.model.hparams; + + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_kv = llama_kv_cache_cell_max(kv_self); const uint32_t n_used = kv_self.used; assert(n_used <= n_kv); - const int64_t t_start = ggml_time_us(); + //const int64_t t_start = ggml_time_us(); // number of cells moved uint32_t n_moves = 0; @@ -8124,15 +8143,29 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { // found a hole - fill it with data from the end of the cache - // determine the size of the hole uint32_t nh = 1; + + // determine the size of the hole while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { nh++; } - // starting from the end, find nh non-empty cells + // in the worst case each move requires 6*n_layer tensors + // + // TODO: ideally this should be: + // + // if (6*(n_moves + nh)*n_layer > LLAMA_MAX_NODES) { + // + // but when I do that, the defrag graph can not fit due to not enough memory - not sure why + // + if (6*(n_moves + nh)*n_layer > LLAMA_MAX_NODES/2) { + break; + } + uint32_t nf = 0; uint32_t is = n_kv - 1; + + // starting from the end, find nh non-empty cells for (; is > i0; --is) { const auto & cell1 = kv_self.cells[is]; @@ -8153,11 +8186,17 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { nf = 0; + uint32_t i1 = is; + + // are we moving a continuous block of memory? + bool cont = false; + // go back and move the nf cells to the hole - for (uint32_t i1 = is; i1 < n_kv; ++i1) { - const auto & cell1 = kv_self.cells[i1]; + for (; i1 < n_kv; ++i1) { + auto & cell1 = kv_self.cells[i1]; if (cell1.is_empty() || ids[i1] != n_kv) { + cont = false; continue; } @@ -8167,11 +8206,23 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { // move the cell meta data kv_self.cells[i0 + nf] = cell1; - n_moves++; + // clear the old cell and move the head there + cell1 = llama_kv_cell(); + kv_self.head = n_used; + + if (!cont) { + n_moves++; + cont = true; + } + nf++; + + if (nf == nh) { + break; + } } - LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, n_kv, i0, i0 + nh); + //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); i0 += nh - 1; } @@ -8180,15 +8231,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { return; } - LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); + //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); - kv_self.head = n_used; - kv_self.used = n_used; - - // zero the rest of the cells - for (uint32_t i = n_used; i < n_kv; ++i) { - kv_self.cells[i] = llama_kv_cell(); - } + //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); #if 0 // CPU defrag @@ -8200,9 +8245,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { // likely not worth the effort, as we have ggml_graph based defrag // - const auto & hparams = lctx.model.hparams; - - const uint32_t n_layer = hparams.n_layer; const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); @@ -8271,9 +8313,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { llama_graph_compute(lctx, gf, lctx.cparams.n_threads); #endif - const int64_t t_end = ggml_time_us(); + //const int64_t t_end = ggml_time_us(); - LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); + //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); } static void llama_kv_cache_update_internal(struct llama_context & lctx) {