llama : add llama_kv_cache_defrag

This commit is contained in:
Georgi Gerganov 2024-02-25 15:00:45 +02:00
parent 9ec749df59
commit 65f21ec5d3
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
3 changed files with 383 additions and 231 deletions

View file

@@ -183,6 +183,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
llama_kv_cache_defrag (ctx);
llama_kv_cache_update (ctx); llama_kv_cache_update (ctx);
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
@@ -213,6 +214,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
llama_kv_cache_defrag (ctx);
llama_kv_cache_update (ctx); llama_kv_cache_update (ctx);
n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;

195
llama.cpp
View file

@@ -1722,6 +1722,7 @@ struct llama_kv_cell {
// ring-buffer of cached KV data // ring-buffer of cached KV data
struct llama_kv_cache { struct llama_kv_cache {
bool has_shift = false; bool has_shift = false;
bool do_defrag = false;
// Note: The value of head isn't only used to optimize searching // Note: The value of head isn't only used to optimize searching
// for a free KV slot. llama_decode_internal also uses it, so it // for a free KV slot. llama_decode_internal also uses it, so it
@@ -2278,6 +2279,10 @@ static void llama_kv_cache_compress(struct llama_kv_cache & cache, llama_pos del
cache.compress_delta = delta; cache.compress_delta = delta;
} }
// mark the cache for defragmentation; the actual work is performed lazily,
// on the next llama_kv_cache_update() / llama_decode() (see do_defrag consumers)
static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
cache.do_defrag = true;
}
// //
// model loading and saving // model loading and saving
// //
@@ -8029,29 +8034,7 @@ static int llama_decode_internal(
return 0; return 0;
} }
static void llama_kv_cache_update_internal(struct llama_context & lctx) { // summary:
// apply K-shift if needed
if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
llama_set_k_shift(lctx);
{
ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
}
{
auto & kv_self = lctx.kv_self;
kv_self.has_shift = false;
for (uint32_t i = 0; i < kv_self.size; ++i) {
kv_self.cells[i].delta = 0;
}
}
}
// compress the KV cache data if needed:
// //
// - determine which KV cell pairs (i0, i1) to merge: // - determine which KV cell pairs (i0, i1) to merge:
// //
@@ -8067,7 +8050,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
// //
// as a side effect, the new KV cache is defragmented // as a side effect, the new KV cache is defragmented
// //
if (lctx.kv_self.compress_delta >= 0) { static void llama_kv_cache_compress_internal(struct llama_context & lctx) {
auto & kv_self = lctx.kv_self; auto & kv_self = lctx.kv_self;
const auto & hparams = lctx.model.hparams; const auto & hparams = lctx.model.hparams;
@@ -8080,13 +8063,13 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
const uint32_t n_head_kv = hparams.n_head_kv; GGML_UNUSED(n_head_kv); const uint32_t n_head_kv = hparams.n_head_kv; GGML_UNUSED(n_head_kv);
const uint32_t kv_size = kv_self.size; const uint32_t kv_size = kv_self.size;
const int64_t t_start = ggml_time_us();
std::vector<uint8_t> buf_q; std::vector<uint8_t> buf_q;
std::vector<float> buf_src_f32; std::vector<float> buf_src_f32;
std::vector<float> buf_dst_f32; std::vector<float> buf_dst_f32;
const int64_t t_start = ggml_time_us();
struct c_pair { uint32_t i0, i1; }; struct c_pair { uint32_t i0, i1; };
struct c_info { bool merged; uint32_t id, cnt, r; }; struct c_info { bool merged; uint32_t id, cnt, r; };
@@ -8283,8 +8266,162 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
const int64_t t_end = ggml_time_us(); const int64_t t_end = ggml_time_us();
LLAMA_LOG_INFO("(tmp log) KV compress time: %.3f ms\n", (t_end - t_start)/1000.0); LLAMA_LOG_INFO("(tmp log) KV compress time: %.3f ms\n", (t_end - t_start)/1000.0);
}
kv_self.compress_delta = -1; // copy the KV cache to the host memory and reshuffle the cells to the beginning of the cache
// removing any empty segments that may have been left by previous KV cache operations
// TODO: optimizations are possible:
// - multiple threads
// - avoid copying to the host memory when already there
// TODO: can we do all this on-device?
// compact the KV cache: move all non-empty cells to the front so the used
// region is contiguous, then apply the same permutation to each layer's
// K/V tensor data via a host-side round trip
static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
auto & kv_self = lctx.kv_self;
const auto & hparams = lctx.model.hparams;
const uint32_t n_layer = hparams.n_layer;
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
// n_kv: presumably an upper bound on indices of cells currently in use
// (based on the name llama_kv_cache_cell_max - TODO confirm)
const uint32_t n_kv = llama_kv_cache_cell_max(kv_self);
const uint32_t kv_size = kv_self.size;
const int64_t t_start = ggml_time_us();
// host-side staging buffers, reused across layers (resize keeps capacity)
std::vector<uint8_t> buf_k;
std::vector<uint8_t> buf_v;
// the destination cell in the new KV cache
uint32_t id = 0;
// number of cells moved
uint32_t n_moves = 0;
// determine which KV cells to move where
// ids[i] == n_kv marks cell i as empty (no destination)
std::vector<uint32_t> ids(n_kv, n_kv);
for (uint32_t i0 = 0; i0 < n_kv; ++i0) {
const auto & cell0 = kv_self.cells[i0];
if (!cell0.is_empty()) {
ids[i0] = id;
if (i0 != id) {
// compact the cell metadata in place; safe because id <= i0
kv_self.cells[id] = cell0;
n_moves++;
}
id++;
}
}
// already contiguous - nothing to do
if (n_moves == 0) {
return;
}
LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
// after compaction, exactly cells [0, id) are occupied
kv_self.head = id;
kv_self.used = id;
// zero the rest of the cells
for (uint32_t i = id; i < n_kv; ++i) {
kv_self.cells[i] = llama_kv_cell();
}
for (uint32_t il = 0; il < n_layer; ++il) {
// K is stored row-per-cell; k_size_row is one cell's worth of K data
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);
buf_k.resize(k_size);
buf_v.resize(v_size);
// pull the whole layer's K and V data to host memory
ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
// batch move [i, i+nm) to [id, id+nm)
// note: cells can move only to a lower index
for (uint32_t i = 0; i < n_kv; ++i) {
// NOTE(review): this 'id' shadows the outer compaction counter 'id'
const uint32_t id = ids[i];
if (i == id || id == n_kv) {
continue;
}
// grow the run while source/destination stay contiguous, so one
// memcpy can move several cells at once
uint32_t nm = 1;
while (i + nm < n_kv && ids[i + nm] == id + nm) {
nm++;
}
// move keys
{
const int64_t os = i*k_size_row;
const int64_t od = id*k_size_row;
memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
}
// move values (note: they are transposed)
{
const int64_t os = i;
const int64_t od = id;
// V is transposed: elements of one cell are strided by kv_size,
// so each of the n_embd_v_gqa rows needs its own copy
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
}
}
// skip the cells already handled as part of this run
i += nm - 1;
}
// push the compacted data back to the backend
ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
}
const int64_t t_end = ggml_time_us();
LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
}
// apply all pending lazy KV cache operations for this context:
// K-shift (RoPE), compression, and defragmentation, in that order
static void llama_kv_cache_update_internal(struct llama_context & lctx) {
    // apply K-shift if needed
    if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
        llama_set_k_shift(lctx);

        {
            ggml_cgraph * gf = llama_build_graph_k_shift(lctx);

            llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
        }

        // the shift has been applied to the K data - reset the bookkeeping
        {
            auto & kv_self = lctx.kv_self;

            kv_self.has_shift = false;

            for (uint32_t i = 0; i < kv_self.size; ++i) {
                kv_self.cells[i].delta = 0;
            }
        }
    }

    // compress the KV cache data if needed
    if (lctx.kv_self.compress_delta >= 0) {
        llama_kv_cache_compress_internal(lctx);

        lctx.kv_self.compress_delta = -1;
        // compression defragments the cache as a side effect,
        // so a pending defrag request is satisfied here as well
        lctx.kv_self.do_defrag = false;
    }

    // defragment the KV cache if needed
    if (lctx.kv_self.do_defrag) {
        llama_kv_cache_defrag_internal(lctx);

        lctx.kv_self.do_defrag = false;
    }
}
@@ -12360,6 +12497,10 @@ void llama_kv_cache_compress(struct llama_context * ctx, llama_pos delta) {
llama_kv_cache_compress(ctx->kv_self, delta); llama_kv_cache_compress(ctx->kv_self, delta);
} }
// public API: request a lazy defragmentation of the context's KV cache;
// forwards to the internal overload that sets kv_self.do_defrag
void llama_kv_cache_defrag(struct llama_context * ctx) {
llama_kv_cache_defrag(ctx->kv_self);
}
void llama_kv_cache_update(struct llama_context * ctx) { void llama_kv_cache_update(struct llama_context * ctx) {
llama_kv_cache_update_internal(*ctx); llama_kv_cache_update_internal(*ctx);
} }

11
llama.h
View file

@@ -555,11 +555,20 @@ extern "C" {
llama_seq_id seq_id); llama_seq_id seq_id);
// [EXPERIMENTAL] Compress the data in the KV cache // [EXPERIMENTAL] Compress the data in the KV cache
// This will be applied:
// - lazily on next llama_decode()
// - explicitly with llama_kv_cache_update()
LLAMA_API void llama_kv_cache_compress( LLAMA_API void llama_kv_cache_compress(
struct llama_context * ctx, struct llama_context * ctx,
llama_pos delta); llama_pos delta);
// Apply the KV cache updates (such as K-shifts) to the KV data // Defragment the KV cache
// This will be applied:
// - lazily on next llama_decode()
// - explicitly with llama_kv_cache_update()
LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
LLAMA_API void llama_kv_cache_update(struct llama_context * ctx); LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
// //