From f78b396ee7a5d4c47cf3e3a8cb9fb02a4d3fe250 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 13 Jan 2025 14:13:11 +0200 Subject: [PATCH 01/28] llama : add struct llama_kv_cache (wip) [no ci] --- common/common.cpp | 6 +- common/speculative.cpp | 10 +- examples/embedding/embedding.cpp | 5 +- include/llama.h | 79 +++---- src/llama-context.cpp | 16 +- src/llama-kv-cache.cpp | 286 ++++--------------------- src/llama-kv-cache.h | 350 ++++++++++++++++++++++++++----- src/llama.cpp | 91 ++------ 8 files changed, 428 insertions(+), 415 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 6dea8e3d2..29de45189 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -952,7 +952,9 @@ struct common_init_result common_init_from_params(common_params & params) { return iparams; } - if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) { + llama_kv_cache * kv = llama_get_kv_cache(lctx); + + if (params.ctx_shift && !llama_kv_cache_can_shift(kv)) { LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__); params.ctx_shift = false; } @@ -1057,7 +1059,7 @@ struct common_init_result common_init_from_params(common_params & params) { if (llama_model_has_decoder(model)) { llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch))); } - llama_kv_cache_clear(lctx); + llama_kv_cache_clear(kv); llama_synchronize(lctx); llama_perf_context_reset(lctx); } diff --git a/common/speculative.cpp b/common/speculative.cpp index 318e96ea3..6ac058517 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -171,8 +171,10 @@ llama_tokens common_speculative_gen_draft( llama_tokens result; result.reserve(params.n_draft); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + if (reuse_n == 0) { - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); prompt.clear(); } else { @@ -191,14 +193,14 @@ llama_tokens common_speculative_gen_draft( } if (reuse_i > 0) { - llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i); - llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i); + llama_kv_cache_seq_rm (kv, 0, 0, reuse_i); + llama_kv_cache_seq_add(kv, 0, reuse_i, -1, -reuse_i); prompt.erase(prompt.begin(), prompt.begin() + reuse_i); } if (reuse_n < (int) prompt.size()) { - llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1); + llama_kv_cache_seq_rm (kv, 0, reuse_n, -1); prompt.erase(prompt.begin() + reuse_n, prompt.end()); } diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 38d22c90f..fda0949f1 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -34,10 +34,11 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) { const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); - const struct llama_model * model = llama_get_model(ctx); + const llama_model * model = llama_get_model(ctx); + llama_kv_cache * kv = llama_get_kv_cache(ctx); // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/include/llama.h b/include/llama.h index 3b75e7607..08b8658ad 100644 --- a/include/llama.h +++ b/include/llama.h @@ -60,6 +60,7 @@ extern "C" { struct llama_model; struct llama_context; struct llama_sampler; + struct llama_kv_cache; typedef int32_t llama_pos; typedef 
int32_t llama_token; @@ -467,8 +468,9 @@ extern "C" { DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead"); - LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); - LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); + LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); // TODO: remove const? + LLAMA_API struct llama_kv_cache * llama_get_kv_cache( struct llama_context * ctx); + LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model); @@ -584,7 +586,7 @@ extern "C" { // KV cache // - // TODO: remove llama_kv_cache_view_* API + // TODO: start using struct llama_kv_cache // Information associated with an individual cell in the KV cache view. struct llama_kv_cache_view_cell { @@ -639,14 +641,20 @@ extern "C" { // Returns the number of tokens in the KV cache (slow, use only for debug) // If a KV cell has multiple sequences assigned to it, it will be counted multiple times - LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx); + LLAMA_API int32_t llama_kv_cache_n_tokens(const struct llama_kv_cache * kv); + + DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx), + "use llama_kv_cache_n_tokens instead"); // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) - LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx); + LLAMA_API int32_t llama_kv_cache_used_cells(const struct llama_kv_cache * kv); + + DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx), + "use llama_kv_cache_used_cells instead"); // Clear the KV cache - both cell info is erased and KV data is zeroed LLAMA_API void llama_kv_cache_clear( - struct llama_context * ctx); + struct llama_kv_cache * kv); // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) // Returns false if a partial sequence cannot be removed. 
Removing a whole sequence never fails @@ -654,26 +662,26 @@ extern "C" { // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API bool llama_kv_cache_seq_rm( - struct llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1); + struct llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1); // Copy all tokens that belong to the specified sequence to another sequence // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_cp( - struct llama_context * ctx, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1); + struct llama_kv_cache * kv, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1); // Removes all tokens that do not belong to the specified sequence LLAMA_API void llama_kv_cache_seq_keep( - struct llama_context * ctx, - llama_seq_id seq_id); + struct llama_kv_cache * kv, + llama_seq_id seq_id); // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) // If the KV cache is RoPEd, the KV data is updated accordingly: @@ -682,11 +690,11 @@ extern "C" { // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_add( - struct llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta); + struct llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta); // Integer division of the positions by factor of `d > 1` // If the KV cache is RoPEd, the KV data is updated accordingly: @@ -695,31 +703,28 @@ extern "C" { // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_div( - struct llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d); + struct llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d); // Returns the largest position present in the KV cache for the specified sequence LLAMA_API llama_pos llama_kv_cache_seq_pos_max( - struct llama_context * ctx, - llama_seq_id seq_id); - - // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache - // how to avoid this? + struct llama_kv_cache * kv, + llama_seq_id seq_id); // Defragment the KV cache // This will be applied: // - lazily on next llama_decode() // - explicitly with llama_kv_cache_update() - LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx); - - // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) - LLAMA_API void llama_kv_cache_update(struct llama_context * ctx); + LLAMA_API void llama_kv_cache_defrag(struct llama_kv_cache * kv); // Check if the context supports KV cache shifting - LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx); + LLAMA_API bool llama_kv_cache_can_shift(const struct llama_kv_cache * kv); + + // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) 
+ LLAMA_API void llama_update_kv_cache(struct llama_context * ctx, struct llama_kv_cache * kv); // // State / sessions diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 671d2a81a..bf5a77cca 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -602,11 +602,15 @@ uint32_t llama_n_seq_max(const struct llama_context * ctx) { return ctx->kv_self.size; } -const struct llama_model * llama_get_model(const struct llama_context * ctx) { +const llama_model * llama_get_model(const llama_context * ctx) { return &ctx->model; } -enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) { +llama_kv_cache * llama_get_kv_cache(llama_context * ctx) { + return &ctx->kv_self; +} + +enum llama_pooling_type llama_pooling_type(const llama_context * ctx) { return ctx->cparams.pooling_type; } @@ -1142,7 +1146,7 @@ struct llama_data_read { if (dest_seq_id != -1) { // single sequence - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); + kv_self.seq_rm(dest_seq_id, -1, -1); llama_ubatch batch = ctx->sbatch.reserve_ubatch(cell_count, /* has_embd */ false); batch.n_tokens = cell_count; @@ -1185,7 +1189,7 @@ struct llama_data_read { return false; } - llama_kv_cache_clear(kv_self); + kv_self.clear(); for (uint32_t i = 0; i < cell_count; ++i) { llama_kv_cell & cell = kv_self.cells[i]; @@ -1362,9 +1366,9 @@ struct llama_data_read { if (!res) { if (seq_id == -1) { - llama_kv_cache_clear(ctx); + ctx->kv_self.clear(); } else { - llama_kv_cache_seq_rm(ctx, seq_id, -1, -1); + ctx->kv_self.seq_rm(seq_id, -1, -1); } throw std::runtime_error("failed to restore kv cache"); } diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index feffdf0de..b0d5a9318 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -350,277 +350,67 @@ uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { return 0; } -void llama_kv_cache_clear(struct llama_kv_cache & cache) { - for (int32_t i = 0; i < (int32_t) cache.size; ++i) { - cache.cells[i].pos = -1; - cache.cells[i].seq_id.clear(); - cache.cells[i].src = -1; - cache.cells[i].tail = -1; - } - cache.head = 0; - cache.used = 0; - - for (auto & buf : cache.bufs) { - ggml_backend_buffer_clear(buf.get(), 0); - } +void llama_kv_cache_clear(llama_kv_cache * kv) { + kv->clear(); } bool llama_kv_cache_seq_rm( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1) { - uint32_t new_head = cache.size; - - if (p0 < 0) p0 = 0; - if (p1 < 0) p1 = std::numeric_limits::max(); - - // models like Mamba or RWKV can't have a state partially erased - if (cache.recurrent) { - if (seq_id >= (int64_t) cache.size) { - // could be fatal - return false; - } - if (0 <= seq_id) { - int32_t & tail_id = cache.cells[seq_id].tail; - if (tail_id >= 0) { - const llama_kv_cell & cell = cache.cells[tail_id]; - // partial intersection is invalid - if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { - return false; - } - // invalidate tails which will be cleared - if (p0 <= cell.pos && cell.pos < p1) { - tail_id = -1; - } - } - } else { - // seq_id is negative, then the range should include everything or nothing - if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { - return false; - } - } - } - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - if (seq_id < 0) { - cache.cells[i].seq_id.clear(); - } else if (cache.cells[i].has_seq_id(seq_id)) { - cache.cells[i].seq_id.erase(seq_id); - } else { - continue; - } - if (cache.cells[i].is_empty()) 
{ - // keep count of the number of used cells - if (cache.cells[i].pos >= 0) cache.used--; - - cache.cells[i].pos = -1; - cache.cells[i].src = -1; - if (new_head == cache.size) new_head = i; - } - } - } - - // If we freed up a slot, set head to it so searching can start there. - if (new_head != cache.size && new_head < cache.head) cache.head = new_head; - - return true; + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return kv->seq_rm(seq_id, p0, p1); } void llama_kv_cache_seq_cp( - struct llama_kv_cache & cache, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { - if (p0 < 0) p0 = 0; - if (p1 < 0) p1 = std::numeric_limits::max(); - - if (cache.recurrent) { - if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) { - llama_kv_cell & tail_src = cache.cells[seq_id_src]; - llama_kv_cell & tail_dst = cache.cells[seq_id_dst]; - if (tail_dst.tail >= 0) { - // clear destination seq_id if it wasn't empty - llama_kv_cell & cell_dst = cache.cells[tail_dst.tail]; - - cell_dst.seq_id.erase(seq_id_dst); - tail_dst.tail = -1; - if (cell_dst.seq_id.empty()) { - cell_dst.pos = -1; - cell_dst.delta = -1; - cell_dst.src = -1; - cache.used -= 1; - } - } - if (tail_src.tail >= 0) { - llama_kv_cell & cell_src = cache.cells[tail_src.tail]; - - cell_src.seq_id.insert(seq_id_dst); - tail_dst.tail = tail_src.tail; - } - } - - return; - } - // otherwise, this is the KV cache of a Transformer-like model - - cache.head = 0; - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.cells[i].seq_id.insert(seq_id_dst); - } - } + llama_kv_cache * kv, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + kv->seq_cp(seq_id_src, seq_id_dst, p0, p1); } -void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) { - uint32_t new_head = cache.size; - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.recurrent && (llama_seq_id) i != seq_id) { - cache.cells[i].tail = -1; - } - if (!cache.cells[i].has_seq_id(seq_id)) { - if (cache.cells[i].pos >= 0) cache.used--; - cache.cells[i].pos = -1; - cache.cells[i].src = -1; - cache.cells[i].seq_id.clear(); - if (new_head == cache.size) new_head = i; - } else { - cache.cells[i].seq_id.clear(); - cache.cells[i].seq_id.insert(seq_id); - } - } - - // If we freed up a slot, set head to it so searching can start there. - if (new_head != cache.size && new_head < cache.head) cache.head = new_head; +void llama_kv_cache_seq_keep(llama_kv_cache * kv, llama_seq_id seq_id) { + kv->seq_keep(seq_id); } void llama_kv_cache_seq_add( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta) { - uint32_t new_head = cache.size; - - if (p0 < 0) p0 = 0; - if (p1 < 0) p1 = std::numeric_limits::max(); - // If there is no range then return early to avoid looping over the cache. 
- if (p0 == p1) return; - - if (cache.recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be shifted - if (0 <= seq_id && seq_id < (int64_t) cache.size) { - const int32_t tail_id = cache.cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cache.cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos += delta; - } - } - } - return; - } - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.has_shift = true; - cache.cells[i].pos += delta; - cache.cells[i].delta += delta; - - if (cache.cells[i].pos < 0) { - if (!cache.cells[i].is_empty()) { - cache.used--; - } - cache.cells[i].pos = -1; - cache.cells[i].seq_id.clear(); - if (new_head == cache.size) { - new_head = i; - } - } - } - } - - // If we freed up a slot, set head to it so searching can start there. - // Otherwise we just start the next search from the beginning. - cache.head = new_head != cache.size ? new_head : 0; + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + kv->seq_add(seq_id, p0, p1, delta); } void llama_kv_cache_seq_div( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d) { - if (p0 < 0) p0 = 0; - if (p1 < 0) p1 = std::numeric_limits::max(); - // If there is no range then return early to avoid looping over the cache. - if (p0 == p1) return; - - if (cache.recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be changed - if (0 <= seq_id && seq_id < (int64_t) cache.size) { - const int32_t tail_id = cache.cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cache.cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos /= d; - } - } - } - return; - } - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.has_shift = true; - - { - llama_pos p_old = cache.cells[i].pos; - cache.cells[i].pos /= d; - cache.cells[i].delta += cache.cells[i].pos - p_old; - } - } - } + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + kv->seq_div(seq_id, p0, p1, d); } -llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) { - llama_pos result = 0; - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id)) { - result = std::max(result, cache.cells[i].pos); - } - } - - return result; +llama_pos llama_kv_cache_seq_pos_max(llama_kv_cache * kv, llama_seq_id seq_id) { + return kv->seq_pos_max(seq_id); } -void llama_kv_cache_defrag(struct llama_kv_cache & cache) { - if (!cache.recurrent) { - cache.do_defrag = true; - } +void llama_kv_cache_defrag(llama_kv_cache * kv) { + kv->defrag(); } -int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv) { - int result = 0; - - for (uint32_t i = 0; i < kv.size; i++) { - result += kv.cells[i].seq_id.size(); - } - - return result; +int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { + return kv->n_tokens(); } -int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv) { - return kv.used; +int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { + return kv->used; } -bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv) { - return kv.can_shift; +bool llama_kv_cache_can_shift(const llama_kv_cache * kv) { + return kv->can_shift; } // @@ -632,7 
+422,7 @@ struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache /*.n_cells = */ 0, /*.n_seq_max = */ n_seq_max, /*.token_count = */ 0, - /*.used_cells = */ llama_get_kv_cache_used_cells(kv), + /*.used_cells = */ llama_kv_cache_used_cells(&kv), /*.max_contiguous = */ 0, /*.max_contiguous_idx = */ -1, /*.cells = */ nullptr, diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index dca6f3998..b0bb1cfb1 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -57,6 +57,16 @@ struct llama_kv_cache { std::vector ctxs; std::vector bufs; + int32_t n_tokens() const { + int32_t result = 0; + + for (uint32_t i = 0; i < size; i++) { + result += cells[i].seq_id.size(); + } + + return result; + } + size_t total_size() const { size_t size = 0; for (const auto & buf : bufs) { @@ -75,6 +85,297 @@ struct llama_kv_cache { return max_pos; } + + void clear() { + for (int32_t i = 0; i < (int32_t) size; ++i) { + cells[i].pos = -1; + cells[i].seq_id.clear(); + cells[i].src = -1; + cells[i].tail = -1; + } + head = 0; + used = 0; + + for (auto & buf : bufs) { + ggml_backend_buffer_clear(buf.get(), 0); + } + } + + bool seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // models like Mamba or RWKV can't have a state partially erased + if (recurrent) { + if (seq_id >= (int64_t) size) { + // could be fatal + return false; + } + if (0 <= seq_id) { + int32_t & tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + const llama_kv_cell & cell = cells[tail_id]; + // partial intersection is invalid + if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { + return false; + } + // invalidate tails which will be cleared + if (p0 <= cell.pos && cell.pos < p1) { + tail_id = -1; + } + } + } else { + // seq_id is negative, then the range should include everything or nothing + if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { + return false; + } + } + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].pos >= p0 && cells[i].pos < p1) { + if (seq_id < 0) { + cells[i].seq_id.clear(); + } else if (cells[i].has_seq_id(seq_id)) { + cells[i].seq_id.erase(seq_id); + } else { + continue; + } + if (cells[i].is_empty()) { + // keep count of the number of used cells + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. 
+ if (new_head != size && new_head < head) { + head = new_head; + } + + return true; + } + + void seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (seq_id_src == seq_id_dst) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + if (recurrent) { + if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { + llama_kv_cell & tail_src = cells[seq_id_src]; + llama_kv_cell & tail_dst = cells[seq_id_dst]; + if (tail_dst.tail >= 0) { + // clear destination seq_id if it wasn't empty + llama_kv_cell & cell_dst = cells[tail_dst.tail]; + + cell_dst.seq_id.erase(seq_id_dst); + tail_dst.tail = -1; + if (cell_dst.seq_id.empty()) { + cell_dst.pos = -1; + cell_dst.delta = -1; + cell_dst.src = -1; + used -= 1; + } + } + if (tail_src.tail >= 0) { + llama_kv_cell & cell_src = cells[tail_src.tail]; + + cell_src.seq_id.insert(seq_id_dst); + tail_dst.tail = tail_src.tail; + } + } + + return; + } + + // otherwise, this is the KV of a Transformer-like model + head = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) { + cells[i].seq_id.insert(seq_id_dst); + } + } + } + + void seq_keep(llama_seq_id seq_id) { + uint32_t new_head = size; + + for (uint32_t i = 0; i < size; ++i) { + if (recurrent && (llama_seq_id) i != seq_id) { + cells[i].tail = -1; + } + + if (!cells[i].has_seq_id(seq_id)) { + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + cells[i].seq_id.clear(); + + if (new_head == size){ + new_head = i; + } + } else { + cells[i].seq_id.clear(); + cells[i].seq_id.insert(seq_id); + } + } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != size && new_head < head) { + head = new_head; + } + } + + void seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + if (delta == 0) { + return; + } + + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the + if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be shifted + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos += delta; + } + } + } + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + cells[i].pos += delta; + cells[i].delta += delta; + + if (cells[i].pos < 0) { + if (!cells[i].is_empty()) { + used--; + } + cells[i].pos = -1; + cells[i].seq_id.clear(); + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. + // Otherwise we just start the next search from the beginning. + head = new_head != size ? new_head : 0; + } + + void seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + if (d == 1) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the cache. 
+ if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be changed + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos /= d; + } + } + } + + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + + { + llama_pos p_old = cells[i].pos; + cells[i].pos /= d; + cells[i].delta += cells[i].pos - p_old; + } + } + } + } + + llama_pos seq_pos_max(llama_seq_id seq_id) { + llama_pos result = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id)) { + result = std::max(result, cells[i].pos); + } + } + + return result; + } + + void defrag() { + if (!recurrent) { + do_defrag = true; + } + } }; // a structure holds information about the slot found in llama_kv_cache_find_slot @@ -112,51 +413,6 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // find how many cells are currently in use uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache); -void llama_kv_cache_clear(struct llama_kv_cache & cache); - -bool llama_kv_cache_seq_rm( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1); - -void llama_kv_cache_seq_cp( - struct llama_kv_cache & cache, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1); - -void llama_kv_cache_seq_keep( - struct llama_kv_cache & cache, - llama_seq_id seq_id); - -void llama_kv_cache_seq_add( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta); - -void llama_kv_cache_seq_div( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d); - -llama_pos llama_kv_cache_seq_pos_max( - struct llama_kv_cache & cache, - llama_seq_id seq_id); - -void llama_kv_cache_defrag(struct llama_kv_cache & cache); - -int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv); - -int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv); - -bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv); - // // kv cache view // @@ -206,10 +462,10 @@ struct llama_kv_slot_restorer { cache.n = old_state.n; if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased - llama_kv_cache_seq_rm(cache, -1, -1, -1); + cache.seq_rm(-1, -1, -1); } else { for (auto & slot : slot_boundaries) { - llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second); + cache.seq_rm(-1, slot.first, slot.second); } } } diff --git a/src/llama.cpp b/src/llama.cpp index 094157ccf..87dd512b2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8564,7 +8564,7 @@ static int llama_decode_impl( // non-causal masks do not use the KV cache if (hparams.causal_attn) { - llama_kv_cache_update(&lctx); + llama_update_kv_cache(&lctx, &lctx.kv_self); // TODO: lctx->update_kv_cache() // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -8760,7 +8760,7 @@ static int llama_decode_impl( if (fragmentation > cparams.defrag_thold) { //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); - llama_kv_cache_defrag(kv_self); + kv_self.defrag(); } } @@ -9182,11 +9182,11 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { //LLAMA_LOG_INFO("(tmp log) 
KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); } -static void llama_kv_cache_update_impl(struct llama_context & lctx) { +static void llama_update_kv_cache_impl(llama_context & lctx, llama_kv_cache & kv) { bool need_reserve = false; - if (lctx.kv_self.has_shift) { - if (!llama_kv_cache_can_shift(&lctx)) { + if (kv.has_shift) { + if (!kv.can_shift) { GGML_ABORT("The current context does not support K-shift"); } @@ -9206,23 +9206,21 @@ static void llama_kv_cache_update_impl(struct llama_context & lctx) { } { - auto & kv_self = lctx.kv_self; + kv.has_shift = false; - kv_self.has_shift = false; - - for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = 0; + for (uint32_t i = 0; i < kv.size; ++i) { + kv.cells[i].delta = 0; } } } // defragment the KV cache if needed - if (lctx.kv_self.do_defrag) { + if (kv.do_defrag) { llama_kv_cache_defrag_impl(lctx); need_reserve = true; - lctx.kv_self.do_defrag = false; + kv.do_defrag = false; } // reserve a worst case graph again @@ -9845,6 +9843,7 @@ struct llama_context * llama_init_from_model( return ctx; } +// deprecated struct llama_context * llama_new_context_with_model( struct llama_model * model, struct llama_context_params params) { @@ -9855,73 +9854,27 @@ struct llama_context * llama_new_context_with_model( // kv cache // -// TODO: tmp bridges below until `struct llama_kv_cache` is exposed through the public API - -struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) { +struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { return llama_kv_cache_view_init(ctx->kv_self, n_seq_max); } -void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) { +void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { llama_kv_cache_view_update(view, ctx->kv_self); } -int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) { - return llama_get_kv_cache_token_count(ctx->kv_self); +// deprecated +int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { + return llama_kv_cache_n_tokens(&ctx->kv_self); } -int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) { - return llama_get_kv_cache_used_cells(ctx->kv_self); +// deprecated +int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { + return llama_kv_cache_used_cells(&ctx->kv_self); } -void llama_kv_cache_clear(struct llama_context * ctx) { - llama_kv_cache_clear(ctx->kv_self); -} - -bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1); -} - -void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { - if (seq_id_src == seq_id_dst) { - return; - } - llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); -} - -void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { - llama_kv_cache_seq_keep(ctx->kv_self, seq_id); -} - -void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { - if (delta == 0) { - return; - } - - llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta); -} - -void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { - if (d == 1) { - return; - } - - llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); -} 
- -llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id); -} - -void llama_kv_cache_defrag(struct llama_context * ctx) { - llama_kv_cache_defrag(ctx->kv_self); -} - -void llama_kv_cache_update(struct llama_context * ctx) { - llama_kv_cache_update_impl(*ctx); -} - -bool llama_kv_cache_can_shift(struct llama_context * ctx) { - return llama_kv_cache_can_shift(ctx->kv_self); +// TODO: move to llama-context +void llama_update_kv_cache(llama_context * ctx, llama_kv_cache * kv) { + llama_update_kv_cache_impl(*ctx, *kv); } /// From e4550fbafc44403b243fe029937a97a0aed7bbd6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 13 Jan 2025 14:56:52 +0200 Subject: [PATCH 02/28] llama : cont ggml-ci --- examples/batched-bench/batched-bench.cpp | 6 ++-- .../cvector-generator/cvector-generator.cpp | 3 +- examples/gritlm/gritlm.cpp | 8 +++-- examples/imatrix/imatrix.cpp | 4 ++- examples/infill/infill.cpp | 6 ++-- examples/llama-bench/llama-bench.cpp | 6 ++-- examples/lookahead/lookahead.cpp | 13 ++++---- examples/lookup/lookup.cpp | 3 +- examples/main/main.cpp | 14 +++++---- examples/parallel/parallel.cpp | 11 +++---- examples/passkey/passkey.cpp | 30 ++++++++++--------- examples/perplexity/perplexity.cpp | 24 +++++++++++---- examples/retrieval/retrieval.cpp | 4 ++- examples/run/run.cpp | 7 +++-- examples/save-load-state/save-load-state.cpp | 4 ++- examples/server/server.cpp | 25 +++++++++------- examples/simple-chat/simple-chat.cpp | 6 ++-- .../speculative-simple/speculative-simple.cpp | 4 ++- examples/speculative/speculative.cpp | 29 ++++++++++-------- 19 files changed, 128 insertions(+), 79 deletions(-) diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 0659ab6f1..fcbad37bb 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -57,6 +57,8 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const int32_t n_kv_max = llama_n_ctx(ctx); llama_batch batch = llama_batch_init(n_kv_max, 0, 1); @@ -132,7 +134,7 @@ int main(int argc, char ** argv) { const auto t_pp_start = ggml_time_us(); - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); @@ -141,7 +143,7 @@ int main(int argc, char ** argv) { if (is_pp_shared) { for (int32_t i = 1; i < pl; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + llama_kv_cache_seq_cp(kv, 0, i, -1, -1); } } diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 413b71d34..adb4a60ad 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -342,7 +342,8 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { } static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { - llama_kv_cache_clear(ctx); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + llama_kv_cache_clear(kv); if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 72eb46257..16437453e 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -13,6 +13,8 @@ static std::vector> encode(llama_context * ctx, const std::ve const llama_model * 
model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); for (uint64_t i = 0; i < sentences.size(); i++) { @@ -45,7 +47,7 @@ static std::vector> encode(llama_context * ctx, const std::ve } // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_set_embeddings(ctx, true); llama_set_causal_attn(ctx, false); @@ -100,9 +102,11 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + llama_token eos_token = llama_vocab_eos(vocab); - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_set_embeddings(ctx, false); llama_set_causal_attn(ctx, true); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index b5f3feb9f..5efe4f019 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -431,6 +431,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const bool add_bos = llama_vocab_get_add_bos(vocab); const int n_ctx = llama_n_ctx(ctx); @@ -497,7 +499,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_batch batch = llama_batch_init(n_batch, 0, 1); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 489a208b6..de8e77695 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -139,6 +139,8 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); const int n_ctx_train = llama_model_n_ctx_train(model); @@ -332,8 +334,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (kv, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_add(kv, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); n_past -= n_discard; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 4ac19ca86..8843c0048 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1575,9 +1575,11 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + test t(inst, lmodel, ctx); - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // cool off before the test if (params.delay) { @@ -1617,7 +1619,7 @@ int main(int argc, char ** argv) { } for (int i = 0; i < params.reps; i++) { - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); uint64_t t_start = get_time_ns(); diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 2f0898e62..1219c2074 100644 --- a/examples/lookahead/lookahead.cpp 
+++ b/examples/lookahead/lookahead.cpp @@ -60,6 +60,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); + llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -95,7 +96,7 @@ int main(int argc, char ** argv) { llama_decode(ctx, llama_batch_get_one(&inp.back(), 1)); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(ctx, 0, s, -1, -1); + llama_kv_cache_seq_cp(kv, 0, s, -1, -1); } const auto t_enc_end = ggml_time_us(); @@ -437,17 +438,17 @@ int main(int argc, char ** argv) { // KV cache management // if no verification token matched, we simply remove all cells from this batch -> no fragmentation - llama_kv_cache_seq_rm(ctx, -1, n_past, -1); + llama_kv_cache_seq_rm(kv, -1, n_past, -1); if (seq_id_best != 0) { // if a verification token matched, we keep the best sequence and remove the rest // this leads to some KV cache fragmentation - llama_kv_cache_seq_keep(ctx, seq_id_best); - llama_kv_cache_seq_cp (ctx, seq_id_best, 0, -1, -1); - llama_kv_cache_seq_rm (ctx, seq_id_best, -1, -1); + llama_kv_cache_seq_keep(kv, seq_id_best); + llama_kv_cache_seq_cp (kv, seq_id_best, 0, -1, -1); + llama_kv_cache_seq_rm (kv, seq_id_best, -1, -1); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(ctx, 0, s, -1, -1); + llama_kv_cache_seq_cp(kv, 0, s, -1, -1); } } } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index dbd0444ec..8628f7318 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -35,6 +35,7 @@ int main(int argc, char ** argv){ llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); + llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -192,7 +193,7 @@ int main(int argc, char ** argv){ // KV cache management // clean the cache of draft tokens that weren't accepted - llama_kv_cache_seq_rm(ctx, 0, n_past, -1); + llama_kv_cache_seq_rm(kv, 0, n_past, -1); common_batch_clear(batch_tgt); common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index da2a03ab9..9d79af79e 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -157,6 +157,8 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); auto chat_templates = common_chat_templates_from_model(model, params.chat_template); @@ -328,7 +330,7 @@ int main(int argc, char ** argv) { } // remove any "future" tokens that we might have inherited from the previous session - llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); + llama_kv_cache_seq_rm(kv, -1, n_matching_session_tokens, -1); } LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", @@ -569,8 +571,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (kv, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_cache_seq_add(kv, 0, params.n_keep + n_discard, n_past, -n_discard); n_past -= n_discard; @@ -593,9 
+595,9 @@ int main(int argc, char ** argv) { LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); - llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); - llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); - llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); + llama_kv_cache_seq_add(kv, 0, ga_i, n_past, ib*bd); + llama_kv_cache_seq_div(kv, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); + llama_kv_cache_seq_add(kv, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); n_past -= bd; diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 7ef43d5e1..2ba0706dc 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -134,6 +134,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); + llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -201,7 +202,7 @@ int main(int argc, char ** argv) { // assign the system KV cache to all parallel sequences for (int32_t i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + llama_kv_cache_seq_cp(kv, 0, i, -1, -1); } LOG_INF("\n"); @@ -233,9 +234,9 @@ int main(int argc, char ** argv) { if (batch.n_tokens == 0) { // all sequences have ended - clear the entire KV cache for (int i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_rm(ctx, i, -1, -1); + llama_kv_cache_seq_rm(kv, i, -1, -1); // but keep the system prompt - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + llama_kv_cache_seq_cp(kv, 0, i, -1, -1); } LOG_INF("%s: clearing the KV cache\n", __func__); @@ -371,8 +372,8 @@ int main(int argc, char ** argv) { } // delete only the generated part of the sequence, i.e. 
keep the system prompt in the cache - llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1); - llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1); + llama_kv_cache_seq_rm(kv, client.id + 1, -1, -1); + llama_kv_cache_seq_cp(kv, 0, client.id + 1, -1, -1); const auto t_main_end = ggml_time_us(); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 5953928d4..e2764313b 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -86,6 +86,8 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + auto sparams = llama_sampler_chain_default_params(); llama_sampler * smpl = llama_sampler_chain_init(sparams); @@ -132,11 +134,11 @@ int main(int argc, char ** argv) { const int ib = i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - llama_kv_cache_update (ctx); + llama_kv_cache_seq_add(kv, 0, n_past - n_batch, n_past, ib*bd); + llama_kv_cache_seq_div(kv, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_update_kv_cache (ctx, kv); - n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; + n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; } common_batch_clear(batch); @@ -166,12 +168,12 @@ int main(int argc, char ** argv) { LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (ctx); - llama_kv_cache_update (ctx); + llama_kv_cache_seq_rm (kv, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(kv, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_cache_defrag (kv); + llama_update_kv_cache (ctx, kv); - n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; + n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; common_batch_clear(batch); @@ -197,12 +199,12 @@ int main(int argc, char ** argv) { if (n_discard > 0) { LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (ctx); - llama_kv_cache_update (ctx); + llama_kv_cache_seq_rm (kv, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(kv, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_cache_defrag (kv); + llama_update_kv_cache (ctx, kv); - n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; + n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; } } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 9bf6c5743..6c9f716ed 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -299,6 +299,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const bool add_bos = llama_vocab_get_add_bos(vocab); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); @@ -360,7 +362,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_batch batch = llama_batch_init(n_batch, 0, 1); @@ -450,6 +452,8 @@ static results_perplexity 
perplexity(llama_context * ctx, const common_params & const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const bool add_bos = llama_vocab_get_add_bos(vocab); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); @@ -546,7 +550,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; @@ -741,6 +745,8 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + // Calculates hellaswag score (acc_norm) from prompt // // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl @@ -923,7 +929,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { return; } - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1084,6 +1090,8 @@ static void winogrande_score(llama_context * ctx, const common_params & params) const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + constexpr int k_min_trailing_ctx = 3; auto data = load_winogrande_from_csv(params.prompt); @@ -1202,7 +1210,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) return; } - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1388,6 +1396,8 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + std::istringstream strstream(params.prompt); uint32_t n_task; strstream.read((char *)&n_task, sizeof(n_task)); @@ -1574,7 +1584,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par return; } - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1671,6 +1681,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + if (params.logits_file.empty()) { LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); return; @@ -1764,7 +1776,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { } // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_batch batch = llama_batch_init(n_batch, 0, 1); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 2439022a2..a907ea076 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -82,8 +82,10 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke } 
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { + llama_kv_cache * kv = llama_get_kv_cache(ctx); + // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 92a49eb74..8e2c174a9 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -756,7 +756,8 @@ static int apply_chat_template(const common_chat_template & tmpl, LlamaData & ll // Function to tokenize the prompt static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt, std::vector & prompt_tokens, const LlamaData & llama_data) { - const bool is_first = llama_get_kv_cache_used_cells(llama_data.context.get()) == 0; + const llama_kv_cache * kv = llama_get_kv_cache(llama_data.context.get()); + const bool is_first = llama_kv_cache_used_cells(kv) == 0; const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); prompt_tokens.resize(n_prompt_tokens); @@ -771,8 +772,10 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt // Check if we have enough space in the context to evaluate this batch static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { + llama_kv_cache * kv = llama_get_kv_cache(ctx.get()); + const int n_ctx = llama_n_ctx(ctx.get()); - const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get()); + const int n_ctx_used = llama_kv_cache_used_cells(kv); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); printe("context size exceeded\n"); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index cf7cbd815..3839fbe8c 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -156,6 +156,8 @@ int main(int argc, char ** argv) { // make new context llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params)); + llama_kv_cache * kv3 = llama_get_kv_cache(ctx3); + llama_sampler * smpl3 = llama_sampler_chain_init(sparams); llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed)); @@ -196,7 +198,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); // erase whole kv - llama_kv_cache_clear(ctx3); + llama_kv_cache_clear(kv3); fprintf(stderr, "%s : kv cache cleared\n", __func__); // restore kv into seq 1 diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b1cde2d7f..076044d39 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1693,6 +1693,7 @@ struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; + llama_kv_cache * kv = nullptr; const llama_vocab * vocab = nullptr; @@ -1755,6 +1756,8 @@ struct server_context { return false; } + kv = llama_get_kv_cache(ctx); + vocab = llama_model_get_vocab(model); n_ctx = llama_n_ctx(ctx); @@ -2023,7 +2026,7 @@ struct server_context { SRV_DBG("%s", "clearing KV cache\n"); // clear the entire KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); clean_kv_cache = false; } @@ -2565,8 +2568,8 @@ struct server_context { res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); res->t_start = metrics.t_start; - res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); - 
res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); + res->kv_cache_tokens_count = llama_kv_cache_n_tokens(kv); + res->kv_cache_used_cells = llama_kv_cache_used_cells(kv); res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; res->t_prompt_processing_total = metrics.t_prompt_processing_total; @@ -2682,7 +2685,7 @@ struct server_context { // Erase token cache const size_t n_erased = slot->cache_tokens.size(); - llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); + llama_kv_cache_seq_rm(kv, slot->id, -1, -1); slot->cache_tokens.clear(); auto res = std::make_unique(); @@ -2750,8 +2753,8 @@ struct server_context { SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); + llama_kv_cache_seq_rm (kv, slot.id, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(kv, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -2938,8 +2941,8 @@ struct server_context { const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c); - llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift); + llama_kv_cache_seq_rm (kv, slot.id, head_p, head_c); + llama_kv_cache_seq_add(kv, slot.id, head_c, -1, kv_shift); for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; @@ -2977,9 +2980,9 @@ struct server_context { } // keep only the common part - if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) { + if (!llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1)) { // could not partially delete (likely using a non-Transformer model) - llama_kv_cache_seq_rm(ctx, slot.id, -1, -1); + llama_kv_cache_seq_rm(kv, slot.id, -1, -1); // there is no common part left slot.n_past = 0; @@ -3219,7 +3222,7 @@ struct server_context { slot.cache_tokens.push_back(id); slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); - llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1); + llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1); for (size_t i = 0; i < ids.size(); ++i) { completion_token_output result; diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index c5534cc13..130e326b5 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -88,6 +88,8 @@ int main(int argc, char ** argv) { return 1; } + const llama_kv_cache * kv = llama_get_kv_cache(ctx); + // initialize the sampler llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params()); llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1)); @@ -98,7 +100,7 @@ int main(int argc, char ** argv) { auto generate = [&](const std::string & prompt) { std::string response; - const bool is_first = llama_get_kv_cache_used_cells(ctx) == 0; + const bool is_first = llama_kv_cache_used_cells(kv) == 0; // tokenize the prompt const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); @@ -113,7 +115,7 @@ int main(int argc, char ** argv) { while (true) { // check if we have enough space in the context to evaluate this batch int n_ctx = llama_n_ctx(ctx); - int n_ctx_used = llama_get_kv_cache_used_cells(ctx); + int n_ctx_used = llama_kv_cache_used_cells(kv); if (n_ctx_used + 
batch.n_tokens > n_ctx) { printf("\033[0m\n"); fprintf(stderr, "context size exceeded\n"); diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 403ba2dd2..24bdc806d 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -45,6 +45,8 @@ int main(int argc, char ** argv) { model_tgt = llama_init_tgt.model.get(); ctx_tgt = llama_init_tgt.context.get(); + llama_kv_cache * kv = llama_get_kv_cache(ctx_tgt); + const llama_vocab * vocab = llama_model_get_vocab(model_tgt); // load the draft model @@ -217,7 +219,7 @@ int main(int argc, char ** argv) { { LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past); - llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1); + llama_kv_cache_seq_rm(kv, 0, n_past, -1); } if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index c7ccea50d..b4e5259b5 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -90,6 +90,9 @@ int main(int argc, char ** argv) { model_dft = llama_init_dft.model.get(); ctx_dft = llama_init_dft.context.get(); + llama_kv_cache * kv_tgt = llama_get_kv_cache(ctx_tgt); + llama_kv_cache * kv_dft = llama_get_kv_cache(ctx_dft); + const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt); const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft); @@ -420,14 +423,14 @@ int main(int argc, char ** argv) { { LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); - llama_kv_cache_seq_keep(ctx_dft, s_keep); - llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(ctx_dft, 0); + llama_kv_cache_seq_keep(kv_dft, s_keep); + llama_kv_cache_seq_cp (kv_dft, s_keep, 0, -1, -1); + llama_kv_cache_seq_keep(kv_dft, 0); - llama_kv_cache_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1); - llama_kv_cache_seq_keep(ctx_tgt, s_keep); - llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(ctx_tgt, 0); + llama_kv_cache_seq_rm (kv_tgt, s_keep, n_past_tgt, -1); + llama_kv_cache_seq_keep(kv_tgt, s_keep); + llama_kv_cache_seq_cp (kv_tgt, s_keep, 0, -1, -1); + llama_kv_cache_seq_keep(kv_tgt, 0); } for (int s = 0; s < n_seq_dft; ++s) { @@ -444,8 +447,8 @@ int main(int argc, char ** argv) { common_batch_clear(batch_dft); common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); - llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); - // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + llama_kv_cache_seq_rm(kv_dft, 0, n_past_dft, -1); + // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(kv_dft, batch_dft).c_str()); llama_decode(ctx_dft, batch_dft); ++n_past_dft; @@ -503,8 +506,8 @@ int main(int argc, char ** argv) { if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) { LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur); - llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); - llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); + llama_kv_cache_seq_rm(kv_dft, n_seq_cur, -1, -1); + llama_kv_cache_seq_cp(kv_dft, s, n_seq_cur, -1, -1); // all previous tokens from this branch are now also part of the new branch for (int t = 0; t < batch_tgt.n_tokens; ++t) { @@ -585,9 +588,9 @@ int main(int argc, char ** argv) { // evaluate the target model on the drafted tokens { - llama_kv_cache_seq_keep(ctx_tgt, 0); + llama_kv_cache_seq_keep(kv_tgt, 0); for 
(int s = 1; s < n_seq_dft; ++s) { - llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); + llama_kv_cache_seq_cp(kv_tgt, 0, s, -1, -1); } // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); From 4d7bd03e653f24e00158ae7e819908e444a20353 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 13 Jan 2025 15:50:39 +0200 Subject: [PATCH 03/28] kv_cache : functions -> members ggml-ci --- src/llama-context.cpp | 2 +- src/llama-kv-cache.cpp | 490 ++++++++++++++++++++++++++++++++++------- src/llama-kv-cache.h | 404 +++++---------------------------- src/llama.cpp | 16 +- 4 files changed, 467 insertions(+), 445 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index bf5a77cca..0654feccb 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1169,7 +1169,7 @@ struct llama_data_read { } batch.n_seq_id[0] = 1; batch.seq_id[0] = &dest_seq_id; - if (!llama_kv_cache_find_slot(kv_self, batch)) { + if (!kv_self.find_slot(batch)) { LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); return false; } diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index b0d5a9318..8b2f6287b 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -11,41 +11,35 @@ static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; -uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) { - // the FA kernels require padding to avoid extra runtime boundary checks - return cparams.flash_attn ? 256u : 32u; -} - -bool llama_kv_cache_init( - struct llama_kv_cache & cache, - const llama_model & model, - const llama_cparams & cparams, - ggml_type type_k, - ggml_type type_v, - uint32_t kv_size, - bool offload) { +bool llama_kv_cache::init( + const llama_model & model, + const llama_cparams & cparams, + ggml_type type_k, + ggml_type type_v, + uint32_t kv_size, + bool offload) { const struct llama_hparams & hparams = model.hparams; const int32_t n_layer = hparams.n_layer; - cache.has_shift = false; + has_shift = false; - cache.recurrent = llama_model_is_recurrent(&model); - cache.v_trans = !cache.recurrent && !cparams.flash_attn; - cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA + recurrent = llama_model_is_recurrent(&model); + v_trans = !recurrent && !cparams.flash_attn; + can_shift = !recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n", - __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, cache.can_shift); + __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift); - cache.head = 0; - cache.size = kv_size; - cache.used = 0; + head = 0; + size = kv_size; + used = 0; - cache.type_k = type_k; - cache.type_v = type_v; + type_k = type_k; + type_v = type_v; - cache.cells.clear(); - cache.cells.resize(kv_size); + cells.clear(); + cells.resize(kv_size); // create a context for each buffer type std::map ctx_map; @@ -57,19 +51,23 @@ bool llama_kv_cache_init( /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; + ggml_context * ctx = ggml_init(params); if (!ctx) { return nullptr; } + ctx_map[buft] = ctx; - cache.ctxs.emplace_back(ctx); + ctxs.emplace_back(ctx); + return ctx; } + return it->second; }; - cache.k_l.reserve(n_layer); - cache.v_l.reserve(n_layer); + k_l.reserve(n_layer); + v_l.reserve(n_layer); for (int i = 0; i < n_layer; i++) { const uint32_t 
n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); @@ -95,8 +93,8 @@ bool llama_kv_cache_init( ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); - cache.k_l.push_back(k); - cache.v_l.push_back(v); + k_l.push_back(k); + v_l.push_back(v); } // allocate tensors and initialize the buffers to avoid NaNs in the padding @@ -111,20 +109,339 @@ bool llama_kv_cache_init( } ggml_backend_buffer_clear(buf, 0); LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); - cache.bufs.emplace_back(buf); + bufs.emplace_back(buf); } return true; } -struct llama_kv_cache_slot_info llama_kv_cache_find_slot( - struct llama_kv_cache & cache, +int32_t llama_kv_cache::n_tokens() const { + int32_t result = 0; + + for (uint32_t i = 0; i < size; i++) { + result += cells[i].seq_id.size(); + } + + return result; +} + +size_t llama_kv_cache::total_size() const { + size_t size = 0; + for (const auto & buf : bufs) { + size += ggml_backend_buffer_get_size(buf.get()); + } + + return size; +} + +// TODO: better data structures to reduce the cost of this operation +llama_pos llama_kv_cache::max_pos() const { + llama_pos max_pos = -1; + for (const auto & cell : cells) { + max_pos = std::max(max_pos, cell.pos); + } + + return max_pos; +} + +void llama_kv_cache::clear() { + for (int32_t i = 0; i < (int32_t) size; ++i) { + cells[i].pos = -1; + cells[i].seq_id.clear(); + cells[i].src = -1; + cells[i].tail = -1; + } + head = 0; + used = 0; + + for (auto & buf : bufs) { + ggml_backend_buffer_clear(buf.get(), 0); + } +} + +bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // models like Mamba or RWKV can't have a state partially erased + if (recurrent) { + if (seq_id >= (int64_t) size) { + // could be fatal + return false; + } + if (0 <= seq_id) { + int32_t & tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + const llama_kv_cell & cell = cells[tail_id]; + // partial intersection is invalid + if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { + return false; + } + // invalidate tails which will be cleared + if (p0 <= cell.pos && cell.pos < p1) { + tail_id = -1; + } + } + } else { + // seq_id is negative, then the range should include everything or nothing + if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { + return false; + } + } + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].pos >= p0 && cells[i].pos < p1) { + if (seq_id < 0) { + cells[i].seq_id.clear(); + } else if (cells[i].has_seq_id(seq_id)) { + cells[i].seq_id.erase(seq_id); + } else { + continue; + } + if (cells[i].is_empty()) { + // keep count of the number of used cells + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. 
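        // (note: head only ever moves backwards here - advancing it past its current
        //  position could skip over empty cells that lie between the current head and
        //  the freed slot)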
+ if (new_head != size && new_head < head) { + head = new_head; + } + + return true; +} + +void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (seq_id_src == seq_id_dst) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + if (recurrent) { + if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { + llama_kv_cell & tail_src = cells[seq_id_src]; + llama_kv_cell & tail_dst = cells[seq_id_dst]; + if (tail_dst.tail >= 0) { + // clear destination seq_id if it wasn't empty + llama_kv_cell & cell_dst = cells[tail_dst.tail]; + + cell_dst.seq_id.erase(seq_id_dst); + tail_dst.tail = -1; + if (cell_dst.seq_id.empty()) { + cell_dst.pos = -1; + cell_dst.delta = -1; + cell_dst.src = -1; + used -= 1; + } + } + if (tail_src.tail >= 0) { + llama_kv_cell & cell_src = cells[tail_src.tail]; + + cell_src.seq_id.insert(seq_id_dst); + tail_dst.tail = tail_src.tail; + } + } + + return; + } + + // otherwise, this is the KV of a Transformer-like model + head = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) { + cells[i].seq_id.insert(seq_id_dst); + } + } +} + +void llama_kv_cache::seq_keep(llama_seq_id seq_id) { + uint32_t new_head = size; + + for (uint32_t i = 0; i < size; ++i) { + if (recurrent && (llama_seq_id) i != seq_id) { + cells[i].tail = -1; + } + + if (!cells[i].has_seq_id(seq_id)) { + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + cells[i].seq_id.clear(); + + if (new_head == size){ + new_head = i; + } + } else { + cells[i].seq_id.clear(); + cells[i].seq_id.insert(seq_id); + } + } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != size && new_head < head) { + head = new_head; + } +} + +void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + if (delta == 0) { + return; + } + + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the + if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be shifted + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos += delta; + } + } + } + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + cells[i].pos += delta; + cells[i].delta += delta; + + if (cells[i].pos < 0) { + if (!cells[i].is_empty()) { + used--; + } + cells[i].pos = -1; + cells[i].seq_id.clear(); + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. + // Otherwise we just start the next search from the beginning. + head = new_head != size ? new_head : 0; +} + +void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + if (d == 1) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the cache. 
+ if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be changed + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos /= d; + } + } + } + + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + + { + llama_pos p_old = cells[i].pos; + cells[i].pos /= d; + cells[i].delta += cells[i].pos - p_old; + } + } + } +} + +llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) { + llama_pos result = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id)) { + result = std::max(result, cells[i].pos); + } + } + + return result; +} + +void llama_kv_cache::defrag() { + if (!recurrent) { + do_defrag = true; + } +} + +struct llama_kv_cache_slot_info llama_kv_cache::find_slot( const struct llama_ubatch & ubatch) { const uint32_t n_tokens = ubatch.n_tokens; const uint32_t n_seqs = ubatch.n_seqs; const uint32_t n_seq_tokens = ubatch.n_seq_tokens; - if (cache.recurrent) { + if (recurrent) { // For recurrent state architectures (like Mamba or RWKV), // each cache cell can store the state for a whole sequence. // A slot should be always be contiguous. @@ -132,7 +449,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // can only process batches with an equal number of new tokens in each sequence GGML_ASSERT(ubatch.equal_seqs); - int32_t min = cache.size - 1; + int32_t min = size - 1; int32_t max = 0; // everything should fit if all seq_ids are smaller than the max @@ -141,16 +458,16 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( for (uint32_t j = 0; j < n_seq_id; ++j) { const llama_seq_id seq_id = ubatch.seq_id[s][j]; - if (seq_id < 0 || (uint32_t) seq_id >= cache.size) { + if (seq_id < 0 || (uint32_t) seq_id >= size) { // too big seq_id // TODO: would it be possible to resize the cache instead? 
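                    // (for recurrent caches the seq_id doubles as the index of its state cell,
                    //  so a seq_id >= size simply cannot be stored and the slot search must fail)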
- LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size); + LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size); return llama_kv_cache_slot_info_failed; } if (j > 0) { - llama_kv_cell & seq = cache.cells[seq_id]; + llama_kv_cell & seq = cells[seq_id]; if (seq.tail >= 0) { - llama_kv_cell & cell = cache.cells[seq.tail]; + llama_kv_cell & cell = cells[seq.tail]; // clear cells from seq_ids that become shared // (should not normally happen, but let's handle it anyway) cell.seq_id.erase(seq_id); @@ -158,7 +475,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( if (cell.seq_id.empty()) { cell.pos = -1; cell.src = -1; - cache.used -= 1; + used -= 1; } } } @@ -168,9 +485,9 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( #ifndef NDEBUG { std::vector tails_verif; - tails_verif.assign(cache.size, -1); - for (uint32_t i = 0; i < cache.size; ++i) { - llama_kv_cell & cell = cache.cells[i]; + tails_verif.assign(size, -1); + for (uint32_t i = 0; i < size; ++i) { + llama_kv_cell & cell = cells[i]; for (llama_seq_id seq_id : cell.seq_id) { if (tails_verif[seq_id] != -1) { LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]); @@ -178,20 +495,20 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( tails_verif[seq_id] = i; } } - for (uint32_t i = 0; i < cache.size; ++i) { - if (tails_verif[i] != cache.cells[i].tail) { - LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cache.cells[i].tail, tails_verif[i]); + for (uint32_t i = 0; i < size; ++i) { + if (tails_verif[i] != cells[i].tail) { + LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]); } } } #endif // find next empty cell - uint32_t next_empty_cell = cache.head; + uint32_t next_empty_cell = head; - for (uint32_t i = 0; i < cache.size; ++i) { - if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; } - llama_kv_cell & cell = cache.cells[next_empty_cell]; + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + llama_kv_cell & cell = cells[next_empty_cell]; if (cell.is_empty()) { break; } next_empty_cell += 1; } @@ -199,20 +516,20 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // find usable cell range for (uint32_t s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch.seq_id[s][0]; - llama_kv_cell & seq_meta = cache.cells[seq_id]; + llama_kv_cell & seq_meta = cells[seq_id]; bool has_cell = false; if (seq_meta.tail >= 0) { - llama_kv_cell & cell = cache.cells[seq_meta.tail]; + llama_kv_cell & cell = cells[seq_meta.tail]; GGML_ASSERT(cell.has_seq_id(seq_id)); // does this seq_id "own" the cell? 
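                // (a cell is "owned" when this is the only sequence using it; a shared cell is
                //  not reused in place - its state is first copied into an empty cell below)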
if (cell.seq_id.size() == 1) { has_cell = true; } } if (!has_cell) { - llama_kv_cell & empty_cell = cache.cells[next_empty_cell]; + llama_kv_cell & empty_cell = cells[next_empty_cell]; GGML_ASSERT(empty_cell.is_empty()); // copy old tail into the empty cell if (seq_meta.tail >= 0) { - llama_kv_cell & orig_cell = cache.cells[seq_meta.tail]; + llama_kv_cell & orig_cell = cells[seq_meta.tail]; empty_cell.pos = orig_cell.pos; empty_cell.src = orig_cell.src; orig_cell.seq_id.erase(seq_id); @@ -222,9 +539,9 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // find next empty cell if (s + 1 < n_seqs) { next_empty_cell += 1; - for (uint32_t i = 0; i < cache.size; ++i) { - if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; } - llama_kv_cell & cell = cache.cells[next_empty_cell]; + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + llama_kv_cell & cell = cells[next_empty_cell]; if (cell.is_empty()) { break; } next_empty_cell += 1; } @@ -237,10 +554,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // gather and re-order for (uint32_t s = 0; s < n_seqs; ++s) { int32_t dst_id = s + min; - int32_t src_id = cache.cells[ubatch.seq_id[s][0]].tail; + int32_t src_id = cells[ubatch.seq_id[s][0]].tail; if (dst_id != src_id) { - llama_kv_cell & dst_cell = cache.cells[dst_id]; - llama_kv_cell & src_cell = cache.cells[src_id]; + llama_kv_cell & dst_cell = cells[dst_id]; + llama_kv_cell & src_cell = cells[src_id]; std::swap(dst_cell.pos, src_cell.pos); std::swap(dst_cell.src, src_cell.src); @@ -248,10 +565,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // swap tails (assuming they NEVER overlap) for (const llama_seq_id seq_id : src_cell.seq_id) { - cache.cells[seq_id].tail = src_id; + cells[seq_id].tail = src_id; } for (const llama_seq_id seq_id : dst_cell.seq_id) { - cache.cells[seq_id].tail = dst_id; + cells[seq_id].tail = dst_id; } } } @@ -260,7 +577,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( for (uint32_t s = 0; s < n_seqs; ++s) { const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1]; int32_t cell_id = s + min; - llama_kv_cell & cell = cache.cells[cell_id]; + llama_kv_cell & cell = cells[cell_id]; if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) { // What should happen when the pos backtracks or skips a value? @@ -273,41 +590,41 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) { const llama_seq_id seq_id = ubatch.seq_id[s][j]; cell.seq_id.insert(seq_id); - cache.cells[seq_id].tail = cell_id; + cells[seq_id].tail = cell_id; } } // allow getting the range of used cells, from head to head + n - cache.head = min; - cache.n = max - min + 1; - cache.used = std::count_if(cache.cells.begin(), cache.cells.end(), + head = min; + n = max - min + 1; + used = std::count_if(cells.begin(), cells.end(), [](const llama_kv_cell& cell){ return !cell.is_empty(); }); // sanity check - return llama_kv_cache_slot_info(cache.n >= n_seqs); + return llama_kv_cache_slot_info(n >= n_seqs); } // otherwise, one cell per token. 
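    // (unlike the recurrent case above, the loop below searches for a contiguous run of
    //  n_tokens empty cells, scanning from `head`, wrapping around once past the end and
    //  giving up after every cell has been tested)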
- if (n_tokens > cache.size) { - LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size); + if (n_tokens > size) { + LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size); return llama_kv_cache_slot_info_failed; } uint32_t n_tested = 0; while (true) { - if (cache.head + n_tokens > cache.size) { - n_tested += cache.size - cache.head; - cache.head = 0; + if (head + n_tokens > size) { + n_tested += size - head; + head = 0; continue; } bool found = true; for (uint32_t i = 0; i < n_tokens; i++) { - if (cache.cells[cache.head + i].pos >= 0) { + if (cells[head + i].pos >= 0) { found = false; - cache.head += i + 1; - n_tested += i + 1; + head += i + 1; + n_tested += i + 1; break; } } @@ -316,7 +633,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( break; } - if (n_tested >= cache.size) { + if (n_tested >= size) { //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); return llama_kv_cache_slot_info_failed; } @@ -325,22 +642,27 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( for (uint32_t s = 0; s < n_seqs; s++) { for (uint32_t i = 0; i < n_seq_tokens; ++i) { uint32_t k = s*n_seq_tokens + i; - cache.cells[cache.head + k].pos = ubatch.pos[k]; + cells[head + k].pos = ubatch.pos[k]; for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) { - cache.cells[cache.head + k].seq_id.insert(ubatch.seq_id[s][j]); + cells[head + k].seq_id.insert(ubatch.seq_id[s][j]); } } } - cache.used += n_tokens; + used += n_tokens; - return llama_kv_cache_slot_info(cache.head, cache.head + n_tokens); + return llama_kv_cache_slot_info(head, head + n_tokens); } -uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { - for (uint32_t i = cache.size; i > 0; --i) { - const llama_kv_cell & cell = cache.cells[i - 1]; +uint32_t llama_kv_cache::get_padding(const llama_cparams & cparams) const { + // the FA kernels require padding to avoid extra runtime boundary checks + return cparams.flash_attn ? 
256u : 32u; +} + +uint32_t llama_kv_cache::cell_max() const { + for (uint32_t i = size; i > 0; --i) { + const llama_kv_cell & cell = cells[i - 1]; if (cell.pos >= 0 && !cell.is_empty()) { return i; diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index b0bb1cfb1..4ee3418d8 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -7,6 +7,9 @@ #include #include +struct llama_cparams; +struct llama_ubatch; + struct llama_kv_cell { llama_pos pos = -1; llama_pos delta = 0; @@ -28,7 +31,19 @@ struct llama_kv_cell { } }; +// a structure holds information about the slot found in llama_kv_cache_find_slot +struct llama_kv_cache_slot_info { + std::pair boundaries; // slot boundaries [begin, end) + bool found = false; // the slot was found + + explicit llama_kv_cache_slot_info(bool found_) : found{found_} {} + llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {} + + operator bool() const { return found; } +}; + // ring-buffer of cached KV data +// TODO: pimpl struct llama_kv_cache { bool has_shift = false; bool do_defrag = false; @@ -57,343 +72,8 @@ struct llama_kv_cache { std::vector ctxs; std::vector bufs; - int32_t n_tokens() const { - int32_t result = 0; - - for (uint32_t i = 0; i < size; i++) { - result += cells[i].seq_id.size(); - } - - return result; - } - - size_t total_size() const { - size_t size = 0; - for (const auto & buf : bufs) { - size += ggml_backend_buffer_get_size(buf.get()); - } - - return size; - } - - // TODO: better data structures to reduce the cost of this operation - llama_pos max_pos() const { - llama_pos max_pos = -1; - for (const auto & cell : cells) { - max_pos = std::max(max_pos, cell.pos); - } - - return max_pos; - } - - void clear() { - for (int32_t i = 0; i < (int32_t) size; ++i) { - cells[i].pos = -1; - cells[i].seq_id.clear(); - cells[i].src = -1; - cells[i].tail = -1; - } - head = 0; - used = 0; - - for (auto & buf : bufs) { - ggml_backend_buffer_clear(buf.get(), 0); - } - } - - bool seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - uint32_t new_head = size; - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - // models like Mamba or RWKV can't have a state partially erased - if (recurrent) { - if (seq_id >= (int64_t) size) { - // could be fatal - return false; - } - if (0 <= seq_id) { - int32_t & tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - const llama_kv_cell & cell = cells[tail_id]; - // partial intersection is invalid - if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { - return false; - } - // invalidate tails which will be cleared - if (p0 <= cell.pos && cell.pos < p1) { - tail_id = -1; - } - } - } else { - // seq_id is negative, then the range should include everything or nothing - if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { - return false; - } - } - } - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].pos >= p0 && cells[i].pos < p1) { - if (seq_id < 0) { - cells[i].seq_id.clear(); - } else if (cells[i].has_seq_id(seq_id)) { - cells[i].seq_id.erase(seq_id); - } else { - continue; - } - if (cells[i].is_empty()) { - // keep count of the number of used cells - if (cells[i].pos >= 0) { - used--; - } - - cells[i].pos = -1; - cells[i].src = -1; - - if (new_head == size) { - new_head = i; - } - } - } - } - - // If we freed up a slot, set head to it so searching can start there. 
- if (new_head != size && new_head < head) { - head = new_head; - } - - return true; - } - - void seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { - if (seq_id_src == seq_id_dst) { - return; - } - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - if (recurrent) { - if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { - llama_kv_cell & tail_src = cells[seq_id_src]; - llama_kv_cell & tail_dst = cells[seq_id_dst]; - if (tail_dst.tail >= 0) { - // clear destination seq_id if it wasn't empty - llama_kv_cell & cell_dst = cells[tail_dst.tail]; - - cell_dst.seq_id.erase(seq_id_dst); - tail_dst.tail = -1; - if (cell_dst.seq_id.empty()) { - cell_dst.pos = -1; - cell_dst.delta = -1; - cell_dst.src = -1; - used -= 1; - } - } - if (tail_src.tail >= 0) { - llama_kv_cell & cell_src = cells[tail_src.tail]; - - cell_src.seq_id.insert(seq_id_dst); - tail_dst.tail = tail_src.tail; - } - } - - return; - } - - // otherwise, this is the KV of a Transformer-like model - head = 0; - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) { - cells[i].seq_id.insert(seq_id_dst); - } - } - } - - void seq_keep(llama_seq_id seq_id) { - uint32_t new_head = size; - - for (uint32_t i = 0; i < size; ++i) { - if (recurrent && (llama_seq_id) i != seq_id) { - cells[i].tail = -1; - } - - if (!cells[i].has_seq_id(seq_id)) { - if (cells[i].pos >= 0) { - used--; - } - - cells[i].pos = -1; - cells[i].src = -1; - cells[i].seq_id.clear(); - - if (new_head == size){ - new_head = i; - } - } else { - cells[i].seq_id.clear(); - cells[i].seq_id.insert(seq_id); - } - } - - // If we freed up a slot, set head to it so searching can start there. - if (new_head != size && new_head < head) { - head = new_head; - } - } - - void seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { - if (delta == 0) { - return; - } - - uint32_t new_head = size; - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - // If there is no range then return early to avoid looping over the - if (p0 == p1) { - return; - } - - if (recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be shifted - if (0 <= seq_id && seq_id < (int64_t) size) { - const int32_t tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos += delta; - } - } - } - return; - } - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { - has_shift = true; - cells[i].pos += delta; - cells[i].delta += delta; - - if (cells[i].pos < 0) { - if (!cells[i].is_empty()) { - used--; - } - cells[i].pos = -1; - cells[i].seq_id.clear(); - if (new_head == size) { - new_head = i; - } - } - } - } - - // If we freed up a slot, set head to it so searching can start there. - // Otherwise we just start the next search from the beginning. - head = new_head != size ? new_head : 0; - } - - void seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { - if (d == 1) { - return; - } - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - // If there is no range then return early to avoid looping over the cache. 
- if (p0 == p1) { - return; - } - - if (recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be changed - if (0 <= seq_id && seq_id < (int64_t) size) { - const int32_t tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos /= d; - } - } - } - - return; - } - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { - has_shift = true; - - { - llama_pos p_old = cells[i].pos; - cells[i].pos /= d; - cells[i].delta += cells[i].pos - p_old; - } - } - } - } - - llama_pos seq_pos_max(llama_seq_id seq_id) { - llama_pos result = 0; - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id)) { - result = std::max(result, cells[i].pos); - } - } - - return result; - } - - void defrag() { - if (!recurrent) { - do_defrag = true; - } - } -}; - -// a structure holds information about the slot found in llama_kv_cache_find_slot -struct llama_kv_cache_slot_info { - std::pair boundaries; // slot boundaries [begin, end) - bool found = false; // the slot was found - - explicit llama_kv_cache_slot_info(bool found_) : found{found_} {} - llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {} - - operator bool() const { return found; } -}; - -// TODO: maybe not needed -uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams); - -bool llama_kv_cache_init( - struct llama_kv_cache & cache, + // TODO: become constructor + bool init( const llama_model & model, const llama_cparams & cparams, ggml_type type_k, @@ -401,25 +81,38 @@ bool llama_kv_cache_init( uint32_t kv_size, bool offload); -// find an empty slot of size "n_tokens" in the cache -// updates the cache head -// returns a structure holding information about the slot found -// Note: On success, it's important that cache.head points -// to the first cell of the slot. -struct llama_kv_cache_slot_info llama_kv_cache_find_slot( - struct llama_kv_cache & cache, - const struct llama_ubatch & batch); + int32_t n_tokens() const; -// find how many cells are currently in use -uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache); + size_t total_size() const; -// -// kv cache view -// + // TODO: better data structures to reduce the cost of this operation + llama_pos max_pos() const; -struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max); + void clear(); -void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv); + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1); + void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); + void seq_keep(llama_seq_id seq_id); + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); + void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); + + llama_pos seq_pos_max(llama_seq_id seq_id); + + void defrag(); + + // find an empty slot of size "n_tokens" in the cache + // updates the cache head + // returns a structure holding information about the slot found + // Note: On success, it's important that cache.head points + // to the first cell of the slot. 
+ llama_kv_cache_slot_info find_slot(const llama_ubatch & batch); + + // TODO: maybe not needed + uint32_t get_padding(const llama_cparams & cparams) const; + + // find how many cells are currently in use + uint32_t cell_max() const; +}; // // kv cache restore @@ -472,3 +165,10 @@ struct llama_kv_slot_restorer { } }; +// +// kv cache view +// + +struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max); + +void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv); diff --git a/src/llama.cpp b/src/llama.cpp index 87dd512b2..d8427af9d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8572,18 +8572,18 @@ static int llama_decode_impl( kv_self.head = 0; } - const auto slot = llama_kv_cache_find_slot(kv_self, ubatch); - if (!slot) { + const auto slot_info = kv_self.find_slot(ubatch); + if (!slot_info) { return 1; } - kv_slot_restorer.save(slot); + kv_slot_restorer.save(slot_info); if (!kv_self.recurrent) { // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = llama_kv_cache_get_padding(cparams); - kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad))); + const uint32_t pad = kv_self.get_padding(cparams); + kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); //kv_self.n = llama_kv_cache_cell_max(kv_self); } } @@ -8969,7 +8969,7 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { const uint32_t n_layer = hparams.n_layer; - const uint32_t n_kv = llama_kv_cache_cell_max(kv_self); + const uint32_t n_kv = kv_self.cell_max(); const uint32_t n_used = kv_self.used; assert(n_used <= n_kv); @@ -9550,7 +9550,7 @@ struct llama_context * llama_init_from_model( cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; // this is necessary due to kv_self.n being padded later during inference - cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams)); + cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->kv_self.get_padding(cparams)); // with causal attention, the batch size is limited by the context size cparams.n_batch = hparams.causal_attn ? 
std::min(cparams.n_ctx, params.n_batch) : params.n_batch; @@ -9692,7 +9692,7 @@ struct llama_context * llama_init_from_model( llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data); - if (!llama_kv_cache_init(ctx->kv_self, ctx->model, ctx->cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { + if (!ctx->kv_self.init(ctx->model, ctx->cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; From fef90cb3d7a823bd00a7899b52ffc70a4f824d44 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 13 Jan 2025 15:58:20 +0200 Subject: [PATCH 04/28] kv_cache : fix ggml-ci --- src/llama-kv-cache.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 8b2f6287b..fe5986768 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -35,8 +35,8 @@ bool llama_kv_cache::init( size = kv_size; used = 0; - type_k = type_k; - type_v = type_v; + this->type_k = type_k; + this->type_v = type_v; cells.clear(); cells.resize(kv_size); From 73a14eccc9f200d6012963af9448042dfeac54fc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 14 Jan 2025 11:56:53 +0200 Subject: [PATCH 05/28] kv_cache : minor --- src/llama-kv-cache.cpp | 38 +++++++++++++++++++++++++++++++------- src/llama-kv-cache.h | 18 +++++++++++------- src/llama.cpp | 18 +++++------------- 3 files changed, 47 insertions(+), 27 deletions(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index fe5986768..9f3b4e514 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -73,17 +73,22 @@ bool llama_kv_cache::init( const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); - LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa); + const char * dev_name = "CPU"; ggml_backend_buffer_type_t buft; if (offload) { auto * dev = model.dev_layer(i); buft = ggml_backend_dev_buffer_type(dev); + + dev_name = ggml_backend_dev_name(dev); } else { buft = ggml_backend_cpu_buffer_type(); } - ggml_context * ctx = ctx_for_buft(buft); + LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__, + i, n_embd_k_gqa, n_embd_v_gqa, dev_name); + + ggml_context * ctx = ctx_for_buft(buft); if (!ctx) { LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__); return false; @@ -134,14 +139,13 @@ size_t llama_kv_cache::total_size() const { return size; } -// TODO: better data structures to reduce the cost of this operation -llama_pos llama_kv_cache::max_pos() const { - llama_pos max_pos = -1; +llama_pos llama_kv_cache::pos_max() const { + llama_pos pos_max = -1; for (const auto & cell : cells) { - max_pos = std::max(max_pos, cell.pos); + pos_max = std::max(pos_max, cell.pos); } - return max_pos; + return pos_max; } void llama_kv_cache::clear() { @@ -672,6 +676,26 @@ uint32_t llama_kv_cache::cell_max() const { return 0; } +size_t llama_kv_cache::size_k_bytes() const { + size_t size_k_bytes = 0; + + for (const auto & k : k_l) { + size_k_bytes += ggml_nbytes(k); + } + + return size_k_bytes; +} + +size_t llama_kv_cache::size_v_bytes() const { + size_t size_v_bytes = 0; + + for (const auto & v : v_l) { + size_v_bytes += ggml_nbytes(v); + } + + return size_v_bytes; +} + void llama_kv_cache_clear(llama_kv_cache * kv) { 
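    // the public llama_kv_cache_* C functions remain thin wrappers that forward to the
    // member functions, as done here for clear(); callers keep using the C API, e.g.:
    //
    //   llama_kv_cache * kv = llama_get_kv_cache(ctx);
    //   llama_kv_cache_clear(kv); // forwards to kv->clear()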
kv->clear(); } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 4ee3418d8..97285481e 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -61,17 +61,11 @@ struct llama_kv_cache { // computed before each graph build uint32_t n = 0; - ggml_type type_k = GGML_TYPE_F16; - ggml_type type_v = GGML_TYPE_F16; - std::vector cells; std::vector k_l; // per layer std::vector v_l; - std::vector ctxs; - std::vector bufs; - // TODO: become constructor bool init( const llama_model & model, @@ -86,7 +80,7 @@ struct llama_kv_cache { size_t total_size() const; // TODO: better data structures to reduce the cost of this operation - llama_pos max_pos() const; + llama_pos pos_max() const; void clear(); @@ -112,6 +106,16 @@ struct llama_kv_cache { // find how many cells are currently in use uint32_t cell_max() const; + + size_t size_k_bytes() const; + size_t size_v_bytes() const; + +private: + ggml_type type_k = GGML_TYPE_F16; + ggml_type type_v = GGML_TYPE_F16; + + std::vector ctxs; + std::vector bufs; }; // diff --git a/src/llama.cpp b/src/llama.cpp index d8427af9d..0227ba6b3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1973,7 +1973,7 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -8456,7 +8456,7 @@ static int llama_decode_impl( } // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const uint32_t n_tokens_all = batch.n_tokens; @@ -8792,7 +8792,7 @@ static int llama_encode_impl( } // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : lctx.kv_self.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const uint32_t n_tokens = batch.n_tokens; @@ -9699,16 +9699,8 @@ struct llama_context * llama_init_from_model( } { - size_t memory_size_k = 0; - size_t memory_size_v = 0; - - for (auto & k : ctx->kv_self.k_l) { - memory_size_k += ggml_nbytes(k); - } - - for (auto & v : ctx->kv_self.v_l) { - memory_size_v += ggml_nbytes(v); - } + const size_t memory_size_k = ctx->kv_self.size_k_bytes(); + const size_t memory_size_v = ctx->kv_self.size_v_bytes(); LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), From 4cd1b6fa4cc4e8da927caac5c61b9fcd096a1ace Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 14 Jan 2025 12:33:13 +0200 Subject: [PATCH 06/28] context : prepare kv_cache_read/write to be moved to kv_cache ggml-ci --- src/llama-context.cpp | 153 +++++++++++++++++++++--------------------- src/llama-kv-cache.h | 1 + 2 files changed, 76 insertions(+), 78 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 0654feccb..8fc6de2f2 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -928,11 +928,8 @@ struct llama_data_write { } } - void write_kv_cache_data(const struct llama_context * ctx, const std::vector> & cell_ranges) { - const struct llama_kv_cache & kv_self = ctx->kv_self; - const struct llama_hparams & hparams = ctx->model.hparams; - - const uint32_t v_trans = kv_self.v_trans ? 1 : 0; + void write_kv_cache_data(const llama_kv_cache & kv, const llama_hparams & hparams, const std::vector> & cell_ranges) { + const uint32_t v_trans = kv.v_trans ? 1 : 0; const uint32_t n_layer = hparams.n_layer; write(&v_trans, sizeof(v_trans)); @@ -946,52 +943,52 @@ struct llama_data_write { const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); // Write key type - const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; + const int32_t k_type_i = (int32_t)kv.k_l[il]->type; write(&k_type_i, sizeof(k_type_i)); // Write row size of key - const uint64_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + const uint64_t k_size_row = ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa); write(&k_size_row, sizeof(k_size_row)); // Read each range of cells of k_size length each into tmp_buf and write out for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * k_size_row; - write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size); + write_tensor_data(kv.k_l[il], range.first * k_size_row, buf_size); } } - if (!kv_self.v_trans) { + if (!kv.v_trans) { for (uint32_t il = 0; il < n_layer; ++il) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); // Write value type - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + const int32_t v_type_i = (int32_t)kv.v_l[il]->type; write(&v_type_i, sizeof(v_type_i)); // Write row size of value - const uint64_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); + const uint64_t v_size_row = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa); write(&v_size_row, sizeof(v_size_row)); // Read each range of cells of v_size length each into tmp_buf and write out for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * v_size_row; - write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size); + 
write_tensor_data(kv.v_l[il], range.first * v_size_row, buf_size); } } } else { // When v is transposed, we also need the element size and get the element ranges from each row - const uint32_t kv_size = kv_self.size; + const uint32_t kv_size = kv.size; for (uint32_t il = 0; il < n_layer; ++il) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); // Write value type - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + const int32_t v_type_i = (int32_t)kv.v_l[il]->type; write(&v_type_i, sizeof(v_type_i)); // Write element size - const uint32_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + const uint32_t v_size_el = ggml_type_size(kv.v_l[il]->type); write(&v_size_el, sizeof(v_size_el)); // Write GQA embedding size @@ -1004,37 +1001,36 @@ struct llama_data_write { const size_t range_size = range.second - range.first; const size_t src_offset = (range.first + j * kv_size) * v_size_el; const size_t buf_size = range_size * v_size_el; - write_tensor_data(kv_self.v_l[il], src_offset, buf_size); + write_tensor_data(kv.v_l[il], src_offset, buf_size); } } } } } - void write_kv_cache(const struct llama_context * ctx, llama_seq_id seq_id = -1) { - const struct llama_kv_cache & kv_self = ctx->kv_self; + void write_kv_cache(const llama_kv_cache & kv, const llama_hparams & hparams, llama_seq_id seq_id = -1) { std::vector> cell_ranges; // ranges, from inclusive, to exclusive uint32_t cell_count = 0; // Count the number of cells with the specified seq_id // Find all the ranges of cells with this seq id (or all, when -1) - uint32_t cell_range_begin = kv_self.size; - for (uint32_t i = 0; i < kv_self.size; ++i) { - const auto & cell = kv_self.cells[i]; + uint32_t cell_range_begin = kv.size; + for (uint32_t i = 0; i < kv.size; ++i) { + const auto & cell = kv.cells[i]; if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { ++cell_count; - if (cell_range_begin == kv_self.size) { + if (cell_range_begin == kv.size) { cell_range_begin = i; } } else { - if (cell_range_begin != kv_self.size) { + if (cell_range_begin != kv.size) { cell_ranges.emplace_back(cell_range_begin, i); - cell_range_begin = kv_self.size; + cell_range_begin = kv.size; } } } - if (cell_range_begin != kv_self.size) { - cell_ranges.emplace_back(cell_range_begin, kv_self.size); + if (cell_range_begin != kv.size) { + cell_ranges.emplace_back(cell_range_begin, kv.size); } // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count @@ -1046,8 +1042,8 @@ struct llama_data_write { write(&cell_count, sizeof(cell_count)); - write_kv_cache_meta(kv_self, cell_ranges, seq_id); - write_kv_cache_data(ctx, cell_ranges); + write_kv_cache_meta(kv, cell_ranges, seq_id); + write_kv_cache_data(kv, hparams, cell_ranges); } }; @@ -1140,15 +1136,15 @@ struct llama_data_read { } } - bool read_kv_cache_meta(struct llama_context * ctx, uint32_t cell_count, llama_seq_id dest_seq_id = -1) { - struct llama_kv_cache & kv_self = ctx->kv_self; - + bool read_kv_cache_meta(llama_kv_cache & kv, uint32_t cell_count, llama_seq_id dest_seq_id = -1) { if (dest_seq_id != -1) { // single sequence - kv_self.seq_rm(dest_seq_id, -1, -1); + kv.seq_rm(dest_seq_id, -1, -1); + + llama_sbatch sbatch; + llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); - llama_ubatch batch = ctx->sbatch.reserve_ubatch(cell_count, /* has_embd */ false); batch.n_tokens = cell_count; batch.n_seq_tokens = cell_count; batch.n_seqs = 1; @@ -1157,7 +1153,7 @@ struct llama_data_read { llama_pos pos; uint32_t n_seq_id; - 
read_to(&pos, sizeof(pos)); + read_to(&pos, sizeof(pos)); read_to(&n_seq_id, sizeof(n_seq_id)); if (n_seq_id != 0) { @@ -1169,30 +1165,30 @@ struct llama_data_read { } batch.n_seq_id[0] = 1; batch.seq_id[0] = &dest_seq_id; - if (!kv_self.find_slot(batch)) { + if (!kv.find_slot(batch)) { LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); return false; } - // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) // Assume that this is one contiguous block of cells - GGML_ASSERT(kv_self.head + cell_count <= kv_self.size); - GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]); - GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]); - GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id)); - GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id)); + GGML_ASSERT(kv.head + cell_count <= kv.size); + GGML_ASSERT(kv.cells[kv.head].pos == batch.pos[0]); + GGML_ASSERT(kv.cells[kv.head + cell_count - 1].pos == batch.pos[cell_count - 1]); + GGML_ASSERT(kv.cells[kv.head].has_seq_id(dest_seq_id)); + GGML_ASSERT(kv.cells[kv.head + cell_count - 1].has_seq_id(dest_seq_id)); } else { // whole KV cache restore - if (cell_count > kv_self.size) { + if (cell_count > kv.size) { LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); return false; } - kv_self.clear(); + kv.clear(); for (uint32_t i = 0; i < cell_count; ++i) { - llama_kv_cell & cell = kv_self.cells[i]; + llama_kv_cell & cell = kv.cells[i]; llama_pos pos; uint32_t n_seq_id; @@ -1206,15 +1202,18 @@ struct llama_data_read { llama_seq_id seq_id; read_to(&seq_id, sizeof(seq_id)); - if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { - LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + // TODO: llama_kv_cache should have a notion of max sequences + //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { + if (seq_id < 0) { + //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); return false; } cell.seq_id.insert(seq_id); - if (kv_self.recurrent) { - int32_t & tail = kv_self.cells[seq_id].tail; + if (kv.recurrent) { + int32_t & tail = kv.cells[seq_id].tail; if (tail != -1) { LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); return false; @@ -1224,24 +1223,22 @@ struct llama_data_read { } } - kv_self.head = 0; - kv_self.used = cell_count; + kv.head = 0; + kv.used = cell_count; } - if (kv_self.recurrent) { + if (kv.recurrent) { for (uint32_t i = 0; i < cell_count; ++i) { - uint32_t cell_id = kv_self.head + i; + uint32_t cell_id = kv.head + i; // make sure the recurrent states will keep their restored state - kv_self.cells[cell_id].src = cell_id; + kv.cells[cell_id].src = cell_id; } } return true; } - bool read_kv_cache_data(struct llama_context * ctx, uint32_t cell_count) { - const struct llama_hparams & hparams = ctx->model.hparams; - struct llama_kv_cache & kv_self = ctx->kv_self; + bool read_kv_cache_data(llama_kv_cache & kv, const llama_hparams & hparams, uint32_t cell_count) { uint32_t v_trans; uint32_t n_layer; read_to(&v_trans, sizeof(v_trans)); @@ 
-1251,11 +1248,11 @@ struct llama_data_read { LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); return false; } - if (cell_count > kv_self.size) { - LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv_self.size); + if (cell_count > kv.size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv.size); return false; } - if (kv_self.v_trans != (bool) v_trans) { + if (kv.v_trans != (bool) v_trans) { LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); return false; } @@ -1267,7 +1264,7 @@ struct llama_data_read { // Read type of key int32_t k_type_i_ref; read_to(&k_type_i_ref, sizeof(k_type_i_ref)); - const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; + const int32_t k_type_i = (int32_t)kv.k_l[il]->type; if (k_type_i != k_type_i_ref) { LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); return false; @@ -1276,7 +1273,7 @@ struct llama_data_read { // Read row size of key uint64_t k_size_row_ref; read_to(&k_size_row_ref, sizeof(k_size_row_ref)); - const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + const size_t k_size_row = ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa); if (k_size_row != k_size_row_ref) { LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); return false; @@ -1284,18 +1281,18 @@ struct llama_data_read { if (cell_count) { // Read and set the keys for the whole cell range - ggml_backend_tensor_set(kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head * k_size_row, cell_count * k_size_row); + ggml_backend_tensor_set(kv.k_l[il], read(cell_count * k_size_row), kv.head * k_size_row, cell_count * k_size_row); } } - if (!kv_self.v_trans) { + if (!kv.v_trans) { for (uint32_t il = 0; il < n_layer; ++il) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); // Read type of value int32_t v_type_i_ref; read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + const int32_t v_type_i = (int32_t)kv.v_l[il]->type; if (v_type_i != v_type_i_ref) { LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); return false; @@ -1304,7 +1301,7 @@ struct llama_data_read { // Read row size of value uint64_t v_size_row_ref; read_to(&v_size_row_ref, sizeof(v_size_row_ref)); - const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); + const size_t v_size_row = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa); if (v_size_row != v_size_row_ref) { LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); return false; @@ -1312,7 +1309,7 @@ struct llama_data_read { if (cell_count) { // Read and set the values for the whole cell range - ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head * v_size_row, cell_count * v_size_row); + ggml_backend_tensor_set(kv.v_l[il], read(cell_count * v_size_row), kv.head * v_size_row, cell_count * v_size_row); } } } else { @@ -1323,7 +1320,7 @@ struct llama_data_read { // Read type of value int32_t v_type_i_ref; read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + const int32_t v_type_i = (int32_t)kv.v_l[il]->type; if (v_type_i != v_type_i_ref) { 
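                // the value type recorded in the session data does not match the type used by the
                // current cache, so its raw tensor bytes cannot be reinterpreted - abort the restore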
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); return false; @@ -1332,7 +1329,7 @@ struct llama_data_read { // Read element size of value uint32_t v_size_el_ref; read_to(&v_size_el_ref, sizeof(v_size_el_ref)); - const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + const size_t v_size_el = ggml_type_size(kv.v_l[il]->type); if (v_size_el != v_size_el_ref) { LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); return false; @@ -1349,8 +1346,8 @@ struct llama_data_read { if (cell_count) { // For each row in the transposed matrix, read the values for the whole cell range for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - const size_t dst_offset = (kv_self.head + j * kv_self.size) * v_size_el; - ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + const size_t dst_offset = (kv.head + j * kv.size) * v_size_el; + ggml_backend_tensor_set(kv.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); } } } @@ -1358,17 +1355,17 @@ struct llama_data_read { return true; } - void read_kv_cache(struct llama_context * ctx, llama_seq_id seq_id = -1) { + void read_kv_cache(llama_kv_cache & kv, const llama_hparams & hparams, llama_seq_id seq_id = -1) { uint32_t cell_count; read_to(&cell_count, sizeof(cell_count)); - bool res = read_kv_cache_meta(ctx, cell_count, seq_id) && read_kv_cache_data(ctx, cell_count); + bool res = read_kv_cache_meta(kv, cell_count, seq_id) && read_kv_cache_data(kv, hparams, cell_count); if (!res) { if (seq_id == -1) { - ctx->kv_self.clear(); + kv.clear(); } else { - ctx->kv_self.seq_rm(seq_id, -1, -1); + kv.seq_rm(seq_id, -1, -1); } throw std::runtime_error("failed to restore kv cache"); } @@ -1521,7 +1518,7 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da data_ctx.write_logits(ctx); data_ctx.write_embeddings(ctx); - data_ctx.write_kv_cache(ctx); + data_ctx.write_kv_cache(ctx->kv_self, ctx->model.hparams); return data_ctx.get_size_written(); } @@ -1558,7 +1555,7 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da data_ctx.read_logits(ctx); data_ctx.read_embeddings(ctx); - data_ctx.read_kv_cache(ctx); + data_ctx.read_kv_cache(ctx->kv_self, ctx->model.hparams); return data_ctx.get_size_read(); } @@ -1654,7 +1651,7 @@ bool llama_state_save_file(struct llama_context * ctx, const char * path_session static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) { llama_synchronize(ctx); - data_ctx.write_kv_cache(ctx, seq_id); + data_ctx.write_kv_cache(ctx->kv_self, ctx->model.hparams, seq_id); return data_ctx.get_size_written(); } @@ -1677,7 +1674,7 @@ size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_ static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) { llama_synchronize(ctx); - data_ctx.read_kv_cache(ctx, dest_seq_id); + data_ctx.read_kv_cache(ctx->kv_self, ctx->model.hparams, dest_seq_id); return data_ctx.get_size_read(); } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 97285481e..7fc2fabf5 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -44,6 +44,7 @@ struct llama_kv_cache_slot_info { // ring-buffer of cached KV data // TODO: pimpl +// TODO: add notion of max sequences struct llama_kv_cache { bool 
has_shift = false; bool do_defrag = false; From fd05ab87aad1221535da86d5cd810ee5856ebb49 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 14 Jan 2025 13:13:35 +0200 Subject: [PATCH 07/28] kv_cache : move state read/write to llama_kv_cache ggml-ci --- src/llama-context.cpp | 424 +++++------------------------------------ src/llama-kv-cache.cpp | 378 ++++++++++++++++++++++++++++++++++++ src/llama-kv-cache.h | 20 ++ 3 files changed, 446 insertions(+), 376 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8fc6de2f2..0e146652c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -908,143 +908,6 @@ struct llama_data_write { write(ctx->embd, embeddings_size * sizeof(float)); } } - - void write_kv_cache_meta(const llama_kv_cache & kv_self, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) { - for (const auto & range : cell_ranges) { - for (uint32_t i = range.first; i < range.second; ++i) { - const auto & cell = kv_self.cells[i]; - const llama_pos pos = cell.pos; - const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0; - - write(&pos, sizeof(pos)); - write(&n_seq_id, sizeof(n_seq_id)); - - if (n_seq_id) { - for (auto seq_id : cell.seq_id) { - write(&seq_id, sizeof(seq_id)); - } - } - } - } - } - - void write_kv_cache_data(const llama_kv_cache & kv, const llama_hparams & hparams, const std::vector> & cell_ranges) { - const uint32_t v_trans = kv.v_trans ? 1 : 0; - const uint32_t n_layer = hparams.n_layer; - - write(&v_trans, sizeof(v_trans)); - write(&n_layer, sizeof(n_layer)); - - std::vector tmp_buf; - - // Iterate and write all the keys first, each row is a cell - // Get whole range at a time - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - - // Write key type - const int32_t k_type_i = (int32_t)kv.k_l[il]->type; - write(&k_type_i, sizeof(k_type_i)); - - // Write row size of key - const uint64_t k_size_row = ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa); - write(&k_size_row, sizeof(k_size_row)); - - // Read each range of cells of k_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t buf_size = range_size * k_size_row; - write_tensor_data(kv.k_l[il], range.first * k_size_row, buf_size); - } - } - - if (!kv.v_trans) { - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Write value type - const int32_t v_type_i = (int32_t)kv.v_l[il]->type; - write(&v_type_i, sizeof(v_type_i)); - - // Write row size of value - const uint64_t v_size_row = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa); - write(&v_size_row, sizeof(v_size_row)); - - // Read each range of cells of v_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t buf_size = range_size * v_size_row; - write_tensor_data(kv.v_l[il], range.first * v_size_row, buf_size); - } - } - } else { - // When v is transposed, we also need the element size and get the element ranges from each row - const uint32_t kv_size = kv.size; - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Write value type - const int32_t v_type_i = (int32_t)kv.v_l[il]->type; - write(&v_type_i, sizeof(v_type_i)); - - // Write element size - const uint32_t v_size_el = 
ggml_type_size(kv.v_l[il]->type); - write(&v_size_el, sizeof(v_size_el)); - - // Write GQA embedding size - write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); - - // For each row, we get the element values of each cell - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - // Read each range of cells of v_size_el length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t src_offset = (range.first + j * kv_size) * v_size_el; - const size_t buf_size = range_size * v_size_el; - write_tensor_data(kv.v_l[il], src_offset, buf_size); - } - } - } - } - } - - void write_kv_cache(const llama_kv_cache & kv, const llama_hparams & hparams, llama_seq_id seq_id = -1) { - std::vector> cell_ranges; // ranges, from inclusive, to exclusive - uint32_t cell_count = 0; - - // Count the number of cells with the specified seq_id - // Find all the ranges of cells with this seq id (or all, when -1) - uint32_t cell_range_begin = kv.size; - for (uint32_t i = 0; i < kv.size; ++i) { - const auto & cell = kv.cells[i]; - if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { - ++cell_count; - if (cell_range_begin == kv.size) { - cell_range_begin = i; - } - } else { - if (cell_range_begin != kv.size) { - cell_ranges.emplace_back(cell_range_begin, i); - cell_range_begin = kv.size; - } - } - } - if (cell_range_begin != kv.size) { - cell_ranges.emplace_back(cell_range_begin, kv.size); - } - - // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count - uint32_t cell_count_check = 0; - for (const auto & range : cell_ranges) { - cell_count_check += range.second - range.first; - } - GGML_ASSERT(cell_count == cell_count_check); - - write(&cell_count, sizeof(cell_count)); - - write_kv_cache_meta(kv, cell_ranges, seq_id); - write_kv_cache_data(kv, hparams, cell_ranges); - } }; struct llama_data_read { @@ -1135,241 +998,6 @@ struct llama_data_read { read_to(ctx->embd, embeddings_size * sizeof(float)); } } - - bool read_kv_cache_meta(llama_kv_cache & kv, uint32_t cell_count, llama_seq_id dest_seq_id = -1) { - if (dest_seq_id != -1) { - // single sequence - - kv.seq_rm(dest_seq_id, -1, -1); - - llama_sbatch sbatch; - llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); - - batch.n_tokens = cell_count; - batch.n_seq_tokens = cell_count; - batch.n_seqs = 1; - - for (uint32_t i = 0; i < cell_count; ++i) { - llama_pos pos; - uint32_t n_seq_id; - - read_to(&pos, sizeof(pos)); - read_to(&n_seq_id, sizeof(n_seq_id)); - - if (n_seq_id != 0) { - LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); - return false; - } - - batch.pos[i] = pos; - } - batch.n_seq_id[0] = 1; - batch.seq_id[0] = &dest_seq_id; - if (!kv.find_slot(batch)) { - LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); - return false; - } - - // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) - // Assume that this is one contiguous block of cells - GGML_ASSERT(kv.head + cell_count <= kv.size); - GGML_ASSERT(kv.cells[kv.head].pos == batch.pos[0]); - GGML_ASSERT(kv.cells[kv.head + cell_count - 1].pos == batch.pos[cell_count - 1]); - GGML_ASSERT(kv.cells[kv.head].has_seq_id(dest_seq_id)); - GGML_ASSERT(kv.cells[kv.head + cell_count - 1].has_seq_id(dest_seq_id)); - } else { - // whole KV cache restore - - if (cell_count > kv.size) { - LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); - return false; - } - - 
kv.clear(); - - for (uint32_t i = 0; i < cell_count; ++i) { - llama_kv_cell & cell = kv.cells[i]; - - llama_pos pos; - uint32_t n_seq_id; - - read_to(&pos, sizeof(pos)); - read_to(&n_seq_id, sizeof(n_seq_id)); - - cell.pos = pos; - - for (uint32_t j = 0; j < n_seq_id; ++j) { - llama_seq_id seq_id; - read_to(&seq_id, sizeof(seq_id)); - - // TODO: llama_kv_cache should have a notion of max sequences - //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { - if (seq_id < 0) { - //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); - LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); - return false; - } - - cell.seq_id.insert(seq_id); - - if (kv.recurrent) { - int32_t & tail = kv.cells[seq_id].tail; - if (tail != -1) { - LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); - return false; - } - tail = i; - } - } - } - - kv.head = 0; - kv.used = cell_count; - } - - if (kv.recurrent) { - for (uint32_t i = 0; i < cell_count; ++i) { - uint32_t cell_id = kv.head + i; - // make sure the recurrent states will keep their restored state - kv.cells[cell_id].src = cell_id; - } - } - - return true; - } - - bool read_kv_cache_data(llama_kv_cache & kv, const llama_hparams & hparams, uint32_t cell_count) { - uint32_t v_trans; - uint32_t n_layer; - read_to(&v_trans, sizeof(v_trans)); - read_to(&n_layer, sizeof(n_layer)); - - if (n_layer != hparams.n_layer) { - LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); - return false; - } - if (cell_count > kv.size) { - LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv.size); - return false; - } - if (kv.v_trans != (bool) v_trans) { - LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); - return false; - } - - // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - - // Read type of key - int32_t k_type_i_ref; - read_to(&k_type_i_ref, sizeof(k_type_i_ref)); - const int32_t k_type_i = (int32_t)kv.k_l[il]->type; - if (k_type_i != k_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); - return false; - } - - // Read row size of key - uint64_t k_size_row_ref; - read_to(&k_size_row_ref, sizeof(k_size_row_ref)); - const size_t k_size_row = ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa); - if (k_size_row != k_size_row_ref) { - LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); - return false; - } - - if (cell_count) { - // Read and set the keys for the whole cell range - ggml_backend_tensor_set(kv.k_l[il], read(cell_count * k_size_row), kv.head * k_size_row, cell_count * k_size_row); - } - } - - if (!kv.v_trans) { - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Read type of value - int32_t v_type_i_ref; - read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)kv.v_l[il]->type; - if (v_type_i != v_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return false; - } - - // Read row size of value - uint64_t v_size_row_ref; 
- read_to(&v_size_row_ref, sizeof(v_size_row_ref)); - const size_t v_size_row = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa); - if (v_size_row != v_size_row_ref) { - LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); - return false; - } - - if (cell_count) { - // Read and set the values for the whole cell range - ggml_backend_tensor_set(kv.v_l[il], read(cell_count * v_size_row), kv.head * v_size_row, cell_count * v_size_row); - } - } - } else { - // For each layer, read the values for each cell (transposed) - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Read type of value - int32_t v_type_i_ref; - read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)kv.v_l[il]->type; - if (v_type_i != v_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return false; - } - - // Read element size of value - uint32_t v_size_el_ref; - read_to(&v_size_el_ref, sizeof(v_size_el_ref)); - const size_t v_size_el = ggml_type_size(kv.v_l[il]->type); - if (v_size_el != v_size_el_ref) { - LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); - return false; - } - - // Read GQA embedding size - uint32_t n_embd_v_gqa_ref; - read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); - if (n_embd_v_gqa != n_embd_v_gqa_ref) { - LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); - return false; - } - - if (cell_count) { - // For each row in the transposed matrix, read the values for the whole cell range - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - const size_t dst_offset = (kv.head + j * kv.size) * v_size_el; - ggml_backend_tensor_set(kv.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); - } - } - } - } - return true; - } - - void read_kv_cache(llama_kv_cache & kv, const llama_hparams & hparams, llama_seq_id seq_id = -1) { - uint32_t cell_count; - read_to(&cell_count, sizeof(cell_count)); - - bool res = read_kv_cache_meta(kv, cell_count, seq_id) && read_kv_cache_data(kv, hparams, cell_count); - - if (!res) { - if (seq_id == -1) { - kv.clear(); - } else { - kv.seq_rm(seq_id, -1, -1); - } - throw std::runtime_error("failed to restore kv cache"); - } - } }; struct llama_data_write_dummy : llama_data_write { @@ -1518,7 +1146,18 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da data_ctx.write_logits(ctx); data_ctx.write_embeddings(ctx); - data_ctx.write_kv_cache(ctx->kv_self, ctx->model.hparams); + llama_kv_cache::io io = { + /* .write =*/ [&](const void * src, size_t size) { + data_ctx.write(src, size); + }, + /* .write_tensor_data =*/ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + data_ctx.write_tensor_data(tensor, offset, size); + }, + /* .read =*/ nullptr, + /* .read_to =*/ nullptr, + }; + + ctx->kv_self.state_write(io, ctx->model.hparams); return data_ctx.get_size_written(); } @@ -1555,7 +1194,18 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da data_ctx.read_logits(ctx); data_ctx.read_embeddings(ctx); - data_ctx.read_kv_cache(ctx->kv_self, ctx->model.hparams); + llama_kv_cache::io io = { + /* .write =*/ nullptr, + /* .write_tensor_data =*/ nullptr, + /* .read =*/ [&](size_t size) { + return 
data_ctx.read(size); + }, + /* .read_to =*/ [&](void * dst, size_t size) { + data_ctx.read_to(dst, size); + }, + }; + + ctx->kv_self.state_read(io, ctx->model.hparams); return data_ctx.get_size_read(); } @@ -1651,7 +1301,18 @@ bool llama_state_save_file(struct llama_context * ctx, const char * path_session static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) { llama_synchronize(ctx); - data_ctx.write_kv_cache(ctx->kv_self, ctx->model.hparams, seq_id); + llama_kv_cache::io io = { + /* .write =*/ [&](const void * src, size_t size) { + data_ctx.write(src, size); + }, + /* .write_tensor_data =*/ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + data_ctx.write_tensor_data(tensor, offset, size); + }, + /* .read =*/ nullptr, + /* .read_to =*/ nullptr, + }; + + ctx->kv_self.state_write(io, ctx->model.hparams, seq_id); return data_ctx.get_size_written(); } @@ -1674,7 +1335,18 @@ size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_ static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) { llama_synchronize(ctx); - data_ctx.read_kv_cache(ctx->kv_self, ctx->model.hparams, dest_seq_id); + llama_kv_cache::io io = { + /* .write =*/ nullptr, + /* .write_tensor_data =*/ nullptr, + /* .read =*/ [&](size_t size) { + return data_ctx.read(size); + }, + /* .read_to =*/ [&](void * dst, size_t size) { + data_ctx.read_to(dst, size); + }, + }; + + ctx->kv_self.state_read(io, ctx->model.hparams, dest_seq_id); return data_ctx.get_size_read(); } diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 9f3b4e514..6886d24f0 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -8,6 +8,7 @@ #include #include #include +#include static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; @@ -696,6 +697,383 @@ size_t llama_kv_cache::size_v_bytes() const { return size_v_bytes; } +void llama_kv_cache::state_write(const io & io, const llama_hparams & hparams, llama_seq_id seq_id) const { + std::vector> cell_ranges; // ranges, from inclusive, to exclusive + uint32_t cell_count = 0; + + // Count the number of cells with the specified seq_id + // Find all the ranges of cells with this seq id (or all, when -1) + uint32_t cell_range_begin = size; + for (uint32_t i = 0; i < size; ++i) { + const auto & cell = cells[i]; + if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { + ++cell_count; + if (cell_range_begin == size) { + cell_range_begin = i; + } + } else { + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, i); + cell_range_begin = size; + } + } + } + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, size); + } + + // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count + uint32_t cell_count_check = 0; + for (const auto & range : cell_ranges) { + cell_count_check += range.second - range.first; + } + GGML_ASSERT(cell_count == cell_count_check); + + io.write(&cell_count, sizeof(cell_count)); + + state_write_meta(io, cell_ranges, seq_id); + state_write_data(io, cell_ranges, hparams); +} + +void llama_kv_cache::state_read(const io & io, const llama_hparams & hparams, llama_seq_id seq_id) { + uint32_t cell_count; + io.read_to(&cell_count, sizeof(cell_count)); + + bool res = true; + res = res && state_read_meta(io, cell_count, seq_id); + res = res && state_read_data(io, hparams, cell_count); + + if (!res) { + 
if (seq_id == -1) { + clear(); + } else { + seq_rm(seq_id, -1, -1); + } + throw std::runtime_error("failed to restore kv cache"); + } +} + +void llama_kv_cache::state_write_meta(const io & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { + for (const auto & range : cell_ranges) { + for (uint32_t i = range.first; i < range.second; ++i) { + const auto & cell = cells[i]; + const llama_pos pos = cell.pos; + const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0; + + io.write(&pos, sizeof(pos)); + io.write(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id) { + for (auto seq_id : cell.seq_id) { + io.write(&seq_id, sizeof(seq_id)); + } + } + } + } +} + +void llama_kv_cache::state_write_data(const io & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const { + const uint32_t v_trans = this->v_trans ? 1 : 0; + const uint32_t n_layer = hparams.n_layer; + + io.write(&v_trans, sizeof(v_trans)); + io.write(&n_layer, sizeof(n_layer)); + + std::vector tmp_buf; + + // Iterate and write all the keys first, each row is a cell + // Get whole range at a time + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Write key type + const int32_t k_type_i = (int32_t)k_l[il]->type; + io.write(&k_type_i, sizeof(k_type_i)); + + // Write row size of key + const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + io.write(&k_size_row, sizeof(k_size_row)); + + // Read each range of cells of k_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * k_size_row; + io.write_tensor_data(k_l[il], range.first * k_size_row, buf_size); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write row size of value + const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + io.write(&v_size_row, sizeof(v_size_row)); + + // Read each range of cells of v_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * v_size_row; + io.write_tensor_data(v_l[il], range.first * v_size_row, buf_size); + } + } + } else { + // When v is transposed, we also need the element size and get the element ranges from each row + const uint32_t kv_size = size; + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write element size + const uint32_t v_size_el = ggml_type_size(v_l[il]->type); + io.write(&v_size_el, sizeof(v_size_el)); + + // Write GQA embedding size + io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); + + // For each row, we get the element values of each cell + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + // Read each range of cells of v_size_el length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t src_offset = (range.first + j * kv_size) * v_size_el; + const size_t buf_size = range_size * v_size_el; + io.write_tensor_data(v_l[il], 
src_offset, buf_size); + } + } + } +} + +bool llama_kv_cache::state_read_meta(const io & io, uint32_t cell_count, llama_seq_id dest_seq_id) { + if (dest_seq_id != -1) { + // single sequence + + seq_rm(dest_seq_id, -1, -1); + + llama_sbatch sbatch; + llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); + + batch.n_tokens = cell_count; + batch.n_seq_tokens = cell_count; + batch.n_seqs = 1; + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id != 0) { + LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); + return false; + } + + batch.pos[i] = pos; + } + batch.n_seq_id[0] = 1; + batch.seq_id[0] = &dest_seq_id; + if (!find_slot(batch)) { + LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); + return false; + } + + // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // Assume that this is one contiguous block of cells + GGML_ASSERT(head + cell_count <= size); + GGML_ASSERT(cells[head].pos == batch.pos[0]); + GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]); + GGML_ASSERT(cells[head].has_seq_id(dest_seq_id)); + GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id)); + } else { + // whole KV cache restore + + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); + return false; + } + + clear(); + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_kv_cell & cell = cells[i]; + + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + cell.pos = pos; + + for (uint32_t j = 0; j < n_seq_id; ++j) { + llama_seq_id seq_id; + io.read_to(&seq_id, sizeof(seq_id)); + + // TODO: llama_kv_cache should have a notion of max sequences + //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { + if (seq_id < 0) { + //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); + return false; + } + + cell.seq_id.insert(seq_id); + + if (recurrent) { + int32_t & tail = cells[seq_id].tail; + if (tail != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); + return false; + } + tail = i; + } + } + } + + head = 0; + used = cell_count; + } + + if (recurrent) { + for (uint32_t i = 0; i < cell_count; ++i) { + uint32_t cell_id = head + i; + // make sure the recurrent states will keep their restored state + cells[cell_id].src = cell_id; + } + } + + return true; +} + +bool llama_kv_cache::state_read_data(const io & io, const llama_hparams & hparams, uint32_t cell_count) { + uint32_t v_trans; + uint32_t n_layer; + io.read_to(&v_trans, sizeof(v_trans)); + io.read_to(&n_layer, sizeof(n_layer)); + + if (n_layer != hparams.n_layer) { + LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); + return false; + } + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size); + return false; + } + if (this->v_trans != (bool) v_trans) { + LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); + return false; + } + + // For each layer, read the keys for each cell, one row is one cell, read as one
contiguous block + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Read type of key + int32_t k_type_i_ref; + io.read_to(&k_type_i_ref, sizeof(k_type_i_ref)); + const int32_t k_type_i = (int32_t) k_l[il]->type; + if (k_type_i != k_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); + return false; + } + + // Read row size of key + uint64_t k_size_row_ref; + io.read_to(&k_size_row_ref, sizeof(k_size_row_ref)); + const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + if (k_size_row != k_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the keys for the whole cell range + ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read row size of value + uint64_t v_size_row_ref; + io.read_to(&v_size_row_ref, sizeof(v_size_row_ref)); + const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + if (v_size_row != v_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the values for the whole cell range + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); + } + } + } else { + // For each layer, read the values for each cell (transposed) + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read element size of value + uint32_t v_size_el_ref; + io.read_to(&v_size_el_ref, sizeof(v_size_el_ref)); + const size_t v_size_el = ggml_type_size(v_l[il]->type); + if (v_size_el != v_size_el_ref) { + LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); + return false; + } + + // Read GQA embedding size + uint32_t n_embd_v_gqa_ref; + io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); + if (n_embd_v_gqa != n_embd_v_gqa_ref) { + LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); + return false; + } + + if (cell_count) { + // For each row in the transposed matrix, read the values for the whole cell range + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const size_t dst_offset = (head + j * size) * v_size_el; + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + } + } 
+ } + } + + return true; +} + +///////////// + void llama_kv_cache_clear(llama_kv_cache * kv) { kv->clear(); } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 7fc2fabf5..0384a2b7c 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -6,8 +6,10 @@ #include #include +#include struct llama_cparams; +struct llama_hparams; struct llama_ubatch; struct llama_kv_cell { @@ -45,6 +47,7 @@ struct llama_kv_cache_slot_info { // ring-buffer of cached KV data // TODO: pimpl // TODO: add notion of max sequences +// TODO: add llama_hparams & struct llama_kv_cache { bool has_shift = false; bool do_defrag = false; @@ -111,12 +114,29 @@ struct llama_kv_cache { size_t size_k_bytes() const; size_t size_v_bytes() const; + struct io { + std::function write; + std::function write_tensor_data; + + std::function read; + std::function read_to; + }; + + void state_write(const io & io, const llama_hparams & hparams, llama_seq_id seq_id = -1) const; + void state_read (const io & io, const llama_hparams & hparams, llama_seq_id seq_id = -1); + private: ggml_type type_k = GGML_TYPE_F16; ggml_type type_v = GGML_TYPE_F16; std::vector ctxs; std::vector bufs; + + void state_write_meta(const io & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; + void state_write_data(const io & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const; + + bool state_read_meta(const io & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1); + bool state_read_data(const io & io, const llama_hparams & hparams, uint32_t cell_count); }; // From 17b363afd3575f8f9d025a35d2abb75f528a64c2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 14 Jan 2025 16:47:34 +0200 Subject: [PATCH 08/28] llama : update llama_kv_self API ggml-ci --- common/common.cpp | 6 +- common/speculative.cpp | 10 +- examples/batched-bench/batched-bench.cpp | 6 +- examples/batched.swift/Sources/main.swift | 2 +- .../cvector-generator/cvector-generator.cpp | 3 +- examples/embedding/embedding.cpp | 5 +- examples/gritlm/gritlm.cpp | 8 +- examples/imatrix/imatrix.cpp | 4 +- examples/infill/infill.cpp | 6 +- examples/llama-bench/llama-bench.cpp | 6 +- .../llama/src/main/cpp/llama-android.cpp | 8 +- .../llama.cpp.swift/LibLlama.swift | 8 +- examples/lookahead/lookahead.cpp | 13 +- examples/lookup/lookup.cpp | 3 +- examples/main/main.cpp | 14 +- examples/parallel/parallel.cpp | 11 +- examples/passkey/passkey.cpp | 30 ++-- examples/perplexity/perplexity.cpp | 24 +-- examples/retrieval/retrieval.cpp | 4 +- examples/run/run.cpp | 7 +- examples/save-load-state/save-load-state.cpp | 4 +- examples/server/server.cpp | 25 ++- examples/simple-chat/simple-chat.cpp | 6 +- .../speculative-simple/speculative-simple.cpp | 4 +- examples/speculative/speculative.cpp | 29 ++-- include/llama.h | 105 ++++++++++--- src/llama-context.cpp | 34 ++-- src/llama-kv-cache.cpp | 20 +-- src/llama-kv-cache.h | 42 +++++ src/llama.cpp | 145 +++++++++++++++++- 30 files changed, 387 insertions(+), 205 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 29de45189..098feebee 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -952,9 +952,7 @@ struct common_init_result common_init_from_params(common_params & params) { return iparams; } - llama_kv_cache * kv = llama_get_kv_cache(lctx); - - if (params.ctx_shift && !llama_kv_cache_can_shift(kv)) { + if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) { LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__); params.ctx_shift = 
false; } @@ -1059,7 +1057,7 @@ struct common_init_result common_init_from_params(common_params & params) { if (llama_model_has_decoder(model)) { llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch))); } - llama_kv_cache_clear(kv); + llama_kv_self_clear(lctx); llama_synchronize(lctx); llama_perf_context_reset(lctx); } diff --git a/common/speculative.cpp b/common/speculative.cpp index 6ac058517..a660f198a 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -171,10 +171,8 @@ llama_tokens common_speculative_gen_draft( llama_tokens result; result.reserve(params.n_draft); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - if (reuse_n == 0) { - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); prompt.clear(); } else { @@ -193,14 +191,14 @@ llama_tokens common_speculative_gen_draft( } if (reuse_i > 0) { - llama_kv_cache_seq_rm (kv, 0, 0, reuse_i); - llama_kv_cache_seq_add(kv, 0, reuse_i, -1, -reuse_i); + llama_kv_self_seq_rm (ctx, 0, 0, reuse_i); + llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i); prompt.erase(prompt.begin(), prompt.begin() + reuse_i); } if (reuse_n < (int) prompt.size()) { - llama_kv_cache_seq_rm (kv, 0, reuse_n, -1); + llama_kv_self_seq_rm (ctx, 0, reuse_n, -1); prompt.erase(prompt.begin() + reuse_n, prompt.end()); } diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index fcbad37bb..430e8be51 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -57,8 +57,6 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const int32_t n_kv_max = llama_n_ctx(ctx); llama_batch batch = llama_batch_init(n_kv_max, 0, 1); @@ -134,7 +132,7 @@ int main(int argc, char ** argv) { const auto t_pp_start = ggml_time_us(); - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); @@ -143,7 +141,7 @@ int main(int argc, char ** argv) { if (is_pp_shared) { for (int32_t i = 1; i < pl; ++i) { - llama_kv_cache_seq_cp(kv, 0, i, -1, -1); + llama_kv_self_seq_cp(ctx, 0, i, -1, -1); } } diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 371917b2e..a6494ebdf 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -111,7 +111,7 @@ if llama_decode(context, batch) != 0 { } for i in 1 ..< n_parallel { - llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens) + llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens) } if n_parallel > 1 { diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index adb4a60ad..3733e32d7 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -342,8 +342,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { } static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { - llama_kv_cache * kv = llama_get_kv_cache(ctx); - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index fda0949f1..c4fb1c6d1 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ 
-34,11 +34,10 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) { const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); - const llama_model * model = llama_get_model(ctx); - llama_kv_cache * kv = llama_get_kv_cache(ctx); + const struct llama_model * model = llama_get_model(ctx); // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 16437453e..f7db7861c 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -13,8 +13,6 @@ static std::vector> encode(llama_context * ctx, const std::ve const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); for (uint64_t i = 0; i < sentences.size(); i++) { @@ -47,7 +45,7 @@ static std::vector> encode(llama_context * ctx, const std::ve } // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_set_embeddings(ctx, true); llama_set_causal_attn(ctx, false); @@ -102,11 +100,9 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - llama_token eos_token = llama_vocab_eos(vocab); - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_set_embeddings(ctx, false); llama_set_causal_attn(ctx, true); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 5efe4f019..e335ecc74 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -431,8 +431,6 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const bool add_bos = llama_vocab_get_add_bos(vocab); const int n_ctx = llama_n_ctx(ctx); @@ -499,7 +497,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_batch batch = llama_batch_init(n_batch, 0, 1); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index de8e77695..4e2f7b727 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -139,8 +139,6 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); const int n_ctx_train = llama_model_n_ctx_train(model); @@ -334,8 +332,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (kv, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(kv, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 
1); + llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); n_past -= n_discard; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 8843c0048..fc58135fe 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1575,11 +1575,9 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - test t(inst, lmodel, ctx); - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // cool off before the test if (params.delay) { @@ -1619,7 +1617,7 @@ int main(int argc, char ** argv) { } for (int i = 0; i < params.reps; i++) { - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); uint64_t t_start = get_time_ns(); diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 2a73983a9..cf5e14907 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( } batch->logits[batch->n_tokens - 1] = true; - llama_kv_cache_clear(context); + llama_kv_self_clear(context); const auto t_pp_start = ggml_time_us(); if (llama_decode(context, *batch) != 0) { @@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( LOGi("Benchmark text generation (tg)"); - llama_kv_cache_clear(context); + llama_kv_self_clear(context); const auto t_tg_start = ggml_time_us(); for (i = 0; i < tg; i++) { @@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( const auto t_tg_end = ggml_time_us(); - llama_kv_cache_clear(context); + llama_kv_self_clear(context); const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0; const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0; @@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( extern "C" JNIEXPORT void JNICALL Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) { - llama_kv_cache_clear(reinterpret_cast(context)); + llama_kv_self_clear(reinterpret_cast(context)); } diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 477c3e6f2..82c26935b 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -208,7 +208,7 @@ actor LlamaContext { } batch.logits[Int(batch.n_tokens) - 1] = 1 // true - llama_kv_cache_clear(context) + llama_kv_self_clear(context) let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000; @@ -221,7 +221,7 @@ actor LlamaContext { // bench text generation - llama_kv_cache_clear(context) + llama_kv_self_clear(context) let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000; @@ -240,7 +240,7 @@ actor LlamaContext { let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000; - llama_kv_cache_clear(context) + llama_kv_self_clear(context) let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0 let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0 @@ -290,7 +290,7 @@ actor LlamaContext { func clear() { tokens_list.removeAll() temporary_invalid_cchars.removeAll() - llama_kv_cache_clear(context) + llama_kv_self_clear(context) } private func tokenize(text: String, add_bos: Bool) -> [llama_token] { diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 1219c2074..b7f334007 100644 --- a/examples/lookahead/lookahead.cpp +++ 
b/examples/lookahead/lookahead.cpp @@ -60,7 +60,6 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); - llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -96,7 +95,7 @@ int main(int argc, char ** argv) { llama_decode(ctx, llama_batch_get_one(&inp.back(), 1)); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(kv, 0, s, -1, -1); + llama_kv_self_seq_cp(ctx, 0, s, -1, -1); } const auto t_enc_end = ggml_time_us(); @@ -438,17 +437,17 @@ int main(int argc, char ** argv) { // KV cache management // if no verification token matched, we simply remove all cells from this batch -> no fragmentation - llama_kv_cache_seq_rm(kv, -1, n_past, -1); + llama_kv_self_seq_rm(ctx, -1, n_past, -1); if (seq_id_best != 0) { // if a verification token matched, we keep the best sequence and remove the rest // this leads to some KV cache fragmentation - llama_kv_cache_seq_keep(kv, seq_id_best); - llama_kv_cache_seq_cp (kv, seq_id_best, 0, -1, -1); - llama_kv_cache_seq_rm (kv, seq_id_best, -1, -1); + llama_kv_self_seq_keep(ctx, seq_id_best); + llama_kv_self_seq_cp (ctx, seq_id_best, 0, -1, -1); + llama_kv_self_seq_rm (ctx, seq_id_best, -1, -1); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(kv, 0, s, -1, -1); + llama_kv_self_seq_cp(ctx, 0, s, -1, -1); } } } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 8628f7318..4ae93b2a5 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -35,7 +35,6 @@ int main(int argc, char ** argv){ llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); - llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -193,7 +192,7 @@ int main(int argc, char ** argv){ // KV cache management // clean the cache of draft tokens that weren't accepted - llama_kv_cache_seq_rm(kv, 0, n_past, -1); + llama_kv_self_seq_rm(ctx, 0, n_past, -1); common_batch_clear(batch_tgt); common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 9d79af79e..23437937c 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -157,8 +157,6 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); auto chat_templates = common_chat_templates_from_model(model, params.chat_template); @@ -330,7 +328,7 @@ int main(int argc, char ** argv) { } // remove any "future" tokens that we might have inherited from the previous session - llama_kv_cache_seq_rm(kv, -1, n_matching_session_tokens, -1); + llama_kv_self_seq_rm(ctx, -1, n_matching_session_tokens, -1); } LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", @@ -571,8 +569,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (kv, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_add(kv, 0, params.n_keep + n_discard, n_past, -n_discard); + llama_kv_self_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); n_past -= n_discard; @@ -595,9 +593,9 @@ int 
main(int argc, char ** argv) { LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); - llama_kv_cache_seq_add(kv, 0, ga_i, n_past, ib*bd); - llama_kv_cache_seq_div(kv, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); - llama_kv_cache_seq_add(kv, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); + llama_kv_self_seq_add(ctx, 0, ga_i, n_past, ib*bd); + llama_kv_self_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); + llama_kv_self_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); n_past -= bd; diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 2ba0706dc..3f9e1bcbb 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -134,7 +134,6 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); - llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -202,7 +201,7 @@ int main(int argc, char ** argv) { // assign the system KV cache to all parallel sequences for (int32_t i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_cp(kv, 0, i, -1, -1); + llama_kv_self_seq_cp(ctx, 0, i, -1, -1); } LOG_INF("\n"); @@ -234,9 +233,9 @@ int main(int argc, char ** argv) { if (batch.n_tokens == 0) { // all sequences have ended - clear the entire KV cache for (int i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_rm(kv, i, -1, -1); + llama_kv_self_seq_rm(ctx, i, -1, -1); // but keep the system prompt - llama_kv_cache_seq_cp(kv, 0, i, -1, -1); + llama_kv_self_seq_cp(ctx, 0, i, -1, -1); } LOG_INF("%s: clearing the KV cache\n", __func__); @@ -372,8 +371,8 @@ int main(int argc, char ** argv) { } // delete only the generated part of the sequence, i.e. 
keep the system prompt in the cache - llama_kv_cache_seq_rm(kv, client.id + 1, -1, -1); - llama_kv_cache_seq_cp(kv, 0, client.id + 1, -1, -1); + llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1); + llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1); const auto t_main_end = ggml_time_us(); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index e2764313b..46de2c2a2 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -86,8 +86,6 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - auto sparams = llama_sampler_chain_default_params(); llama_sampler * smpl = llama_sampler_chain_init(sparams); @@ -134,11 +132,11 @@ int main(int argc, char ** argv) { const int ib = i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_cache_seq_add(kv, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_cache_seq_div(kv, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - llama_update_kv_cache (ctx, kv); + llama_kv_self_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd); + llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_kv_self_update (ctx); - n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; + n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; } common_batch_clear(batch); @@ -168,12 +166,12 @@ int main(int argc, char ** argv) { LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard); - llama_kv_cache_seq_rm (kv, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(kv, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (kv); - llama_update_kv_cache (ctx, kv); + llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_self_defrag (ctx); + llama_kv_self_update (ctx); - n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; + n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; common_batch_clear(batch); @@ -199,12 +197,12 @@ int main(int argc, char ** argv) { if (n_discard > 0) { LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); - llama_kv_cache_seq_rm (kv, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(kv, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (kv); - llama_update_kv_cache (ctx, kv); + llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_self_defrag (ctx); + llama_kv_self_update (ctx); - n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; + n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; } } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 6c9f716ed..31c436f13 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -299,8 +299,6 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const bool add_bos = llama_vocab_get_add_bos(vocab); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); @@ -362,7 +360,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_batch batch = llama_batch_init(n_batch, 0, 1); @@ -452,8 +450,6 @@ static results_perplexity 
perplexity(llama_context * ctx, const common_params & const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const bool add_bos = llama_vocab_get_add_bos(vocab); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); @@ -550,7 +546,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; @@ -745,8 +741,6 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - // Calculates hellaswag score (acc_norm) from prompt // // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl @@ -929,7 +923,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { return; } - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1090,8 +1084,6 @@ static void winogrande_score(llama_context * ctx, const common_params & params) const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - constexpr int k_min_trailing_ctx = 3; auto data = load_winogrande_from_csv(params.prompt); @@ -1210,7 +1202,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) return; } - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1396,8 +1388,6 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - std::istringstream strstream(params.prompt); uint32_t n_task; strstream.read((char *)&n_task, sizeof(n_task)); @@ -1584,7 +1574,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par return; } - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1681,8 +1671,6 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - if (params.logits_file.empty()) { LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); return; @@ -1776,7 +1764,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { } // clear the KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_batch batch = llama_batch_init(n_batch, 0, 1); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index a907ea076..0efe20d4b 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -82,10 +82,8 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke } static 
void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { - llama_kv_cache * kv = llama_get_kv_cache(ctx); - // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 8e2c174a9..2c38d1ef6 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -756,8 +756,7 @@ static int apply_chat_template(const common_chat_template & tmpl, LlamaData & ll // Function to tokenize the prompt static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt, std::vector & prompt_tokens, const LlamaData & llama_data) { - const llama_kv_cache * kv = llama_get_kv_cache(llama_data.context.get()); - const bool is_first = llama_kv_cache_used_cells(kv) == 0; + const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0; const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); prompt_tokens.resize(n_prompt_tokens); @@ -772,10 +771,8 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt // Check if we have enough space in the context to evaluate this batch static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { - llama_kv_cache * kv = llama_get_kv_cache(ctx.get()); - const int n_ctx = llama_n_ctx(ctx.get()); - const int n_ctx_used = llama_kv_cache_used_cells(kv); + const int n_ctx_used = llama_kv_self_used_cells(ctx.get()); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); printe("context size exceeded\n"); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 3839fbe8c..77b1572a9 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -156,8 +156,6 @@ int main(int argc, char ** argv) { // make new context llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params)); - llama_kv_cache * kv3 = llama_get_kv_cache(ctx3); - llama_sampler * smpl3 = llama_sampler_chain_init(sparams); llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed)); @@ -198,7 +196,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); // erase whole kv - llama_kv_cache_clear(kv3); + llama_kv_self_clear(ctx3); fprintf(stderr, "%s : kv cache cleared\n", __func__); // restore kv into seq 1 diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 076044d39..b665bde41 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1693,7 +1693,6 @@ struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; - llama_kv_cache * kv = nullptr; const llama_vocab * vocab = nullptr; @@ -1756,8 +1755,6 @@ struct server_context { return false; } - kv = llama_get_kv_cache(ctx); - vocab = llama_model_get_vocab(model); n_ctx = llama_n_ctx(ctx); @@ -2026,7 +2023,7 @@ struct server_context { SRV_DBG("%s", "clearing KV cache\n"); // clear the entire KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); clean_kv_cache = false; } @@ -2568,8 +2565,8 @@ struct server_context { res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); res->t_start = metrics.t_start; - res->kv_cache_tokens_count = llama_kv_cache_n_tokens(kv); - res->kv_cache_used_cells = 
llama_kv_cache_used_cells(kv); + res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx); + res->kv_cache_used_cells = llama_kv_self_used_cells(ctx); res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; res->t_prompt_processing_total = metrics.t_prompt_processing_total; @@ -2685,7 +2682,7 @@ struct server_context { // Erase token cache const size_t n_erased = slot->cache_tokens.size(); - llama_kv_cache_seq_rm(kv, slot->id, -1, -1); + llama_kv_self_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); auto res = std::make_unique(); @@ -2753,8 +2750,8 @@ struct server_context { SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (kv, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(kv, slot.id, n_keep + n_discard, slot.n_past, -n_discard); + llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); + llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -2941,8 +2938,8 @@ struct server_context { const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - llama_kv_cache_seq_rm (kv, slot.id, head_p, head_c); - llama_kv_cache_seq_add(kv, slot.id, head_c, -1, kv_shift); + llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c); + llama_kv_self_seq_add(ctx, slot.id, head_c, -1, kv_shift); for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; @@ -2980,9 +2977,9 @@ struct server_context { } // keep only the common part - if (!llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1)) { + if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) { // could not partially delete (likely using a non-Transformer model) - llama_kv_cache_seq_rm(kv, slot.id, -1, -1); + llama_kv_self_seq_rm(ctx, slot.id, -1, -1); // there is no common part left slot.n_past = 0; @@ -3222,7 +3219,7 @@ struct server_context { slot.cache_tokens.push_back(id); slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); - llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1); + llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1); for (size_t i = 0; i < ids.size(); ++i) { completion_token_output result; diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index 130e326b5..84f415973 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -88,8 +88,6 @@ int main(int argc, char ** argv) { return 1; } - const llama_kv_cache * kv = llama_get_kv_cache(ctx); - // initialize the sampler llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params()); llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1)); @@ -100,7 +98,7 @@ int main(int argc, char ** argv) { auto generate = [&](const std::string & prompt) { std::string response; - const bool is_first = llama_kv_cache_used_cells(kv) == 0; + const bool is_first = llama_kv_self_used_cells(ctx) == 0; // tokenize the prompt const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); @@ -115,7 +113,7 @@ int main(int argc, char ** argv) { while (true) { // check if we have enough space in the context to evaluate this batch int n_ctx = llama_n_ctx(ctx); - int n_ctx_used = llama_kv_cache_used_cells(kv); + int n_ctx_used = llama_kv_self_used_cells(ctx); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); 
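// Illustrative sketch, not part of this diff: a caller that prefers to keep
// generating instead of erroring out here could reclaim space with the same
// llama_kv_self_* calls the server's context shift uses above. n_keep and
// n_discard are made-up values, and sequence 0 is assumed to be the only
// sequence in use:

                const int n_keep    = 32;               // tokens to preserve at the start of the context
                const int n_discard = batch.n_tokens;   // number of cells to reclaim

                llama_kv_self_seq_rm (ctx, 0, n_keep, n_keep + n_discard);
                llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, -1, -n_discard);

                n_ctx_used = llama_kv_self_used_cells(ctx);   // re-check usage after the shift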
fprintf(stderr, "context size exceeded\n"); diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 24bdc806d..a5d2bc9d0 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -45,8 +45,6 @@ int main(int argc, char ** argv) { model_tgt = llama_init_tgt.model.get(); ctx_tgt = llama_init_tgt.context.get(); - llama_kv_cache * kv = llama_get_kv_cache(ctx_tgt); - const llama_vocab * vocab = llama_model_get_vocab(model_tgt); // load the draft model @@ -219,7 +217,7 @@ int main(int argc, char ** argv) { { LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past); - llama_kv_cache_seq_rm(kv, 0, n_past, -1); + llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1); } if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index b4e5259b5..bfddc67e0 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -90,9 +90,6 @@ int main(int argc, char ** argv) { model_dft = llama_init_dft.model.get(); ctx_dft = llama_init_dft.context.get(); - llama_kv_cache * kv_tgt = llama_get_kv_cache(ctx_tgt); - llama_kv_cache * kv_dft = llama_get_kv_cache(ctx_dft); - const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt); const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft); @@ -423,14 +420,14 @@ int main(int argc, char ** argv) { { LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); - llama_kv_cache_seq_keep(kv_dft, s_keep); - llama_kv_cache_seq_cp (kv_dft, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(kv_dft, 0); + llama_kv_self_seq_keep(ctx_dft, s_keep); + llama_kv_self_seq_cp (ctx_dft, s_keep, 0, -1, -1); + llama_kv_self_seq_keep(ctx_dft, 0); - llama_kv_cache_seq_rm (kv_tgt, s_keep, n_past_tgt, -1); - llama_kv_cache_seq_keep(kv_tgt, s_keep); - llama_kv_cache_seq_cp (kv_tgt, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(kv_tgt, 0); + llama_kv_self_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1); + llama_kv_self_seq_keep(ctx_tgt, s_keep); + llama_kv_self_seq_cp (ctx_tgt, s_keep, 0, -1, -1); + llama_kv_self_seq_keep(ctx_tgt, 0); } for (int s = 0; s < n_seq_dft; ++s) { @@ -447,8 +444,8 @@ int main(int argc, char ** argv) { common_batch_clear(batch_dft); common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); - llama_kv_cache_seq_rm(kv_dft, 0, n_past_dft, -1); - // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(kv_dft, batch_dft).c_str()); + llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1); + // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); llama_decode(ctx_dft, batch_dft); ++n_past_dft; @@ -506,8 +503,8 @@ int main(int argc, char ** argv) { if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) { LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur); - llama_kv_cache_seq_rm(kv_dft, n_seq_cur, -1, -1); - llama_kv_cache_seq_cp(kv_dft, s, n_seq_cur, -1, -1); + llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1); + llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); // all previous tokens from this branch are now also part of the new branch for (int t = 0; t < batch_tgt.n_tokens; ++t) { @@ -588,9 +585,9 @@ int main(int argc, char ** argv) { // evaluate the target model on the drafted tokens { - llama_kv_cache_seq_keep(kv_tgt, 0); + llama_kv_self_seq_keep(ctx_tgt, 0); for (int s = 1; s < n_seq_dft; ++s) { - 
llama_kv_cache_seq_cp(kv_tgt, 0, s, -1, -1); + llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1); } // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); diff --git a/include/llama.h b/include/llama.h index 08b8658ad..91300b1ae 100644 --- a/include/llama.h +++ b/include/llama.h @@ -469,7 +469,7 @@ extern "C" { DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead"); LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); // TODO: remove const? - LLAMA_API struct llama_kv_cache * llama_get_kv_cache( struct llama_context * ctx); + LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx); LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); @@ -641,28 +641,28 @@ extern "C" { // Returns the number of tokens in the KV cache (slow, use only for debug) // If a KV cell has multiple sequences assigned to it, it will be counted multiple times - LLAMA_API int32_t llama_kv_cache_n_tokens(const struct llama_kv_cache * kv); + LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx); DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx), - "use llama_kv_cache_n_tokens instead"); + "use llama_kv_self_n_tokens instead"); // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) - LLAMA_API int32_t llama_kv_cache_used_cells(const struct llama_kv_cache * kv); + LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx); DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx), - "use llama_kv_cache_used_cells instead"); + "use llama_kv_self_used_cells instead"); // Clear the KV cache - both cell info is erased and KV data is zeroed - LLAMA_API void llama_kv_cache_clear( - struct llama_kv_cache * kv); + LLAMA_API void llama_kv_self_clear( + struct llama_context * ctx); // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) // Returns false if a partial sequence cannot be removed. 
Removing a whole sequence never fails // seq_id < 0 : match any sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API bool llama_kv_cache_seq_rm( - struct llama_kv_cache * kv, + LLAMA_API bool llama_kv_self_seq_rm( + struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1); @@ -671,26 +671,26 @@ extern "C" { // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_cp( - struct llama_kv_cache * kv, + LLAMA_API void llama_kv_self_seq_cp( + struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); // Removes all tokens that do not belong to the specified sequence - LLAMA_API void llama_kv_cache_seq_keep( - struct llama_kv_cache * kv, + LLAMA_API void llama_kv_self_seq_keep( + struct llama_context * ctx, llama_seq_id seq_id); // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_update() + // - explicitly with llama_kv_self_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_add( - struct llama_kv_cache * kv, + LLAMA_API void llama_kv_self_seq_add( + struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, @@ -699,32 +699,87 @@ extern "C" { // Integer division of the positions by factor of `d > 1` // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_update() + // - explicitly with llama_kv_self_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_div( - struct llama_kv_cache * kv, + LLAMA_API void llama_kv_self_seq_div( + struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); // Returns the largest position present in the KV cache for the specified sequence - LLAMA_API llama_pos llama_kv_cache_seq_pos_max( - struct llama_kv_cache * kv, + LLAMA_API llama_pos llama_kv_self_seq_pos_max( + struct llama_context * ctx, llama_seq_id seq_id); // Defragment the KV cache // This will be applied: // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_update() - LLAMA_API void llama_kv_cache_defrag(struct llama_kv_cache * kv); + // - explicitly with llama_kv_self_update() + LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx); // Check if the context supports KV cache shifting - LLAMA_API bool llama_kv_cache_can_shift(const struct llama_kv_cache * kv); + LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx); // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) 
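// The declarations below finish the rename: the cache operations now take the
// llama_context directly, and the old llama_kv_cache_* names remain only as
// DEPRECATED forwarders. A minimal migration sketch (illustration only, not part
// of this diff; `ctx` is a valid llama_context * and `n_past` is an assumed
// caller-side position):

    // before (the pattern being removed in this patch): fetch the cache object first
    //   llama_kv_cache * kv = llama_get_kv_cache(ctx);
    //   llama_kv_cache_clear(kv);
    //   llama_kv_cache_seq_rm(kv, 0, n_past, -1);

    // after: operate on the context's own ("self") KV cache
    llama_kv_self_clear (ctx);
    llama_kv_self_seq_rm(ctx, 0, n_past, -1);   // drop everything past n_past in sequence 0
    llama_kv_self_seq_cp(ctx, 0, 1, -1, -1);    // mirror sequence 0 into sequence 1 over the full range
    llama_kv_self_update(ctx);                  // apply any pending K-shift/defragmentation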
- LLAMA_API void llama_update_kv_cache(struct llama_context * ctx, struct llama_kv_cache * kv); + LLAMA_API void llama_kv_self_update(struct llama_context * ctx); + + DEPRECATED(LLAMA_API void llama_kv_cache_clear( + struct llama_context * ctx), + "use llama_kv_self_clear instead"); + + DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1), + "use llama_kv_self_seq_rm instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp( + struct llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1), + "use llama_kv_self_seq_cp instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep( + struct llama_context * ctx, + llama_seq_id seq_id), + "use llama_kv_self_seq_keep instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_seq_add( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta), + "use llama_kv_self_seq_add instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_seq_div( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d), + "use llama_kv_self_seq_div instead"); + + DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max( + struct llama_context * ctx, + llama_seq_id seq_id), + "use llama_kv_self_seq_pos_max instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx), + "use llama_kv_self_defrag instead"); + + DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx), + "use llama_kv_self_can_shift instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx), + "use llama_kv_self_update instead"); + // // State / sessions diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 0e146652c..0004e214b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -606,7 +606,7 @@ const llama_model * llama_get_model(const llama_context * ctx) { return &ctx->model; } -llama_kv_cache * llama_get_kv_cache(llama_context * ctx) { +llama_kv_cache * llama_get_kv_self(llama_context * ctx) { return &ctx->kv_self; } @@ -1147,14 +1147,14 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da data_ctx.write_embeddings(ctx); llama_kv_cache::io io = { - /* .write =*/ [&](const void * src, size_t size) { + /* .write = */ [&](const void * src, size_t size) { data_ctx.write(src, size); }, - /* .write_tensor_data =*/ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { data_ctx.write_tensor_data(tensor, offset, size); }, - /* .read =*/ nullptr, - /* .read_to =*/ nullptr, + /* .read = */ nullptr, + /* .read_to = */ nullptr, }; ctx->kv_self.state_write(io, ctx->model.hparams); @@ -1195,12 +1195,12 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da data_ctx.read_embeddings(ctx); llama_kv_cache::io io = { - /* .write =*/ nullptr, - /* .write_tensor_data =*/ nullptr, - /* .read =*/ [&](size_t size) { + /* .write = */ nullptr, + /* .write_tensor_data = */ nullptr, + /* .read = */ [&](size_t size) { return data_ctx.read(size); }, - /* .read_to =*/ [&](void * dst, size_t size) { + /* .read_to = */ [&](void * dst, size_t size) { data_ctx.read_to(dst, size); }, }; @@ -1302,14 +1302,14 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam llama_synchronize(ctx); llama_kv_cache::io io = { - /* 
.write =*/ [&](const void * src, size_t size) { + /* .write = */ [&](const void * src, size_t size) { data_ctx.write(src, size); }, - /* .write_tensor_data =*/ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { data_ctx.write_tensor_data(tensor, offset, size); }, - /* .read =*/ nullptr, - /* .read_to =*/ nullptr, + /* .read = */ nullptr, + /* .read_to = */ nullptr, }; ctx->kv_self.state_write(io, ctx->model.hparams, seq_id); @@ -1336,12 +1336,12 @@ static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llam llama_synchronize(ctx); llama_kv_cache::io io = { - /* .write =*/ nullptr, - /* .write_tensor_data =*/ nullptr, - /* .read =*/ [&](size_t size) { + /* .write = */ nullptr, + /* .write_tensor_data = */ nullptr, + /* .read = */ [&](size_t size) { return data_ctx.read(size); }, - /* .read_to =*/ [&](void * dst, size_t size) { + /* .read_to = */ [&](void * dst, size_t size) { data_ctx.read_to(dst, size); }, }; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 6886d24f0..d2b81a022 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1072,7 +1072,17 @@ bool llama_kv_cache::state_read_data(const io & io, const llama_hparams & hparam return true; } -///////////// +// +// interface implementation +// + +int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { + return kv->n_tokens(); +} + +int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { + return kv->used; +} void llama_kv_cache_clear(llama_kv_cache * kv) { kv->clear(); @@ -1125,14 +1135,6 @@ void llama_kv_cache_defrag(llama_kv_cache * kv) { kv->defrag(); } -int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { - return kv->n_tokens(); -} - -int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { - return kv->used; -} - bool llama_kv_cache_can_shift(const llama_kv_cache * kv) { return kv->can_shift; } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 0384a2b7c..2e021d4ed 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -190,6 +190,48 @@ struct llama_kv_slot_restorer { } }; +// TODO: maybe become part of the public llama_kv_cache in the future +int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv); + +int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv); + +void llama_kv_cache_clear(llama_kv_cache * kv); + +bool llama_kv_cache_seq_rm( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1); + +void llama_kv_cache_seq_cp( + llama_kv_cache * kv, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1); + +void llama_kv_cache_seq_keep(llama_kv_cache * kv, llama_seq_id seq_id); + +void llama_kv_cache_seq_add( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta); + +void llama_kv_cache_seq_div( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d); + +llama_pos llama_kv_cache_seq_pos_max(llama_kv_cache * kv, llama_seq_id seq_id); + +void llama_kv_cache_defrag(llama_kv_cache * kv); + +bool llama_kv_cache_can_shift(const llama_kv_cache * kv); + // // kv cache view // diff --git a/src/llama.cpp b/src/llama.cpp index 0227ba6b3..b8f404375 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8564,7 +8564,7 @@ static int llama_decode_impl( // non-causal masks do not use the KV cache if (hparams.causal_attn) { - llama_update_kv_cache(&lctx, &lctx.kv_self); // TODO: lctx->update_kv_cache() + 
llama_kv_self_update(&lctx); // TODO: lctx->kv_self_update() // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -9182,9 +9182,12 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); } -static void llama_update_kv_cache_impl(llama_context & lctx, llama_kv_cache & kv) { +// TODO: move to llama_context +static void llama_kv_self_update_impl(llama_context & lctx) { bool need_reserve = false; + auto & kv = lctx.kv_self; + if (kv.has_shift) { if (!kv.can_shift) { GGML_ABORT("The current context does not support K-shift"); @@ -9856,17 +9859,151 @@ void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * // deprecated int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { + return llama_kv_self_n_tokens(ctx); +} + +int32_t llama_kv_self_n_tokens(const llama_context * ctx) { return llama_kv_cache_n_tokens(&ctx->kv_self); } // deprecated int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { + return llama_kv_self_used_cells(ctx); +} + +int32_t llama_kv_self_used_cells(const llama_context * ctx) { return llama_kv_cache_used_cells(&ctx->kv_self); } +// deprecated +void llama_kv_cache_clear(llama_context * ctx) { + llama_kv_self_clear(ctx); +} + +void llama_kv_self_clear(llama_context * ctx) { + llama_kv_cache_clear(&ctx->kv_self); +} + +// deprecated +bool llama_kv_cache_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_rm(ctx, seq_id, p0, p1); +} + +bool llama_kv_self_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_rm(&ctx->kv_self, seq_id, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); +} + +void llama_kv_self_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_cp(&ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_keep( + llama_context * ctx, + llama_seq_id seq_id) { + return llama_kv_self_seq_keep(ctx, seq_id); +} + +void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_keep(&ctx->kv_self, seq_id); +} + +// deprecated +void llama_kv_cache_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); +} + +void llama_kv_self_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_cache_seq_add(&ctx->kv_self, seq_id, p0, p1, delta); +} + +// deprecated +void llama_kv_cache_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); +} + +void llama_kv_self_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_cache_seq_div(&ctx->kv_self, seq_id, p0, p1, d); +} + +// deprecated +llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_self_seq_pos_max(ctx, seq_id); +} + +llama_pos 
llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_pos_max(&ctx->kv_self, seq_id); +} + +// deprecated +void llama_kv_cache_defrag(llama_context * ctx) { + return llama_kv_self_defrag(ctx); +} + +void llama_kv_self_defrag(llama_context * ctx) { + return llama_kv_cache_defrag(&ctx->kv_self); +} + +// deprecated +bool llama_kv_cache_can_shift(const llama_context * ctx) { + return llama_kv_self_can_shift(ctx); +} + +bool llama_kv_self_can_shift(const llama_context * ctx) { + return llama_kv_cache_can_shift(&ctx->kv_self); +} + +// deprecated +void llama_kv_cache_update(llama_context * ctx) { + llama_kv_self_update(ctx); +} + // TODO: move to llama-context -void llama_update_kv_cache(llama_context * ctx, llama_kv_cache * kv) { - llama_update_kv_cache_impl(*ctx, *kv); +void llama_kv_self_update(llama_context * ctx) { + llama_kv_self_update_impl(*ctx); } /// From a19f671fe078497f73ec1898951475e026ffdc20 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 15 Jan 2025 10:54:21 +0200 Subject: [PATCH 09/28] context : minor ggml-ci --- src/llama-context.cpp | 36 +++++++++++------------------------- src/llama-context.h | 8 +++----- src/llama-kv-cache.cpp | 1 + src/llama-kv-cache.h | 6 +++--- src/llama.cpp | 33 +++++++++++++++++++-------------- 5 files changed, 37 insertions(+), 47 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 0004e214b..9eae6fe57 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -8,30 +8,6 @@ #include #include -void llama_set_k_shift(struct llama_context & lctx) { - const int64_t kv_size = lctx.kv_self.size; - - assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); - - int32_t * data = (int32_t *) lctx.inp_K_shift->data; - - for (int i = 0; i < kv_size; ++i) { - data[i] = lctx.kv_self.cells[i].delta; - } -} - -void llama_set_s_copy(struct llama_context & lctx) { - const int64_t kv_size = lctx.kv_self.size; - - assert(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer)); - - int32_t * data = (int32_t *) lctx.inp_s_copy->data; - - for (int i = 0; i < kv_size; ++i) { - data[i] = lctx.kv_self.cells[i].src; - } -} - // llama input static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { @@ -58,6 +34,16 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +void llama_context::set_k_shift(llama_kv_cache & kv) { + assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + + int32_t * data = (int32_t *) inp_K_shift->data; + + for (uint32_t i = 0; i < kv.size; ++i) { + data[i] = kv.cells[i].delta; + } +} + void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { // // set input data @@ -134,7 +120,6 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - float * data = nullptr; float * data_swa = nullptr; @@ -599,6 +584,7 @@ uint32_t llama_n_ubatch(const struct llama_context * ctx) { } uint32_t llama_n_seq_max(const struct llama_context * ctx) { + // TODO: add notion of n_seq_max to llama_kv_cache and use it here return ctx->kv_self.size; } diff --git a/src/llama-context.h b/src/llama-context.h index a9268b292..73baa711f 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -18,7 +18,7 @@ struct llama_context { llama_context(const llama_model & model) : model(model) , t_start_us(model.t_start_us) - , t_load_us(model.t_load_us) {} + , 
t_load_us (model.t_load_us) {} const struct llama_model & model; @@ -107,13 +107,11 @@ struct llama_context { struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] + + void set_k_shift(llama_kv_cache & kv); }; // TODO: make these methods of llama_context -void llama_set_k_shift(struct llama_context & lctx); - -void llama_set_s_copy(struct llama_context & lctx); - void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch); // Make sure enough space is available for outputs. diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index d2b81a022..b79c2ff93 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -6,6 +6,7 @@ #include "llama-model.h" #include +#include #include #include #include diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 2e021d4ed..5ffee6281 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -88,11 +88,11 @@ struct llama_kv_cache { void clear(); - bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1); + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1); void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); void seq_keep(llama_seq_id seq_id); - void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); - void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); + void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); llama_pos seq_pos_max(llama_seq_id seq_id); diff --git a/src/llama.cpp b/src/llama.cpp index b8f404375..3e1cd8260 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1142,18 +1142,18 @@ struct llm_build_context { ctx0 = ggml_init(params); - lctx.inp_tokens = nullptr; - lctx.inp_embd = nullptr; - lctx.inp_pos = nullptr; - lctx.inp_out_ids = nullptr; - lctx.inp_KQ_mask = nullptr; - lctx.inp_KQ_mask_swa = nullptr; - lctx.inp_K_shift = nullptr; - lctx.inp_mean = nullptr; - lctx.inp_cls = nullptr; - lctx.inp_s_copy = nullptr; - lctx.inp_s_mask = nullptr; - lctx.inp_s_seq = nullptr; + lctx.inp_tokens = nullptr; + lctx.inp_embd = nullptr; + lctx.inp_pos = nullptr; + lctx.inp_out_ids = nullptr; + lctx.inp_KQ_mask = nullptr; + lctx.inp_KQ_mask_swa = nullptr; + lctx.inp_K_shift = nullptr; + lctx.inp_mean = nullptr; + lctx.inp_cls = nullptr; + lctx.inp_s_copy = nullptr; + lctx.inp_s_mask = nullptr; + lctx.inp_s_seq = nullptr; lctx.inp_pos_bucket = nullptr; lctx.inp_embd_enc = nullptr; lctx.inp_KQ_mask_cross = nullptr; @@ -1174,9 +1174,11 @@ struct llm_build_context { ggml_set_input(lctx.inp_K_shift); for (int il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_head_kv = hparams.n_head_kv(il); const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * k = ggml_view_3d(ctx0, kv_self.k_l[il], n_embd_head_k, n_head_kv, n_ctx, @@ -1189,6 +1191,7 @@ struct llm_build_context { // dequantize to f32 -> RoPE -> quantize back tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); cb(tmp, "K_f32", il); + for (auto & backend : lctx.backends) { // Figure out which backend KV cache belongs to if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { @@ -1200,6 +1203,7 @@ struct llm_build_context { lctx.inp_K_shift, rope_factors, 
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(tmp, "K_shifted_f32", il); + tmp = ggml_cpy(ctx0, tmp, k); } else { // we rotate only the first n_rot dimensions @@ -1208,6 +1212,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); } cb(tmp, "K_shifted", il); + ggml_build_forward_expand(gf, tmp); } @@ -9201,7 +9206,7 @@ static void llama_kv_self_update_impl(llama_context & lctx) { ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - llama_set_k_shift(lctx); + lctx.set_k_shift(kv); llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); From ae274f9747cce6ba6b4910d05ddc3016cd0b4e21 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 15 Jan 2025 13:35:56 +0200 Subject: [PATCH 10/28] llama : fix names [no ci] --- src/llama.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 3e1cd8260..37816ddc2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1373,9 +1373,9 @@ struct llm_build_context { inp = ggml_graph_node(gf, i); if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { break; - } else { - inp = nullptr; } + + inp = nullptr; } GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor"); @@ -1431,7 +1431,7 @@ struct llm_build_context { return gf; } - struct ggml_tensor * llm_build_pos_bucket(bool causal) { + struct ggml_tensor * build_pos_bucket(bool causal) { if (causal) { lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); } else { @@ -1444,7 +1444,7 @@ struct llm_build_context { return lctx.inp_pos_bucket; } - struct ggml_tensor * llm_build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { + struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); cb(pos_bucket_1d, "pos_bucket_1d", -1); @@ -1463,7 +1463,7 @@ struct llm_build_context { return pos_bias; } - struct ggml_tensor * llm_build_inp_embd_enc() { + struct ggml_tensor * build_inp_embd_enc() { const int64_t n_embd = hparams.n_embd; lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); ggml_set_input(lctx.inp_embd_enc); @@ -1471,7 +1471,7 @@ struct llm_build_context { return lctx.inp_embd_enc; } - struct ggml_tensor * llm_build_inp_KQ_mask_cross() { + struct ggml_tensor * build_inp_KQ_mask_cross() { lctx.inp_KQ_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); ggml_set_input(lctx.inp_KQ_mask_cross); cb(lctx.inp_KQ_mask_cross, "KQ_mask_cross", -1); @@ -6775,7 +6775,7 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); GGML_ASSERT(lctx.is_encoding); - struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false); + struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); @@ -6810,7 +6810,7 @@ struct llm_build_context { cb(kq, "kq", il); struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? 
model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; - struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b); + struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); cb(kq_b, "kq_b", il); @@ -6909,11 +6909,11 @@ struct llm_build_context { GGML_ASSERT(!lctx.is_encoding); GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); - struct ggml_tensor * embd_enc = llm_build_inp_embd_enc(); - struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true); + struct ggml_tensor * embd_enc = build_inp_embd_enc(); + struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross(); + struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6961,7 +6961,7 @@ struct llm_build_context { cb(kq, "kq", il); struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; - struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b); + struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); cb(kq_b, "kq_b", il); From f2524c0e4137a4327473c086f97a01aa0632ca3e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 16 Jan 2025 15:04:14 +0200 Subject: [PATCH 11/28] llama : remove references to llama_kv_cache (wip) Intermediate step necessary to abstract the `llama_context` and `llama_kv_cache`. ggml-ci --- src/llama-context.cpp | 1035 +++++++++- src/llama-context.h | 162 +- src/llama.cpp | 4568 +++++++++++++++++------------------------ 3 files changed, 2982 insertions(+), 2783 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 9eae6fe57..910e2243d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -8,8 +8,6 @@ #include #include -// llama input - static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { // TODO move to hparams if a T5 variant appears that uses a different value const int64_t max_distance = 128; @@ -34,56 +32,88 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } -void llama_context::set_k_shift(llama_kv_cache & kv) { - assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); - - int32_t * data = (int32_t *) inp_K_shift->data; - - for (uint32_t i = 0; i < kv.size; ++i) { - data[i] = kv.cells[i].delta; - } +// TODO: improve +void llama_context::reset() { + inp_tokens = nullptr; + inp_embd = nullptr; + inp_pos = nullptr; + inp_out_ids = nullptr; + inp_mean = nullptr; + inp_cls = nullptr; + inp_embd_enc = nullptr; + inp_pos_bucket = nullptr; + inp_KQ_mask = nullptr; + inp_KQ_mask_cnv = nullptr; + inp_KQ_mask_swa = nullptr; + inp_KQ_mask_swa_cnv = nullptr; + inp_KQ_mask_cross = nullptr; + inp_K_shift = nullptr; + inp_s_copy = nullptr; + inp_s_mask = nullptr; } -void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { +void llama_context::prepare_k_shift() { +} + +void llama_context::prepare_defrag() { +} + +void llama_context::prepare_decode(const llama_ubatch & /*ubatch*/) { +} + +// llama input + +void llama_context::set_inputs(const llama_ubatch & ubatch) { + const llama_hparams & hparams = model.hparams; + // // set input data // - const auto & 
hparams = lctx.model.hparams; - const auto & cparams = lctx.cparams; - const auto & kv_self = lctx.kv_self; + if (inp_K_shift) { + assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + + int32_t * data = (int32_t *) inp_K_shift->data; + + for (uint32_t i = 0; i < kv_self.size; ++i) { + data[i] = kv_self.cells[i].delta; + } + + // the K-shift graph requires just this input + return; + } if (ubatch.token) { const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(lctx.inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens)); + ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); } if (ubatch.embd) { const int64_t n_embd = hparams.n_embd; const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd)); + ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); } - if (ubatch.pos && lctx.inp_pos) { + if (ubatch.pos && inp_pos) { const int64_t n_tokens = ubatch.n_tokens; - auto n_pos = lctx.n_pos_per_token; - ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos)); + auto n_pos = n_pos_per_token; + ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(inp_pos)); } if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs"); + //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); - if (!lctx.inp_out_ids) { - LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__); + if (!inp_out_ids) { + LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); } else { const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer)); - int32_t * data = (int32_t *) lctx.inp_out_ids->data; + GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); + int32_t * data = (int32_t *) inp_out_ids->data; - if (lctx.n_outputs == n_tokens) { + if (n_outputs == n_tokens) { for (int i = 0; i < n_tokens; ++i) { data[i] = i; } @@ -95,26 +125,26 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(lctx.n_outputs == n_outputs); - } else if (lctx.n_outputs == 1) { + GGML_ASSERT(n_outputs == n_outputs); + } else if (n_outputs == 1) { // only keep last output data[0] = n_tokens - 1; } else { - GGML_ASSERT(lctx.n_outputs == 0); + GGML_ASSERT(n_outputs == 0); } } } GGML_ASSERT( - // (!a || b) is a logical implication (a -> b) - // !hparams.causal_attn -> !cparams.causal_attn - (hparams.causal_attn || !cparams.causal_attn) && - "causal attention is not supported by this model" - ); + // (!a || b) is a logical implication (a -> b) + // !hparams.causal_attn -> !cparams.causal_attn + (hparams.causal_attn || !cparams.causal_attn) && + "causal attention is not supported by this model" + ); - if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) { + if (inp_KQ_mask || inp_KQ_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. 
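// Simplified restatement of the rule the causal branch below implements, for
// orientation only (the real loop also handles the ALiBi bias and the padded
// tail of the mask); i indexes KV cells, j indexes the query tokens of the
// current ubatch, and seq_id is the sequence of token j:

    const bool visible = kv_self.cells[i].has_seq_id(seq_id)    // cell belongs to the same sequence
                      && kv_self.cells[i].pos <= ubatch.pos[j]; // and is not in the future
    const float f = visible ? 0.0f : -INFINITY;                 // value written into inp_KQ_mask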
- if (cparams.causal_attn && !lctx.is_encoding) { + if (cparams.causal_attn && !is_encoding) { const int64_t n_kv = kv_self.n; const int64_t n_tokens = ubatch.n_tokens; const int64_t n_seq_tokens = ubatch.n_seq_tokens; @@ -123,14 +153,14 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { float * data = nullptr; float * data_swa = nullptr; - if (lctx.inp_KQ_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); - data = (float *) lctx.inp_KQ_mask->data; + if (inp_KQ_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask->buffer)); + data = (float *) inp_KQ_mask->data; } - if (lctx.inp_KQ_mask_swa) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer)); - data_swa = (float *) lctx.inp_KQ_mask_swa->data; + if (inp_KQ_mask_swa) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask_swa->buffer)); + data_swa = (float *) inp_KQ_mask_swa->data; } // For causal attention, use only the previous KV cells @@ -191,11 +221,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; // when using kv cache, the mask needs to match the kv cache size - const int64_t n_stride = hparams.causal_attn && !lctx.is_encoding ? kv_self.n : n_tokens; + const int64_t n_stride = hparams.causal_attn && !is_encoding ? kv_self.n : n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask->buffer)); - float * data = (float *) lctx.inp_KQ_mask->data; + float * data = (float *) inp_KQ_mask->data; for (int h = 0; h < 1; ++h) { for (int s1 = 0; s1 < n_seqs; ++s1) { @@ -238,11 +268,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(lctx.inp_mean); - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); + GGML_ASSERT(inp_mean); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); - float * data = (float *) lctx.inp_mean->data; - memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean)); + float * data = (float *) inp_mean->data; + memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); std::vector sum(n_tokens, 0); @@ -279,11 +309,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(lctx.inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - uint32_t * data = (uint32_t *) lctx.inp_cls->data; - memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls)); + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); for (int s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch.seq_id[s][0]; @@ -306,11 +336,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(lctx.inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - uint32_t * data = (uint32_t *) lctx.inp_cls->data; - memset(lctx.inp_cls->data, 0, n_tokens * 
ggml_element_size(lctx.inp_cls)); + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); std::vector last_pos(n_tokens, -1); std::vector last_row(n_tokens, -1); @@ -341,17 +371,18 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { if (kv_self.recurrent) { const int64_t n_kv = kv_self.n; - if (lctx.inp_s_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer)); - float * data = (float *) lctx.inp_s_mask->data; + if (inp_s_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_mask->buffer)); + float * data = (float *) inp_s_mask->data; // clear unused states for (int i = 0; i < n_kv; ++i) { const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id]; + llama_kv_cell & kv_cell = kv_self.cells[cell_id]; data[i] = (float) (kv_cell.src >= 0); + // TODO: do not mutate the KV cache // only clear once if (kv_cell.src < 0) { kv_cell.src = cell_id; @@ -359,14 +390,14 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } - if (lctx.inp_s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer)); - int32_t * data = (int32_t *) lctx.inp_s_copy->data; + if (inp_s_copy) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_copy->buffer)); + int32_t * data = (int32_t *) inp_s_copy->data; // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_kv; ++i) { const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id]; + llama_kv_cell & kv_cell = kv_self.cells[cell_id]; // prevent out-of-bound sources if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self.size) { @@ -375,6 +406,7 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { data[i] = kv_cell.src; + // TODO: do not mutate the KV cache // ensure copy only happens once if (kv_cell.src != (int32_t) cell_id) { kv_cell.src = cell_id; @@ -383,20 +415,20 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } - if (lctx.inp_pos_bucket) { + if (inp_pos_bucket) { const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_pos_bucket->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_pos_bucket->buffer)); GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - int32_t * data = (int32_t *) lctx.inp_pos_bucket->data; + int32_t * data = (int32_t *) inp_pos_bucket->data; - if (!lctx.is_encoding) { + if (!is_encoding) { const int64_t n_kv = kv_self.n; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_kv; ++i) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding); + data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); } } } @@ -404,28 +436,28 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_tokens; ++i) { - data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding); + data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); } } } } } - if 
(!lctx.is_encoding && lctx.inp_embd_enc) { - assert(lctx.inp_embd_enc->type == GGML_TYPE_F32); - assert((size_t) ggml_nelements(lctx.inp_embd_enc) == lctx.embd_enc.size()); + if (!is_encoding && inp_embd_enc) { + assert(inp_embd_enc->type == GGML_TYPE_F32); + assert((size_t) ggml_nelements(inp_embd_enc) == embd_enc.size()); - ggml_backend_tensor_set(lctx.inp_embd_enc, lctx.embd_enc.data(), 0, ggml_nbytes(lctx.inp_embd_enc)); + ggml_backend_tensor_set(inp_embd_enc, embd_enc.data(), 0, ggml_nbytes(inp_embd_enc)); } - if (!lctx.is_encoding && lctx.inp_KQ_mask_cross) { - const int64_t n_output_enc = lctx.embd_enc.size() / hparams.n_embd; + if (!is_encoding && inp_KQ_mask_cross) { + const int64_t n_output_enc = embd_enc.size() / hparams.n_embd; const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_cross->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask_cross->buffer)); GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - float * data = (float *) lctx.inp_KQ_mask_cross->data; + float * data = (float *) inp_KQ_mask_cross->data; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { @@ -433,7 +465,7 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { float f = -INFINITY; for (int s = 0; s < ubatch.n_seq_id[j]; ++s) { const llama_seq_id seq_id = ubatch.seq_id[j][s]; - if (lctx.seq_ids_enc[i].find(seq_id) != lctx.seq_ids_enc[i].end()) { + if (seq_ids_enc[i].find(seq_id) != seq_ids_enc[i].end()) { f = 0.0f; } } @@ -450,6 +482,851 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } +// do mat_mul, while optionally apply lora +ggml_tensor * llama_context::build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * ab_cur = ggml_mul_mat( + ctx0, lw->b, + ggml_mul_mat(ctx0, lw->a, cur) + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; +} + +// do mat_mul_id, while optionally apply lora +ggml_tensor * llama_context::build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur, + ggml_tensor * ids) { + struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float alpha = lora.first->alpha; + const float rank = (float) lw->b->ne[0]; + const float scale = alpha ? lora.second * alpha / rank : lora.second; + + struct ggml_tensor * ab_cur = ggml_mul_mat_id( + ctx0, lw->b, + ggml_mul_mat_id(ctx0, lw->a, cur, ids), + ids + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; +} + +void llama_context::build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + + inp_KQ_mask = causal + ? 
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) + : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + //cb(inp_KQ_mask, "KQ_mask", -1); + ggml_set_input(inp_KQ_mask); + + inp_KQ_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_KQ_mask, GGML_TYPE_F16) : inp_KQ_mask; + + if (swa) { + GGML_ASSERT(hparams.n_swa > 0); + + inp_KQ_mask_swa = causal + ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) + : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + //cb(inp_KQ_mask_swa, "KQ_mask_swa", -1); + ggml_set_input(inp_KQ_mask_swa); + + inp_KQ_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_KQ_mask_swa, GGML_TYPE_F16) : inp_KQ_mask_swa; + } +} + +void llama_context::build_attn_kv_store( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + int64_t il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto & n_ctx = cparams.n_ctx; + + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + GGML_ASSERT(kv_self.size == n_ctx); + + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa)*kv_head); + //cb(k_cache_view, "k_cache_view", il); + + // note: storing RoPE-ed version of K in the KV cache + ggml_build_forward_expand(graph, ggml_cpy(ctx0, k_cur, k_cache_view)); + + assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); + + struct ggml_tensor * v_cache_view = nullptr; + + if (cparams.flash_attn) { + v_cache_view = ggml_view_1d(ctx0, kv_self.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa)*kv_head); + } else { + // note: the V cache is transposed when not using flash attention + v_cache_view = ggml_view_2d(ctx0, kv_self.v_l[il], n_tokens, n_embd_v_gqa, + ( n_ctx)*ggml_element_size(kv_self.v_l[il]), + (kv_head)*ggml_element_size(kv_self.v_l[il])); + + v_cur = ggml_transpose(ctx0, v_cur); + } + //cb(v_cache_view, "v_cache_view", il); + + ggml_build_forward_expand(graph, ggml_cpy(ctx0, v_cur, v_cache_view)); +} + +ggml_tensor * llama_context::build_attn_qkv( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto & n_ctx = cparams.n_ctx; + const auto & n_embd_head_k = hparams.n_embd_head_k; + const auto & n_embd_head_v = hparams.n_embd_head_v; + + // TODO: improve + bool is_sliding = false; + + switch (model.arch) { + case LLM_ARCH_COHERE2: + { + const int32_t sliding_window_pattern = 4; + is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); + } break; + case LLM_ARCH_GEMMA2: + { + const int32_t sliding_window_pattern = 2; + is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); + } break; + case LLM_ARCH_PHI3: + { + is_sliding = hparams.n_swa > 0; + } break; + default: + { + is_sliding = false; + } + }; + + const auto & kq_mask = is_sliding ? inp_KQ_mask_swa_cnv : inp_KQ_mask_cnv; + + const auto n_kv = worst_case ? 
kv_self.size : kv_self.n; + + const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + struct ggml_tensor * k = + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_kv, n_head_kv, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + 0); + //cb(k, "k", il); + + struct ggml_tensor * cur; + + if (cparams.flash_attn) { + GGML_UNUSED(model); + GGML_UNUSED(n_ctx); + + // split cached v into n_head heads (not transposed) + struct ggml_tensor * v = + ggml_view_3d(ctx0, kv_self.v_l[il], + n_embd_head_v, n_kv, n_head_kv, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_head_v), + 0); + //cb(v, "v", il); + + cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, + hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); + + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + + cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens); + } else { + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + //cb(kq, "kq", il); + + // note: this op tends to require high floating point range + // while for some models F16 is enough, for others it is not, so we default to F32 here + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + + if (model.arch == LLM_ARCH_GROK) { + // need to do the following: + // multiply by attn_output_multiplyer of 0.08838834764831845 + // and then : + // kq = 30 * tanh(kq / 30) + // before the softmax below + + kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f)); + kq = ggml_scale(ctx0, kq, 30); + } + + if (hparams.attn_soft_cap) { + kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping); + kq = ggml_tanh(ctx0, kq); + kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping); + } + + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); + //cb(kq, "kq_soft_max_ext", il); + + GGML_ASSERT(kv_self.size == n_ctx); + + // split cached v into n_head heads + struct ggml_tensor * v = + ggml_view_3d(ctx0, kv_self.v_l[il], + n_kv, n_embd_head_v, n_head_kv, + ggml_element_size(kv_self.v_l[il])*n_ctx, + ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + 0); + //cb(v, "v", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + //cb(kqv, "kqv", il); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + //cb(kqv_merged, "kqv_merged", il); + + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); + //cb(cur, "kqv_merged_cont", il); + + if (!cparams.offload_kqv) { + // all nodes between the KV store and the attention output are run on the CPU + ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu); + } + } + + ggml_build_forward_expand(graph, cur); + + if (wo) { + cur = build_lora_mm(ctx0, wo, cur); + } + + if (wo_b) { + //cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + + return cur; +} + +ggml_tensor * llama_context::build_soft_max_ext( + ggml_context * ctx0, + ggml_tensor * kq, + float kq_scale) { + const auto & hparams = model.hparams; + + return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); +} + +ggml_tensor * llama_context::get_rope_factors(int il) { + const auto & hparams = model.hparams; + + // choose 
long/short freq factors based on the context size + const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; + + if (model.layers[il].rope_freqs != nullptr) { + return model.layers[il].rope_freqs; + } + + if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { + return model.layers[il].rope_long; + } + + return model.layers[il].rope_short; +} + +void llama_context::build_k_shift( + ggml_context * ctx0, + ggml_cgraph * graph) { + const auto & n_ctx = cparams.n_ctx; + const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; + const auto & freq_base = cparams.rope_freq_base; + const auto & freq_scale = cparams.rope_freq_scale; + + const auto & yarn_ext_factor = cparams.yarn_ext_factor; + const auto & yarn_attn_factor = cparams.yarn_attn_factor; + const auto & yarn_beta_fast = cparams.yarn_beta_fast; + const auto & yarn_beta_slow = cparams.yarn_beta_slow; + + const auto & hparams = model.hparams; + + const auto & n_rot = hparams.n_rot; + const auto & n_layer = hparams.n_layer; + const auto & rope_type = hparams.rope_type; + + const auto & n_embd_head_k = hparams.n_embd_head_k; + //const auto & n_embd_head_v = hparams.n_embd_head_v; + + GGML_ASSERT(kv_self.size == n_ctx); + + inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + //cb(inp_K_shift, "K_shift", -1); + ggml_set_input(inp_K_shift); + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + + struct ggml_tensor * rope_factors = get_rope_factors(il); + + struct ggml_tensor * k = + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_head_kv, n_ctx, + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + 0); + + struct ggml_tensor * tmp; + if (ggml_is_quantized(k->type)) { + // dequantize to f32 -> RoPE -> quantize back + tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); + //cb(tmp, "K_f32", il); + + for (auto & backend : backends) { + // Figure out which backend KV cache belongs to + if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { + ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); + break; + } + } + tmp = ggml_rope_ext_inplace(ctx0, tmp, + inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + //cb(tmp, "K_shifted_f32", il); + + tmp = ggml_cpy(ctx0, tmp, k); + } else { + // we rotate only the first n_rot dimensions + tmp = ggml_rope_ext_inplace(ctx0, k, + inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + } + //cb(tmp, "K_shifted", il); + + ggml_build_forward_expand(graph, tmp); + } +} + +void llama_context::build_defrag( + ggml_context * ctx0, + ggml_cgraph * graph) { + const auto & hparams = model.hparams; + + const uint32_t n_layer = hparams.n_layer; + + const uint32_t n_kv = kv_self.cell_max(); + const uint32_t n_used = kv_self.used; + + assert(n_used <= n_kv); + + //const int64_t t_start = ggml_time_us(); + + // number of cells moved + uint32_t n_moves = 0; + + // each move requires 6*n_layer tensors (see build_defrag) + // - source view, destination view, copy operation + // - x2 for keys and values + //const uint32_t max_moves = model.max_nodes()/(6*n_layer); + // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 + const uint32_t max_moves = (model.max_nodes() - 
2*n_layer)/(6*n_layer); + + // determine which KV cells to move where + // + // cell i moves to ids[i] + // + // if ids[i] == i || ids[i] == n_kv, then cell i is not moved + // + std::vector ids(n_kv, n_kv); + + for (uint32_t i0 = 0; i0 < n_used; ++i0) { + const auto & cell0 = kv_self.cells[i0]; + + if (!cell0.is_empty()) { + ids[i0] = i0; + + continue; + } + + // found a hole - fill it with data from the end of the cache + + uint32_t nh = 1; + + // determine the size of the hole + while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { + nh++; + } + + uint32_t nf = 0; + uint32_t is = n_kv - 1; + + // starting from the end, find nh non-empty cells + for (; is > i0; --is) { + const auto & cell1 = kv_self.cells[is]; + + if (cell1.is_empty() || ids[is] != n_kv) { + continue; + } + + // non-empty cell which is not yet moved + nf++; + + if (nf == nh) { + break; + } + } + + // this can only happen if `n_used` is not accurate, which would be a bug + GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); + + nf = 0; + + uint32_t i1 = is; + + // are we moving a continuous block of memory? + bool cont = false; + + // should we stop searching for the next move? + bool stop = false; + + // go back and move the nf cells to the hole + for (; i1 < n_kv; ++i1) { + auto & cell1 = kv_self.cells[i1]; + + if (cell1.is_empty() || ids[i1] != n_kv) { + if (n_moves == max_moves) { + stop = true; + break; + } + + cont = false; + continue; + } + + // this cell goes to (i0 + nf) + ids[i1] = i0 + nf; + + // move the cell meta data + kv_self.cells[i0 + nf] = cell1; + + // clear the old cell and move the head there + cell1 = llama_kv_cell(); + kv_self.head = n_used; + + if (!cont) { + n_moves++; + cont = true; + } + + nf++; + + if (nf == nh) { + break; + } + } + + if (stop || n_moves == max_moves) { + break; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); + + i0 += nh - 1; + } + + if (n_moves == 0) { + return; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); + + //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); + +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = kv_self.size; + + std::vector buf_k; + std::vector buf_v; + + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); + + const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size); + + buf_k.resize(k_size); + buf_v.resize(v_size); + + ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); + + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } + + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; + + memcpy(buf_k.data() + od, buf_k.data() + os, 
nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); + } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); + } +#else + for (uint32_t i = 0; i < ids.size(); ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == ids.size()) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < ids.size() && ids[i + nm] == id + nm) { + nm++; + } + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); + + ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); + + ggml_tensor * view_v_src; + ggml_tensor * view_v_dst; + + if (cparams.flash_attn) { + // NOTE: the V cache is not transposed when using flash attention + view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); + + view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); + } else { + view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, i)); + + view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, id)); + } + + ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_k_src, view_k_dst)); + ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_v_src, view_v_dst)); + } + + i += nm - 1; + } + + //LLAMA_LOG_INFO("graph->n_nodes = %d\n", graph->n_nodes); +#endif +} + +ggml_tensor * llama_context::build_inp_s_copy( + ggml_context * ctx0, + bool worst_case) { + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + + inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); + //cb(inp_s_copy, "inp_s_copy", -1); + ggml_set_input(inp_s_copy); + return inp_s_copy; +} + +ggml_tensor * llama_context::build_inp_s_mask( + ggml_context * ctx0, + bool worst_case) { + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); + //cb(inp_s_mask, "inp_s_mask", -1); + ggml_set_input(inp_s_mask); + return inp_s_mask; +} + +ggml_tensor * llama_context::build_copy_mask_state( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * s, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + int32_t n_tokens, + int32_t n_state, + int32_t n_seqs, + bool worst_case) { + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + const auto kv_head = worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head; + + struct ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_self.size); + + // copy states + // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv + // this shrinks the tensor's ne[1] to n_kv + states = ggml_get_rows(ctx0, states, state_copy); + + // clear states of sequences which are starting at the beginning of this batch + // FIXME: zero-out NaNs? + states = ggml_mul(ctx0, states, state_mask); + + // copy states which won't be changed further (between n_seqs and n_kv) + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, states, n_state*(n_kv - n_seqs), (n_seqs )*n_state*ggml_element_size(states)), + ggml_view_1d(ctx0, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s)))); + + // the part of the states that will be used and modified + return ggml_view_2d(ctx0, states, n_state, n_seqs, states->nb[1], 0); +} + +// TODO: split +ggml_tensor * llama_context::build_mamba_layer( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto & n_tokens = ubatch.n_tokens; + + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t d_state = hparams.ssm_d_state; + const int64_t dt_rank = hparams.ssm_dt_rank; + const int64_t n_seqs = ubatch.n_seqs; + // Some variants of the Mamba arch (e.g. FalconMamba) apply layer norm on B and Dt layers + const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms; + // Use the same RMS norm as the final layer norm + const float norm_rms_eps = hparams.f_norm_rms_eps; + + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs); + GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); + + struct ggml_tensor * conv_states_all = kv_self.k_l[il]; + struct ggml_tensor * ssm_states_all = kv_self.v_l[il]; + + // (ab)using the KV cache to store the states + struct ggml_tensor * conv = build_copy_mask_state( + ctx0, graph, conv_states_all, state_copy, state_mask, + n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); + conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs); + struct ggml_tensor * ssm = build_copy_mask_state( + ctx0, graph, ssm_states_all, state_copy, state_mask, + n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); + ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs); + + // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); + + // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} + struct ggml_tensor * xz = build_lora_mm(ctx0, model.layers[il].ssm_in, cur); + // split the above in two + // => {d_inner, n_seq_tokens, n_seqs} + struct ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); + struct ggml_tensor * z = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz)); + + // conv + { + // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs} + struct ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0); + + // copy last (d_conv - 1) columns back into the state cache + struct ggml_tensor * last_conv =
ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); + + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, last_conv, + ggml_view_1d(ctx0, conv_states_all, + (d_conv - 1)*(d_inner)*(n_seqs), + kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all)))); + + // 1D convolution + // The equivalent is to make a self-overlapping view of conv_x + // over d_conv columns at each stride in the 3rd dimension, + // then element-wise multiply that with the conv1d weight, + // then sum the elements of each row, + // (the last two steps are a dot product over rows (also doable with mul_mat)) + // then permute away the ne[0] dimension, + // and then you're left with the resulting x tensor. + // For simultaneous sequences, all sequences need to have the same length. + x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d); + + // bias + x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b); + + x = ggml_silu(ctx0, x); + } + + // ssm + { + // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs} + struct ggml_tensor * x_db = build_lora_mm(ctx0, model.layers[il].ssm_x, x); + // split + struct ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0); + struct ggml_tensor * B = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank); + struct ggml_tensor * C = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state)); + + // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers + if (ssm_dt_b_c_rms) { + dt = ggml_rms_norm(ctx0, dt, norm_rms_eps); + B = ggml_rms_norm(ctx0, B, norm_rms_eps); + C = ggml_rms_norm(ctx0, C, norm_rms_eps); + } + + // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} + dt = build_lora_mm(ctx0, model.layers[il].ssm_dt, dt); + dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); + + // Custom operator to optimize the parallel associative scan + // as described in the Annex D of the Mamba paper. 
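+        // roughly, ggml_ssm_scan fuses the discretization and the recurrent scan over each
+        // sequence, i.e. per token it evaluates (approximately)
+        //   h_t = exp(dt_t * A) * h_{t-1} + dt_t * B_t * x_t
+        //   y_t = C_t * h_t
+        // using the per-token dt, B and C computed above, rather than materializing the
+        // discretized matrices and stepping through the scan with separate ops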
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} + struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C); + + // store last states + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), + ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); + + struct ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); + + // TODO: skip computing output earlier for unused tokens + + // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs} + y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); + y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z))); + + // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} + cur = build_lora_mm(ctx0, model.layers[il].ssm_out, y); + } + + // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); + //cb(cur, "mamba_out", il); + + return cur; +} + + // llama output size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { diff --git a/src/llama-context.h b/src/llama-context.h index 73baa711f..a2f41b5c8 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -14,6 +14,8 @@ #include #include +using llama_loras = std::unordered_map; + struct llama_context { llama_context(const llama_model & model) : model(model) @@ -22,12 +24,10 @@ struct llama_context { const struct llama_model & model; - struct llama_cparams cparams; - struct llama_sbatch sbatch; // TODO: revisit if needed - struct llama_kv_cache kv_self; - struct llama_adapter_cvec cvec; - - std::unordered_map lora; + llama_cparams cparams; + llama_sbatch sbatch; // TODO: revisit if needed + llama_adapter_cvec cvec; + llama_loras loras; std::vector backends; std::vector> set_n_threads_fns; @@ -72,18 +72,6 @@ struct llama_context { // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE std::map> embd_seq; - // whether we are computing encoder output or decoder output - bool is_encoding = false; - - // TODO: find a better way to accommodate mutli-dimension position encoding methods - // number of position id each token get, 1 for each token in most cases. - // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate. 
- int n_pos_per_token = 1; - - // output of the encoder part of the encoder-decoder models - std::vector embd_enc; - std::vector> seq_ids_enc; - // memory buffers used to evaluate the model std::vector buf_compute_meta; ggml_backend_sched_ptr sched; @@ -91,29 +79,145 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; + void reset(); + + void prepare_k_shift(); + void prepare_defrag(); + void prepare_decode(const llama_ubatch & ubatch); + + void set_inputs(const llama_ubatch & ubatch); + + ggml_tensor * build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur); + + ggml_tensor * build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, // struct ggml_tensor * as + ggml_tensor * cur, // struct ggml_tensor * b + ggml_tensor * ids); + // input tensors struct ggml_tensor * inp_tokens; // I32 [n_batch] struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] struct ggml_tensor * inp_pos; // I32 [n_batch] struct ggml_tensor * inp_out_ids; // I32 [n_outputs] - struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_K_shift; // I32 [kv_size] struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct ggml_tensor * inp_cls; // I32 [n_batch] + + // === encoder-decoder === + + // whether we are computing encoder output or decoder output + bool is_encoding = false; + + // output of the encoder part of the encoder-decoder models + std::vector embd_enc; + std::vector> seq_ids_enc; + + struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] + struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] + + // === unified KV cache === + + llama_kv_cache kv_self; + + struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_cnv; // [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] + struct ggml_tensor * inp_K_shift; // I32 [kv_size] + + void build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case); + + void build_attn_kv_store( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + int64_t il, + bool worst_case); + + ggml_tensor * build_attn_qkv( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case); + + ggml_tensor * build_soft_max_ext( + ggml_context * ctx0, + ggml_tensor * kq, + float kq_scale); + + ggml_tensor * get_rope_factors(int il); + + void build_k_shift( + ggml_context * ctx0, + ggml_cgraph * graph); + + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache + void build_defrag( + ggml_context * ctx0, + ggml_cgraph * graph); + + // === recurrent === + + // TODO: add recurrent cache + // TODO: add mamba-specific llama_context + + // TODO: change these to build_mamba_inp and hide `state_copy` and `state_mask` inside the llama_context impl + ggml_tensor * build_inp_s_copy( + ggml_context * ctx0, + bool worst_case); + + ggml_tensor * build_inp_s_mask( + ggml_context * ctx0, + bool worst_case); + + ggml_tensor * build_copy_mask_state( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * s, + ggml_tensor * 
state_copy, + ggml_tensor * state_mask, + int32_t n_tokens, + int32_t n_state, + int32_t n_seqs, + bool worst_case); + + ggml_tensor * build_mamba_layer( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case); + struct ggml_tensor * inp_s_copy; // I32 [kv_size] struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] - struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch] - struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] - struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] - struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] - void set_k_shift(llama_kv_cache & kv); + // === vision === + + // TODO: find a better way to accommodate mutli-dimension position encoding methods + // number of position id each token get, 1 for each token in most cases. + // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate. + int n_pos_per_token = 1; }; -// TODO: make these methods of llama_context -void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch); - // Make sure enough space is available for outputs. // Returns max number of outputs for which space was reserved. size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs); diff --git a/src/llama.cpp b/src/llama.cpp index 37816ddc2..a2e5e0bea 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4,8 +4,6 @@ #include "llama-mmap.h" #include "llama-context.h" #include "llama-vocab.h" -#include "llama-sampling.h" -#include "llama-kv-cache.h" #include "llama-model-loader.h" #include "llama-model.h" @@ -106,946 +104,15 @@ enum llm_norm_type { LLM_NORM_GROUP, }; -static struct ggml_tensor * llm_build_inp_embd( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_hparams & hparams, - const llama_ubatch & ubatch, - struct ggml_tensor * tok_embd, - const llm_build_cb & cb) { - const int64_t n_embd = hparams.n_embd; - - struct ggml_tensor * inpL; - - if (ubatch.token) { - lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ubatch.n_tokens); - cb(lctx.inp_tokens, "inp_tokens", -1); - ggml_set_input(lctx.inp_tokens); - - inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens); - - // apply lora for embedding tokens if needed - for (auto & it : lctx.lora) { - struct llama_adapter_lora_weight * lw = it.first->get_weight(tok_embd); - if (lw == nullptr) { - continue; - } - const float adapter_scale = it.second; - const float scale = lw->get_scale(it.first->alpha, adapter_scale); - struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat( - ctx, lw->b, // non-transposed lora_b - ggml_get_rows(ctx, lw->a, lctx.inp_tokens) - ), scale); - inpL = ggml_add(ctx, inpL, inpL_delta); - } - } else { - lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - inpL = lctx.inp_embd; - ggml_set_input(lctx.inp_embd); - } - - // For Granite architecture - if (hparams.f_embedding_scale != 0.0f) { - inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale); - } - - cb(inpL, "inp_embd", -1); - - return inpL; -} - -static void llm_build_kv_store( - struct ggml_context * ctx, - const llama_hparams & hparams, - const llama_cparams & cparams, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - int32_t n_tokens, - int32_t kv_head, - const llm_build_cb & cb, - int64_t il) { - const int64_t n_ctx = cparams.n_ctx; - - const int64_t 
n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - GGML_ASSERT(kv.size == n_ctx); - - struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa)*kv_head); - cb(k_cache_view, "k_cache_view", il); - - // note: storing RoPE-ed version of K in the KV cache - ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); - - assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); - - struct ggml_tensor * v_cache_view = nullptr; - - if (cparams.flash_attn) { - v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa)*kv_head); - } else { - // note: the V cache is transposed when not using flash attention - v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa, - ( n_ctx)*ggml_element_size(kv.v_l[il]), - (kv_head)*ggml_element_size(kv.v_l[il])); - - v_cur = ggml_transpose(ctx, v_cur); - } - cb(v_cache_view, "v_cache_view", il); - - ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view)); -} - -// do mat_mul, while optionally apply lora -static struct ggml_tensor * llm_build_lora_mm( - struct llama_context & lctx, - struct ggml_context * ctx0, - struct ggml_tensor * w, - struct ggml_tensor * cur) { - struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); - for (auto & it : lctx.lora) { - struct llama_adapter_lora_weight * lw = it.first->get_weight(w); - if (lw == nullptr) { - continue; - } - const float adapter_scale = it.second; - const float scale = lw->get_scale(it.first->alpha, adapter_scale); - struct ggml_tensor * ab_cur = ggml_mul_mat( - ctx0, lw->b, - ggml_mul_mat(ctx0, lw->a, cur) - ); - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - return res; -} - -// do mat_mul_id, while optionally apply lora -static struct ggml_tensor * llm_build_lora_mm_id( - struct llama_context & lctx, - struct ggml_context * ctx0, - struct ggml_tensor * w, // struct ggml_tensor * as - struct ggml_tensor * cur, // struct ggml_tensor * b - struct ggml_tensor * ids) { - struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); - for (auto & it : lctx.lora) { - struct llama_adapter_lora_weight * lw = it.first->get_weight(w); - if (lw == nullptr) { - continue; - } - const float alpha = it.first->alpha; - const float rank = (float) lw->b->ne[0]; - const float scale = alpha ? 
it.second * alpha / rank : it.second; - struct ggml_tensor * ab_cur = ggml_mul_mat_id( - ctx0, lw->b, - ggml_mul_mat_id(ctx0, lw->a, cur, ids), - ids - ); - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - return res; -} - -static struct ggml_tensor * llm_build_norm( - struct ggml_context * ctx, - struct ggml_tensor * cur, - const llama_hparams & hparams, - struct ggml_tensor * mw, - struct ggml_tensor * mb, - llm_norm_type type, - const llm_build_cb & cb, - int il) { - switch (type) { - case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break; - case LLM_NORM_RMS: cur = ggml_rms_norm (ctx, cur, hparams.f_norm_rms_eps); break; - case LLM_NORM_GROUP: - { - cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]); - cur = ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); - cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]); - } break; - } - - if (mw || mb) { - cb(cur, "norm", il); - } - - if (mw) { - cur = ggml_mul(ctx, cur, mw); - if (mb) { - cb(cur, "norm_w", il); - } - } - - if (mb) { - cur = ggml_add(ctx, cur, mb); - } - - return cur; -} - -static struct ggml_tensor * llm_build_ffn( - struct ggml_context * ctx, - struct llama_context & lctx, - struct ggml_tensor * cur, - struct ggml_tensor * up, - struct ggml_tensor * up_b, - struct ggml_tensor * up_s, - struct ggml_tensor * gate, - struct ggml_tensor * gate_b, - struct ggml_tensor * gate_s, - struct ggml_tensor * down, - struct ggml_tensor * down_b, - struct ggml_tensor * down_s, - struct ggml_tensor * act_scales, - llm_ffn_op_type type_op, - llm_ffn_gate_type type_gate, - const llm_build_cb & cb, - int il) { - struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur; - cb(tmp, "ffn_up", il); - - if (up_b) { - tmp = ggml_add(ctx, tmp, up_b); - cb(tmp, "ffn_up_b", il); - } - - if (up_s) { - tmp = ggml_mul(ctx, tmp, up_s); - cb(tmp, "ffn_up_s", il); - } - - if (gate) { - switch (type_gate) { - case LLM_FFN_SEQ: - { - cur = llm_build_lora_mm(lctx, ctx, gate, tmp); - cb(cur, "ffn_gate", il); - } break; - case LLM_FFN_PAR: - { - cur = llm_build_lora_mm(lctx, ctx, gate, cur); - cb(cur, "ffn_gate", il); - } break; - } - - if (gate_b) { - cur = ggml_add(ctx, cur, gate_b); - cb(cur, "ffn_gate_b", il); - } - - if (gate_s) { - cur = ggml_mul(ctx, cur, gate_s); - cb(cur, "ffn_gate_s", il); - } - - } else { - cur = tmp; - } - - switch (type_op) { - case LLM_FFN_SILU: - { - cur = ggml_silu(ctx, cur); - cb(cur, "ffn_silu", il); - } break; - case LLM_FFN_GELU: - { - cur = ggml_gelu(ctx, cur); - cb(cur, "ffn_gelu", il); - if (act_scales != NULL) { - cur = ggml_div(ctx, cur, act_scales); - cb(cur, "ffn_act", il); - } - } break; - case LLM_FFN_RELU: - { - cur = ggml_relu(ctx, cur); - cb(cur, "ffn_relu", il); - } break; - case LLM_FFN_RELU_SQR: - { - cur = ggml_relu(ctx, cur); - cb(cur, "ffn_relu", il); - - cur = ggml_sqr(ctx, cur); - cb(cur, "ffn_sqr(relu)", il); - } break; - case LLM_FFN_SWIGLU: - { - // Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf - int64_t split_point = cur->ne[0] / 2; - struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0)); - struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); - - x0 = ggml_silu(ctx, x0); - cb(cur, "ffn_silu", il); - - cur = ggml_mul(ctx, x0, x1); - cb(cur, "ffn_mul", il); - } break; - } - - if (type_gate == LLM_FFN_PAR) { - cur = ggml_mul(ctx, cur, tmp); - cb(cur, "ffn_gate_par", il); - } - - if (down) { - cur = llm_build_lora_mm(lctx, ctx, down, cur); - } - - if (down_b) { - cb(cur, "ffn_down", il); - } - - if (down_b) { - cur = ggml_add(ctx, cur, down_b); - } - - if (down_s) { - cur = ggml_mul(ctx, cur, down_s); - cb(cur, "ffn_down_s", il); - } - - return cur; -} - -static struct ggml_tensor * llm_build_moe_ffn( - struct ggml_context * ctx, - struct llama_context & lctx, - struct ggml_tensor * cur, - struct ggml_tensor * gate_inp, - struct ggml_tensor * up_exps, - struct ggml_tensor * gate_exps, - struct ggml_tensor * down_exps, - struct ggml_tensor * exp_probs_b, - int64_t n_expert, - int64_t n_expert_used, - llm_ffn_op_type type_op, - bool norm_w, - bool scale_w, - float w_scale, -llama_expert_gating_func_type gating_op, - const llm_build_cb & cb, - int il) { - int64_t n_embd = cur->ne[0]; - int64_t n_tokens = cur->ne[1]; - - ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens] - cb(logits, "ffn_moe_logits", il); - - ggml_tensor * probs = nullptr; - switch (gating_op) { - case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: - { - probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens] - } break; - case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: - { - probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens] - } break; - default: - GGML_ABORT("fatal error"); - } - cb(probs, "ffn_moe_probs", il); - - // add experts selection bias - introduced in DeepSeek V3 - // leave probs unbiased as it's later used to get expert weights - ggml_tensor * selection_probs = probs; - if (exp_probs_b != nullptr) { - selection_probs = ggml_add(ctx, probs, exp_probs_b); - cb(selection_probs, "ffn_moe_probs_biased", il); - } - - // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens] - cb(selected_experts->src[0], "ffn_moe_argsort", il); - cb(selected_experts, "ffn_moe_topk", il); - - ggml_tensor * weights = ggml_get_rows(ctx, - ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights", il); - - if (norm_w) { - weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens); - - ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens] - cb(weights_sum, "ffn_moe_weights_sum", il); - - weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights_norm", il); - - weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens); - } - if (scale_w) { - weights = ggml_scale(ctx, weights, w_scale); - cb(weights, "ffn_moe_weights_scaled", il); - } - - cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); - ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(up, "ffn_moe_up", il); - - ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, 
n_expert_used, n_tokens] - cb(gate, "ffn_moe_gate", il); - - switch (type_op) { - case LLM_FFN_SILU: - { - gate = ggml_silu(ctx, gate); - cb(gate, "ffn_moe_silu", il); - } break; - case LLM_FFN_GELU: - { - gate = ggml_gelu(ctx, gate); - cb(gate, "ffn_moe_gelu", il); - } break; - default: - GGML_ABORT("fatal error"); - } - - ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens] - cb(par, "ffn_moe_gate_par", il); - - ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] - cb(experts, "ffn_moe_down", il); - - experts = ggml_mul(ctx, experts, weights); - - // aggregate experts - ggml_tensor * moe_out = nullptr; - for (int i = 0; i < n_expert_used; ++i) { - ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens, - experts->nb[2], i*experts->nb[1]); - - if (i == 0) { - moe_out = cur_expert; - } else { - moe_out = ggml_add(ctx, moe_out, cur_expert); - } - } - - if (n_expert_used == 1) { - // avoid returning a non-contiguous tensor - moe_out = ggml_cont(ctx, moe_out); - } - - return moe_out; -} - -static struct ggml_tensor * llm_build_kqv( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * q_cur, - struct ggml_tensor * kq_mask, - int32_t n_tokens, - int32_t n_kv, - float kq_scale, - const llm_build_cb & cb, - int il) { - const llama_model & model = lctx.model; - const llama_hparams & hparams = lctx.model.hparams; - const llama_cparams & cparams = lctx.cparams; - - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head(il); - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_head_k = hparams.n_embd_head_k; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_head_v = hparams.n_embd_head_v; - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); - cb(q, "q", il); - - struct ggml_tensor * k = - ggml_view_3d(ctx, kv.k_l[il], - n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv.k_l[il]->type, n_embd_head_k), - 0); - cb(k, "k", il); - - struct ggml_tensor * cur; - - if (cparams.flash_attn) { - GGML_UNUSED(model); - GGML_UNUSED(n_ctx); - - // split cached v into n_head heads (not transposed) - struct ggml_tensor * v = - ggml_view_3d(ctx, kv.v_l[il], - n_embd_head_v, n_kv, n_head_kv, - ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv.v_l[il]->type, n_embd_head_v), - 0); - cb(v, "v", il); - - cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, - hparams.attn_soft_cap ? 
hparams.f_attn_logit_softcapping : 0.0f); - - ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); - - cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); - } else { - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); - cb(kq, "kq", il); - - // note: this op tends to require high floating point range - // while for some models F16 is enough, for others it is not, so we default to F32 here - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); - - if (model.arch == LLM_ARCH_GROK) { - // need to do the following: - // multiply by attn_output_multiplyer of 0.08838834764831845 - // and then : - // kq = 30 * tanh(kq / 30) - // before the softmax below - - kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f)); - kq = ggml_scale(ctx, kq, 30); - } - - if (hparams.attn_soft_cap) { - kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping); - kq = ggml_tanh(ctx, kq); - kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping); - } - - kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); - - GGML_ASSERT(kv.size == n_ctx); - - // split cached v into n_head heads - struct ggml_tensor * v = - ggml_view_3d(ctx, kv.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv.v_l[il])*n_ctx, - ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v, - 0); - cb(v, "v", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); - cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); - - cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens); - cb(cur, "kqv_merged_cont", il); - } - - ggml_build_forward_expand(graph, cur); - - if (wo) { - cur = llm_build_lora_mm(lctx, ctx, wo, cur); - } - - if (wo_b) { - cb(cur, "kqv_wo", il); - } - - if (wo_b) { - cur = ggml_add(ctx, cur, wo_b); - } - - return cur; -} - -static struct ggml_tensor * llm_build_kv( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - struct ggml_tensor * q_cur, - struct ggml_tensor * kq_mask, - int32_t n_tokens, - int32_t kv_head, - int32_t n_kv, - float kq_scale, - const llm_build_cb & cb, - int il) { - const llama_hparams & hparams = lctx.model.hparams; - const llama_cparams & cparams = lctx.cparams; - - // these nodes are added to the graph together so that they are not reordered - // by doing so, the number of splits in the graph is reduced - ggml_build_forward_expand(graph, q_cur); - ggml_build_forward_expand(graph, k_cur); - ggml_build_forward_expand(graph, v_cur); - - llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il); - - struct ggml_tensor * cur; - - cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); - cb(cur, "kqv_out", il); - - return cur; -} - -static struct ggml_tensor * llm_build_copy_mask_state( - struct ggml_context * ctx, - struct ggml_cgraph * graph, - struct ggml_tensor * s, - struct ggml_tensor * state_copy, - struct ggml_tensor * state_mask, - int32_t n_state, - int32_t kv_size, - int32_t kv_head, - int32_t n_kv, - int32_t n_seqs) { - struct ggml_tensor * states = ggml_reshape_2d(ctx, s, n_state, kv_size); - - // copy states - // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv - // this shrinks the tensors's ne[1] to n_kv - states = 
ggml_get_rows(ctx, states, state_copy); - - // clear states of sequences which are starting at the beginning of this batch - // FIXME: zero-out NANs? - states = ggml_mul(ctx, states, state_mask); - - // copy states which won't be changed further (between n_seqs and n_kv) - ggml_build_forward_expand(graph, - ggml_cpy(ctx, - ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)), - ggml_view_1d(ctx, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s)))); - - // the part of the states that will be used and modified - return ggml_view_2d(ctx, states, n_state, n_seqs, states->nb[1], 0); -} - -// TODO: split -static struct ggml_tensor * llm_build_mamba( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_ubatch & ubatch, - struct ggml_cgraph * graph, - struct ggml_tensor * cur, - struct ggml_tensor * state_copy, - struct ggml_tensor * state_mask, - int32_t kv_head, - int32_t n_kv, - const llm_build_cb & cb, - int il) { - const llama_model & model = lctx.model; - const llama_hparams & hparams = model.hparams; - const llama_kv_cache & kv = lctx.kv_self; - const int64_t d_conv = hparams.ssm_d_conv; - const int64_t d_inner = hparams.ssm_d_inner; - const int64_t d_state = hparams.ssm_d_state; - const int64_t dt_rank = hparams.ssm_dt_rank; - const int64_t n_seqs = ubatch.n_seqs; - // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers) - const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms; - // Use the same RMS norm as the final layer norm - const float norm_rms_eps = hparams.f_norm_rms_eps; - - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs); - GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); - - struct ggml_tensor * conv_states_all = kv.k_l[il]; - struct ggml_tensor * ssm_states_all = kv.v_l[il]; - - // (ab)using the KV cache to store the states - struct ggml_tensor * conv = llm_build_copy_mask_state(ctx, - graph, conv_states_all, state_copy, state_mask, - hparams.n_embd_k_s(), kv.size, kv_head, n_kv, n_seqs); - conv = ggml_reshape_3d(ctx, conv, d_conv - 1, d_inner, n_seqs); - struct ggml_tensor * ssm = llm_build_copy_mask_state(ctx, - graph, ssm_states_all, state_copy, state_mask, - hparams.n_embd_v_s(), kv.size, kv_head, n_kv, n_seqs); - ssm = ggml_reshape_3d(ctx, ssm, d_state, d_inner, n_seqs); - - // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} - cur = ggml_reshape_3d(ctx, cur, cur->ne[0], n_seq_tokens, n_seqs); - - // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} - struct ggml_tensor * xz = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_in, cur); - // split the above in two - // => {d_inner, n_seq_tokens, n_seqs} - struct ggml_tensor * x = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); - struct ggml_tensor * z = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz)); - - // conv - { - // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs} - struct ggml_tensor * conv_x = ggml_concat(ctx, conv, ggml_transpose(ctx, x), 0); - - // copy last (d_conv - 1) columns back into the state cache - struct ggml_tensor * last_conv = ggml_view_3d(ctx, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); - - ggml_build_forward_expand(graph, - ggml_cpy(ctx, last_conv, - ggml_view_1d(ctx, conv_states_all, - (d_conv - 1)*(d_inner)*(n_seqs), - 
kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all)))); - - // 1D convolution - // The equivalent is to make a self-overlapping view of conv_x - // over d_conv columns at each stride in the 3rd dimension, - // then element-wise multiply that with the conv1d weight, - // then sum the elements of each row, - // (the last two steps are a dot product over rows (also doable with mul_mat)) - // then permute away the ne[0] dimension, - // and then you're left with the resulting x tensor. - // For simultaneous sequences, all sequences need to have the same length. - x = ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d); - - // bias - x = ggml_add(ctx, x, model.layers[il].ssm_conv1d_b); - - x = ggml_silu(ctx, x); - } - - // ssm - { - // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs} - struct ggml_tensor * x_db = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_x, x); - // split - struct ggml_tensor * dt = ggml_view_3d(ctx, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0); - struct ggml_tensor * B = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank); - struct ggml_tensor * C = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state)); - - // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers - if (ssm_dt_b_c_rms) { - dt = ggml_rms_norm(ctx, dt, norm_rms_eps); - B = ggml_rms_norm(ctx, B, norm_rms_eps); - C = ggml_rms_norm(ctx, C, norm_rms_eps); - } - - // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} - dt = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_dt, dt); - dt = ggml_add(ctx, dt, model.layers[il].ssm_dt_b); - - // Custom operator to optimize the parallel associative scan - // as described in the Annex D of the Mamba paper. 
- // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} - struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx, ssm, x, dt, model.layers[il].ssm_a, B, C); - - // store last states - ggml_build_forward_expand(graph, - ggml_cpy(ctx, - ggml_view_1d(ctx, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), - ggml_view_1d(ctx, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); - - struct ggml_tensor * y = ggml_view_3d(ctx, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); - - // TODO: skip computing output earlier for unused tokens - - // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs} - y = ggml_add(ctx, y, ggml_mul(ctx, x, model.layers[il].ssm_d)); - y = ggml_mul(ctx, y, ggml_silu(ctx, ggml_cont(ctx, z))); - - // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} - cur = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_out, y); - } - - // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} - cur = ggml_reshape_2d(ctx, cur, cur->ne[0], n_seq_tokens * n_seqs); - cb(cur, "mamba_out", il); - - return cur; -} - -static struct ggml_tensor * llm_build_rwkv6_time_mix( - struct llama_context & lctx, - struct ggml_context * ctx, - const struct llama_layer * layer, - struct ggml_tensor * cur, - struct ggml_tensor * x_prev, - struct ggml_tensor ** wkv_state, - size_t wkv_head_size, - size_t head_count_kv) { - size_t n_embd = cur->ne[0]; - size_t n_seq_tokens = cur->ne[1]; - size_t n_seqs = cur->ne[2]; - - size_t head_size = wkv_head_size; - size_t head_count = n_embd / head_size; - - size_t n_tokens = n_seqs * n_seq_tokens; - - bool is_qrwkv = layer->time_mix_first == nullptr; - - struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); - - sx = ggml_reshape_2d(ctx, sx, n_embd, n_tokens); - cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); - - struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur); - - xxx = ggml_reshape_4d( - ctx, - ggml_tanh( - ctx, - ggml_mul_mat(ctx, layer->time_mix_w1, xxx) - ), - layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens - ); - - xxx = ggml_cont(ctx, ggml_permute(ctx, xxx, 0, 1, 3, 2)); - - xxx = ggml_mul_mat( - ctx, - ggml_reshape_4d( - ctx, - layer->time_mix_w2, - layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 - ), - xxx - ); - - struct ggml_tensor *xw, *xk, *xv, *xr, *xg; - if (layer->time_mix_lerp_fused) { - // fusing these weights makes some performance improvement - sx = ggml_reshape_3d(ctx, sx, n_embd, 1, n_tokens); - cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); - xxx = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xxx, layer->time_mix_lerp_fused), sx), cur); - xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); - xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - } else { - // for backward compatibility - xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); - xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * 
sizeof(float)); - xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - - xw = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xw, layer->time_mix_lerp_w), sx), cur); - xk = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xk, layer->time_mix_lerp_k), sx), cur); - xv = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xv, layer->time_mix_lerp_v), sx), cur); - xr = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xr, layer->time_mix_lerp_r), sx), cur); - xg = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xg, layer->time_mix_lerp_g), sx), cur); - } - - struct ggml_tensor * r = llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr); - struct ggml_tensor * k = llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk); - struct ggml_tensor * v = llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv); - if (layer->time_mix_receptance_b) { - r = ggml_add(ctx, r, layer->time_mix_receptance_b); - } - if (layer->time_mix_key_b) { - k = ggml_add(ctx, k, layer->time_mix_key_b); - } - if (layer->time_mix_value_b) { - v = ggml_add(ctx, v, layer->time_mix_value_b); - } - - struct ggml_tensor * g = llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg); - if (is_qrwkv) { - g = ggml_sigmoid(ctx, g); - } else { - g = ggml_silu(ctx, g); - } - - if (head_count_kv != head_count) { - GGML_ASSERT(head_count % head_count_kv == 0); - k = ggml_reshape_4d(ctx, k, head_size, 1, head_count_kv, n_tokens); - v = ggml_reshape_4d(ctx, v, head_size, 1, head_count_kv, n_tokens); - struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens); - k = ggml_repeat(ctx, k, tmp); - v = ggml_repeat(ctx, v, tmp); - } - - k = ggml_reshape_3d(ctx, k, head_size, head_count, n_tokens); - v = ggml_reshape_3d(ctx, v, head_size, head_count, n_tokens); - r = ggml_reshape_3d(ctx, r, head_size, head_count, n_tokens); - - struct ggml_tensor * w = ggml_mul_mat( - ctx, - layer->time_mix_decay_w2, - ggml_tanh( - ctx, - ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw) - ) - ); - - w = ggml_add(ctx, w, layer->time_mix_decay); - w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w))); - w = ggml_reshape_3d(ctx, w, head_size, head_count, n_tokens); - - if (is_qrwkv) { - // k = k * (1 - w) - k = ggml_sub(ctx, k, ggml_mul(ctx, k, w)); - } - - struct ggml_tensor * wkv_output; - if (!layer->time_mix_first) { - wkv_output = ggml_gated_linear_attn(ctx, k, v, r, w, *wkv_state, pow(head_size, -0.5f)); - } else { - wkv_output = ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); - } - cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0); - *wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); - - if (!is_qrwkv) { - // group norm with head_count groups - cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens); - cur = ggml_norm(ctx, cur, 64e-5f); - - // Convert back to regular vectors. 
- cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); - cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b); - } else { - cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); - } - - cur = ggml_mul(ctx, cur, g); - cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur); - - return ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs); -} - -static struct ggml_tensor * llm_build_rwkv6_channel_mix( - struct llama_context & lctx, - struct ggml_context * ctx, - const struct llama_layer * layer, - struct ggml_tensor * cur, - struct ggml_tensor * x_prev) { - struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); - struct ggml_tensor * xk = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur); - struct ggml_tensor * xr = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur); - - struct ggml_tensor * r = ggml_sigmoid(ctx, llm_build_lora_mm(lctx, ctx, layer->channel_mix_receptance, xr)); - struct ggml_tensor * k = ggml_sqr( - ctx, - ggml_relu( - ctx, - llm_build_lora_mm(lctx, ctx, layer->channel_mix_key, xk) - ) - ); - - return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k)); -} - struct llm_build_context { - const llama_model & model; - llama_context & lctx; - const llama_hparams & hparams; - const llama_cparams & cparams; - const llama_ubatch & ubatch; - const llama_kv_cache & kv_self; + llama_context & lctx; + const llama_model & model; + const llama_hparams & hparams; + const llama_cparams & cparams; + const llama_ubatch & ubatch; + //const llama_kv_cache & kv_self; + const llama_adapter_cvec & cvec; + const llama_loras & loras; const int64_t n_embd; const int64_t n_layer; @@ -1070,12 +137,13 @@ struct llm_build_context { const float norm_rms_eps; const int32_t n_tokens; - const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size) + //const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size) + //const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_outputs; const int32_t n_outputs_enc; - const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_ctx_orig; + const bool worst_case; const bool flash_attn; const enum llama_pooling_type pooling_type; @@ -1089,16 +157,18 @@ struct llm_build_context { // TODO: consider making the entire interface noexcept llm_build_context( - llama_context & lctx, - const llama_ubatch & ubatch, - const llm_build_cb & cb, - bool worst_case) : - model (lctx.model), + llama_context & lctx, + const llama_ubatch & ubatch, + const llm_build_cb & cb, + bool worst_case) : lctx (lctx), + model (lctx.model), hparams (model.hparams), cparams (lctx.cparams), ubatch (ubatch), - kv_self (lctx.kv_self), + //kv_self (lctx.kv_self), + cvec (lctx.cvec), + loras (lctx.loras), n_embd (hparams.n_embd), n_layer (hparams.n_layer), n_rot (hparams.n_rot), @@ -1120,11 +190,12 @@ struct llm_build_context { norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (ubatch.n_tokens), - n_kv (worst_case ? kv_self.size : kv_self.n), + //n_kv (worst_case ? kv_self.size : kv_self.n), + //kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head), n_outputs (worst_case ? n_tokens : lctx.n_outputs), n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd), - kv_head (worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head), n_ctx_orig (cparams.n_ctx_orig_yarn), + worst_case (worst_case), flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), @@ -1142,21 +213,7 @@ struct llm_build_context { ctx0 = ggml_init(params); - lctx.inp_tokens = nullptr; - lctx.inp_embd = nullptr; - lctx.inp_pos = nullptr; - lctx.inp_out_ids = nullptr; - lctx.inp_KQ_mask = nullptr; - lctx.inp_KQ_mask_swa = nullptr; - lctx.inp_K_shift = nullptr; - lctx.inp_mean = nullptr; - lctx.inp_cls = nullptr; - lctx.inp_s_copy = nullptr; - lctx.inp_s_mask = nullptr; - lctx.inp_s_seq = nullptr; - lctx.inp_pos_bucket = nullptr; - lctx.inp_embd_enc = nullptr; - lctx.inp_KQ_mask_cross = nullptr; + lctx.reset(); } void free() { @@ -1164,125 +221,597 @@ struct llm_build_context { ctx0 = nullptr; } + struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { + struct ggml_tensor * inpL; + + if (ubatch.token) { + lctx.inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + cb(lctx.inp_tokens, "inp_tokens", -1); + ggml_set_input(lctx.inp_tokens); + + inpL = ggml_get_rows(ctx0, tok_embd, lctx.inp_tokens); + + // apply lora for embedding tokens if needed + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( + ctx0, lw->b, // non-transposed lora_b + ggml_get_rows(ctx0, lw->a, lctx.inp_tokens) + ), scale); + + inpL = ggml_add(ctx0, inpL, inpL_delta); + } + } else { + lctx.inp_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); + inpL = lctx.inp_embd; + ggml_set_input(lctx.inp_embd); + } + + // For Granite architecture + if (hparams.f_embedding_scale != 0.0f) { + inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); + } + + cb(inpL, "inp_embd", -1); + + return inpL; + } + + // do mat_mul, while optionally apply lora + struct ggml_tensor * build_lora_mm( + struct ggml_tensor * w, + struct ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * ab_cur = ggml_mul_mat( + ctx0, lw->b, + ggml_mul_mat(ctx0, lw->a, cur) + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; + } + + // do mat_mul_id, while optionally apply lora + struct ggml_tensor * build_lora_mm_id( + struct ggml_tensor * w, // struct ggml_tensor * as + struct ggml_tensor * cur, // struct ggml_tensor * b + struct ggml_tensor * ids) { + struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float alpha = lora.first->alpha; + const float rank = (float) lw->b->ne[0]; + const float scale = alpha ? 
lora.second * alpha / rank : lora.second; + + struct ggml_tensor * ab_cur = ggml_mul_mat_id( + ctx0, lw->b, + ggml_mul_mat_id(ctx0, lw->a, cur, ids), + ids + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; + } + + struct ggml_tensor * build_norm( + struct ggml_tensor * cur, + struct ggml_tensor * mw, + struct ggml_tensor * mb, + llm_norm_type type, + int il) { + switch (type) { + case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm (ctx0, cur, hparams.f_norm_rms_eps); break; + case LLM_NORM_GROUP: + { + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]); + cur = ggml_group_norm(ctx0, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[2]); + } break; + } + + if (mw || mb) { + cb(cur, "norm", il); + } + + if (mw) { + cur = ggml_mul(ctx0, cur, mw); + if (mb) { + cb(cur, "norm_w", il); + } + } + + if (mb) { + cur = ggml_add(ctx0, cur, mb); + } + + return cur; + } + + struct ggml_tensor * build_ffn( + struct ggml_tensor * cur, + struct ggml_tensor * up, + struct ggml_tensor * up_b, + struct ggml_tensor * up_s, + struct ggml_tensor * gate, + struct ggml_tensor * gate_b, + struct ggml_tensor * gate_s, + struct ggml_tensor * down, + struct ggml_tensor * down_b, + struct ggml_tensor * down_s, + struct ggml_tensor * act_scales, + llm_ffn_op_type type_op, + llm_ffn_gate_type type_gate, + const llm_build_cb & cb, + int il) { + struct ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur; + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = ggml_add(ctx0, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (up_s) { + tmp = ggml_mul(ctx0, tmp, up_s); + cb(tmp, "ffn_up_s", il); + } + + if (gate) { + switch (type_gate) { + case LLM_FFN_SEQ: + { + cur = build_lora_mm(gate, tmp); + cb(cur, "ffn_gate", il); + } break; + case LLM_FFN_PAR: + { + cur = build_lora_mm(gate, cur); + cb(cur, "ffn_gate", il); + } break; + } + + if (gate_b) { + cur = ggml_add(ctx0, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + + if (gate_s) { + cur = ggml_mul(ctx0, cur, gate_s); + cb(cur, "ffn_gate_s", il); + } + + } else { + cur = tmp; + } + + switch (type_op) { + case LLM_FFN_SILU: + { + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_silu", il); + } break; + case LLM_FFN_GELU: + { + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_gelu", il); + if (act_scales != NULL) { + cur = ggml_div(ctx0, cur, act_scales); + cb(cur, "ffn_act", il); + } + } break; + case LLM_FFN_RELU: + { + cur = ggml_relu(ctx0, cur); + cb(cur, "ffn_relu", il); + } break; + case LLM_FFN_RELU_SQR: + { + cur = ggml_relu(ctx0, cur); + cb(cur, "ffn_relu", il); + + cur = ggml_sqr(ctx0, cur); + cb(cur, "ffn_sqr(relu)", il); + } break; + case LLM_FFN_SWIGLU: + { + // Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + int64_t split_point = cur->ne[0] / 2; + struct ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); + struct ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); + + x0 = ggml_silu(ctx0, x0); + cb(cur, "ffn_silu", il); + + cur = ggml_mul(ctx0, x0, x1); + cb(cur, "ffn_mul", il); + } break; + } + + if (type_gate == LLM_FFN_PAR) { + cur = ggml_mul(ctx0, cur, tmp); + cb(cur, "ffn_gate_par", il); + } + + if (down) { + cur = build_lora_mm(down, cur); + } + + if (down_b) { + cb(cur, "ffn_down", il); + } + + if (down_b) { + cur = ggml_add(ctx0, cur, down_b); + } + + if (down_s) { + cur = ggml_mul(ctx0, cur, down_s); + cb(cur, "ffn_down_s", il); + } + + return cur; + } + + struct ggml_tensor * build_moe_ffn( + struct ggml_tensor * cur, + struct ggml_tensor * gate_inp, + struct ggml_tensor * up_exps, + struct ggml_tensor * gate_exps, + struct ggml_tensor * down_exps, + struct ggml_tensor * exp_probs_b, + int64_t n_expert, + int64_t n_expert_used, + llm_ffn_op_type type_op, + bool norm_w, + bool scale_w, + float w_scale, + llama_expert_gating_func_type gating_op, + const llm_build_cb & cb, + int il) { + int64_t n_embd = cur->ne[0]; + int64_t n_tokens = cur->ne[1]; + + ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens] + cb(logits, "ffn_moe_logits", il); + + ggml_tensor * probs = nullptr; + switch (gating_op) { + case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: + { + probs = ggml_soft_max(ctx0, logits); // [n_expert, n_tokens] + } break; + case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: + { + probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens] + } break; + default: + GGML_ABORT("fatal error"); + } + cb(probs, "ffn_moe_probs", il); + + // add experts selection bias - introduced in DeepSeek V3 + // leave probs unbiased as it's later used to get expert weights + ggml_tensor * selection_probs = probs; + if (exp_probs_b != nullptr) { + selection_probs = ggml_add(ctx0, probs, exp_probs_b); + cb(selection_probs, "ffn_moe_probs_biased", il); + } + + // select experts + ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens] + cb(selected_experts->src[0], "ffn_moe_argsort", il); + cb(selected_experts, "ffn_moe_topk", il); + + ggml_tensor * weights = ggml_get_rows(ctx0, + ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] + cb(weights, "ffn_moe_weights", il); + + if (norm_w) { + weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); + + ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens] + cb(weights_sum, "ffn_moe_weights_sum", il); + + weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens] + cb(weights, "ffn_moe_weights_norm", il); + + weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens); + } + if (scale_w) { + weights = ggml_scale(ctx0, weights, w_scale); + cb(weights, "ffn_moe_weights_scaled", il); + } + + cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(up, "ffn_moe_up", il); + + ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(gate, "ffn_moe_gate", il); + + switch (type_op) { + case LLM_FFN_SILU: + { + gate = 
ggml_silu(ctx0, gate); + cb(gate, "ffn_moe_silu", il); + } break; + case LLM_FFN_GELU: + { + gate = ggml_gelu(ctx0, gate); + cb(gate, "ffn_moe_gelu", il); + } break; + default: + GGML_ABORT("fatal error"); + } + + ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens] + cb(par, "ffn_moe_gate_par", il); + + ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] + cb(experts, "ffn_moe_down", il); + + experts = ggml_mul(ctx0, experts, weights); + + // aggregate experts + ggml_tensor * moe_out = nullptr; + for (int i = 0; i < n_expert_used; ++i) { + ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens, + experts->nb[2], i*experts->nb[1]); + + if (i == 0) { + moe_out = cur_expert; + } else { + moe_out = ggml_add(ctx0, moe_out, cur_expert); + } + } + + if (n_expert_used == 1) { + // avoid returning a non-contiguous tensor + moe_out = ggml_cont(ctx0, moe_out); + } + + return moe_out; + } + + struct ggml_tensor * build_attn( + struct ggml_cgraph * graph, + struct ggml_tensor * wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + struct ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + const llm_build_cb & cb, + int il) { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(graph, q_cur); + ggml_build_forward_expand(graph, k_cur); + ggml_build_forward_expand(graph, v_cur); + + //build_kv_store(graph, k_cur, v_cur, il); + lctx.build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); + + struct ggml_tensor * cur; + + //cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il); + cur = lctx.build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); + cb(cur, "kqv_out", il); + + return cur; + } + + //struct ggml_tensor * build_rwkv6_time_mix( + // const struct llama_layer * layer, + // struct ggml_tensor * cur, + // struct ggml_tensor * x_prev, + // struct ggml_tensor ** wkv_state, + // size_t wkv_head_size, + // size_t head_count_kv) { + // size_t n_embd = cur->ne[0]; + // size_t n_seq_tokens = cur->ne[1]; + // size_t n_seqs = cur->ne[2]; + + // size_t head_size = wkv_head_size; + // size_t head_count = n_embd / head_size; + + // size_t n_tokens = n_seqs * n_seq_tokens; + + // bool is_qrwkv = layer->time_mix_first == nullptr; + + // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + + // sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens); + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + + // struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); + + // xxx = ggml_reshape_4d( + // ctx0, + // ggml_tanh( + // ctx0, + // ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) + // ), + // layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens + // ); + + // xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); + + // xxx = ggml_mul_mat( + // ctx0, + // ggml_reshape_4d( + // ctx0, + // layer->time_mix_w2, + // layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 + // ), + // xxx + // ); + + // struct ggml_tensor *xw, *xk, *xv, *xr, *xg; + // if (layer->time_mix_lerp_fused) { + // // fusing these weights makes some performance improvement + // sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); + // cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + // xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); 
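    //          // illustrative sketch (wording ours, not from the code): the fused add above and the
    //          // per-channel path in the else branch below compute the same token-shift lerp for the
    //          // five mixed inputs c in {w, k, v, r, g}:
    //          //
    //          //     x_c = cur + (x_prev - cur) * (time_mix_lerp_c + xxx_c)
    //          //
    //          // where xxx_c is the data-dependent offset produced by time_mix_w1/time_mix_w2 above;
    //          // the fused branch simply evaluates all five lerps with one broadcast add/mul.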
+ // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + // } else { + // // for backward compatibility + // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + + // xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); + // xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); + // xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); + // xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); + // xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); + // } + + // struct ggml_tensor * r = build_lora_mm(layer->time_mix_receptance, xr); + // struct ggml_tensor * k = build_lora_mm(layer->time_mix_key, xk); + // struct ggml_tensor * v = build_lora_mm(layer->time_mix_value, xv); + // if (layer->time_mix_receptance_b) { + // r = ggml_add(ctx0, r, layer->time_mix_receptance_b); + // } + // if (layer->time_mix_key_b) { + // k = ggml_add(ctx0, k, layer->time_mix_key_b); + // } + // if (layer->time_mix_value_b) { + // v = ggml_add(ctx0, v, layer->time_mix_value_b); + // } + + // struct ggml_tensor * g = build_lora_mm(layer->time_mix_gate, xg); + // if (is_qrwkv) { + // g = ggml_sigmoid(ctx0, g); + // } else { + // g = ggml_silu(ctx0, g); + // } + + // if (head_count_kv != head_count) { + // GGML_ASSERT(head_count % head_count_kv == 0); + // k = ggml_reshape_4d(ctx0, k, head_size, 1, head_count_kv, n_tokens); + // v = ggml_reshape_4d(ctx0, v, head_size, 1, head_count_kv, n_tokens); + // struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens); + // k = ggml_repeat(ctx0, k, tmp); + // v = ggml_repeat(ctx0, v, tmp); + // } + + // k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens); + // v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens); + // r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens); + + // struct ggml_tensor * w = ggml_mul_mat( + // ctx0, + // layer->time_mix_decay_w2, + // ggml_tanh( + // ctx0, + // ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) + // ) + // ); + + // w = ggml_add(ctx0, w, layer->time_mix_decay); + // w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); + // w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens); + + // if (is_qrwkv) { + // // k = k * (1 - w) + // k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w)); + // } + + // struct ggml_tensor * wkv_output; + // if (!layer->time_mix_first) { + // wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, *wkv_state, pow(head_size, -0.5f)); + // } else { + // wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, 
layer->time_mix_first, w, *wkv_state); + // } + // cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); + // *wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); + + // if (!is_qrwkv) { + // // group norm with head_count groups + // cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens); + // cur = ggml_norm(ctx0, cur, 64e-5f); + + // // Convert back to regular vectors. + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); + // } else { + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // } + + // cur = ggml_mul(ctx0, cur, g); + // cur = build_lora_mm(layer->time_mix_output, cur); + + // return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs); + //} + + //struct ggml_tensor * build_rwkv6_channel_mix( + // const struct llama_layer * layer, + // struct ggml_tensor * cur, + // struct ggml_tensor * x_prev) { + // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + // struct ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); + // struct ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); + + // struct ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); + // struct ggml_tensor * k = ggml_sqr( + // ctx0, + // ggml_relu( + // ctx0, + // build_lora_mm(layer->channel_mix_key, xk) + // ) + // ); + + // return ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); + //} + struct ggml_cgraph * build_k_shift() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - GGML_ASSERT(kv_self.size == n_ctx); - - lctx.inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(lctx.inp_K_shift, "K_shift", -1); - ggml_set_input(lctx.inp_K_shift); - - for (int il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - - struct ggml_tensor * rope_factors = build_rope_factors(il); - - struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_head_kv, n_ctx, - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - 0); - - struct ggml_tensor * tmp; - if (ggml_is_quantized(k->type)) { - // dequantize to f32 -> RoPE -> quantize back - tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); - cb(tmp, "K_f32", il); - - for (auto & backend : lctx.backends) { - // Figure out which backend KV cache belongs to - if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { - ggml_backend_sched_set_tensor_backend(lctx.sched.get(), tmp, backend.get()); - break; - } - } - tmp = ggml_rope_ext_inplace(ctx0, tmp, - lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(tmp, "K_shifted_f32", il); - - tmp = ggml_cpy(ctx0, tmp, k); - } else { - // we rotate only the first n_rot dimensions - tmp = ggml_rope_ext_inplace(ctx0, k, - lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - } - cb(tmp, "K_shifted", il); - - ggml_build_forward_expand(gf, tmp); - } + lctx.build_k_shift(ctx0, gf); return gf; } - struct ggml_cgraph * build_defrag(const std::vector & ids) { + struct ggml_cgraph * build_defrag() { struct ggml_cgraph * gf = 
ggml_new_graph_custom(ctx0, model.max_nodes(), false); - for (uint32_t i = 0; i < ids.size(); ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == ids.size()) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < ids.size() && ids[i + nm] == id + nm) { - nm++; - } - - for (int il = 0; il < n_layer; ++il) { - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); - - ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); - - ggml_tensor * view_v_src; - ggml_tensor * view_v_dst; - - if (flash_attn) { - // NOTE: the V cache is not transposed when using flash attention - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); - - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); - } else { - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, i)); - - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, id)); - } - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); - } - - i += nm - 1; - } - - //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); + lctx.build_defrag(ctx0, gf); return gf; } @@ -1294,21 +823,6 @@ struct llm_build_context { return lctx.inp_pos; } - struct ggml_tensor * build_rope_factors(int il) { - // choose long/short freq factors based on the context size - const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; - - if (model.layers[il].rope_freqs != nullptr) { - return model.layers[il].rope_freqs; - } - - if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { - return model.layers[il].rope_long; - } - - return model.layers[il].rope_short; - } - struct ggml_tensor * build_inp_out_ids() { lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); cb(lctx.inp_out_ids, "inp_out_ids", -1); @@ -1316,28 +830,6 @@ struct llm_build_context { return lctx.inp_out_ids; } - struct ggml_tensor * build_inp_KQ_mask(bool causal = true) { - lctx.inp_KQ_mask = causal - ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) - : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - cb(lctx.inp_KQ_mask, "KQ_mask", -1); - ggml_set_input(lctx.inp_KQ_mask); - - return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask; - } - - struct ggml_tensor * build_inp_KQ_mask_swa(bool causal = true) { - GGML_ASSERT(hparams.n_swa > 0); - - lctx.inp_KQ_mask_swa = causal - ? 
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) - : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - cb(lctx.inp_KQ_mask_swa, "KQ_mask_swa", -1); - ggml_set_input(lctx.inp_KQ_mask_swa); - - return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask_swa, GGML_TYPE_F16) : lctx.inp_KQ_mask_swa; - } - struct ggml_tensor * build_inp_mean() { lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); cb(lctx.inp_mean, "inp_mean", -1); @@ -1352,20 +844,6 @@ struct llm_build_context { return lctx.inp_cls; } - struct ggml_tensor * build_inp_s_copy() { - lctx.inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); - cb(lctx.inp_s_copy, "inp_s_copy", -1); - ggml_set_input(lctx.inp_s_copy); - return lctx.inp_s_copy; - } - - struct ggml_tensor * build_inp_s_mask() { - lctx.inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); - cb(lctx.inp_s_mask, "inp_s_mask", -1); - ggml_set_input(lctx.inp_s_mask); - return lctx.inp_s_mask; - } - struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { // find result_norm tensor for input struct ggml_tensor * inp = nullptr; @@ -1431,37 +909,37 @@ struct llm_build_context { return gf; } - struct ggml_tensor * build_pos_bucket(bool causal) { - if (causal) { - lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); - } else { - lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); - } + //struct ggml_tensor * build_pos_bucket(bool causal) { + // if (causal) { + // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); + // } else { + // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); + // } - ggml_set_input(lctx.inp_pos_bucket); - cb(lctx.inp_pos_bucket, "pos_bucket", -1); + // ggml_set_input(lctx.inp_pos_bucket); + // cb(lctx.inp_pos_bucket, "pos_bucket", -1); - return lctx.inp_pos_bucket; - } + // return lctx.inp_pos_bucket; + //} - struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { - struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); - cb(pos_bucket_1d, "pos_bucket_1d", -1); + //struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { + // struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); + // cb(pos_bucket_1d, "pos_bucket_1d", -1); - struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); - cb(pos_bias, "pos_bias", -1); + // struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); + // cb(pos_bias, "pos_bias", -1); - pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0); - cb(pos_bias, "pos_bias", -1); + // pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0); + // cb(pos_bias, "pos_bias", -1); - pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); - cb(pos_bias, "pos_bias", -1); + // pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); + // cb(pos_bias, "pos_bias", -1); - pos_bias = ggml_cont(ctx0, pos_bias); - 
cb(pos_bias, "pos_bias", -1); + // pos_bias = ggml_cont(ctx0, pos_bias); + // cb(pos_bias, "pos_bias", -1); - return pos_bias; - } + // return pos_bias; + //} struct ggml_tensor * build_inp_embd_enc() { const int64_t n_embd = hparams.n_embd; @@ -1491,45 +969,44 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -1550,9 +1027,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -1574,12 +1051,12 @@ struct llm_build_context { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -1588,12 +1065,12 @@ struct llm_build_context { cb(cur, "ffn_out", il); } else { // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -1615,7 +1092,7 @@ struct 
llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1624,13 +1101,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // For Granite architecture if (hparams.f_logit_scale) { @@ -1657,13 +1134,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -1676,37 +1152,37 @@ struct llm_build_context { cur = inpL; } else { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); } if (n_head > 0 && n_head_kv == 0) { // "linear attention" of Llama-3_1-Nemotron-51B - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cur = build_lora_mm(model.layers[il].wo, cur); cb(cur, "wo", il); } else if (n_head > 0) { // self-attention // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -1727,9 +1203,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -1754,12 +1230,12 @@ struct llm_build_context { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, 
model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -1776,7 +1252,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1785,13 +1261,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // For Granite architecture if (hparams.f_logit_scale) { @@ -1815,31 +1291,30 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr; - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); switch (model.type) { @@ -1865,9 +1340,9 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -1882,12 +1357,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -1897,7 +1372,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1906,13 +1381,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); 
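        // illustrative summary (not part of the original code): this epilogue is shared by the
        // builders in this refactor - final output norm, then the lm_head projection through
        // build_lora_mm(model.output, cur), which folds any active LoRA adapters into the head:
        //
        //     res = W*cur + sum_i scale_i * B_i*(A_i*cur)
        //
        // with scale_i derived from the adapter scale and alpha/rank (see build_lora_mm and
        // build_lora_mm_id above), before the result is expanded into the graph.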
ggml_build_forward_expand(gf, cur); @@ -1930,31 +1405,30 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -1970,9 +1444,9 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -1987,12 +1461,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2002,7 +1476,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2011,11 +1485,11 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1); + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2034,37 +1508,36 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; - attn_norm = llm_build_norm(ctx0, inpL, hparams, + attn_norm = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - 
LLM_NORM, cb, il); + LLM_NORM, il); cb(attn_norm, "attn_norm", il); // self-attention { if (model.layers[il].attn_norm_2) { // Falcon-40B - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm_2", il); } else { cur = attn_norm; } - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); @@ -2091,9 +1564,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2108,7 +1581,7 @@ struct llm_build_context { // feed forward { - cur = llm_build_ffn(ctx0, lctx, attn_norm, // !! use the attn norm, not the result + cur = build_ffn(attn_norm, // !! use the attn norm, not the result model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2119,7 +1592,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2129,13 +1602,13 @@ struct llm_build_context { cur = inpL; // norm - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2156,7 +1629,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // multiply by embedding_multiplier_scale of 78.38367176906169 inpL = ggml_scale(ctx0, inpL, 78.38367176906169f); @@ -2164,37 +1637,36 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, 
model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -2215,9 +1687,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -2231,9 +1703,9 @@ struct llm_build_context { // Grok // if attn_out_norm is present then apply it before adding the input if (model.layers[il].attn_out_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_out_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_out_norm", il); } @@ -2242,12 +1714,12 @@ struct llm_build_context { // feed-forward network // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -2264,16 +1736,16 @@ struct llm_build_context { // if layer_out_norm is present then apply it before adding the input // Idea: maybe ffn_out_norm is a better name if (model.layers[il].layer_out_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].layer_out_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "layer_out_norm", il); } cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2282,13 +1754,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // Grok // multiply logits by output_multiplier_scale of 0.5773502691896257 @@ -2316,21 +1788,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM, cb, il); + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention @@ -2339,7 +1810,7 @@ struct llm_build_context { struct ggml_tensor * Kcur = nullptr; struct ggml_tensor * Vcur = nullptr; - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -2367,9 +1838,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = 
build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2385,12 +1856,12 @@ struct llm_build_context { // feed-forward network // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].attn_out_norm, NULL, - LLM_NORM, cb, il); + cur = build_norm(ffn_inp, + model.layers[il].attn_out_norm, NULL, + LLM_NORM, il); cb(cur, "attn_out_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -2406,7 +1877,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2415,13 +1886,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM, cb, -1); + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); @@ -2440,13 +1911,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -2455,15 +1925,15 @@ struct llm_build_context { cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -2479,9 +1949,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2497,13 +1967,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -2513,20 +1983,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = 
llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2543,28 +2013,27 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); @@ -2573,9 +2042,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2590,12 +2059,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2605,7 +2074,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2614,13 +2083,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2645,7 +2114,7 @@ struct llm_build_context { } // construct input embeddings (token, type, position) - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // token types are hardcoded to zero ("Sentence A") struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); @@ -2656,11 +2125,10 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // embed layer norm 
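        // illustrative note: LLM_NORM here is the standard LayerNorm used by the BERT-style
        // graphs, i.e. via build_norm:
        //
        //     y = (x - mean(x)) / sqrt(var(x) + f_norm_eps) * tok_norm + tok_norm_b
        //
        // in contrast to the RMS variant (LLM_NORM_RMS) used by the llama-style builders.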
- inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); cb(inpL, "inp_norm", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false); + lctx.build_attn_inp(ctx0, n_tokens, false, false, worst_case); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -2672,33 +2140,33 @@ struct llm_build_context { // self-attention if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { - Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq); + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq); cb(Qcur, "Qcur", il); if (model.layers[il].attn_q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); } - Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur), model.layers[il].bk); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk); cb(Kcur, "Kcur", il); if (model.layers[il].attn_k_norm) { - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); } - Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur), model.layers[il].bv); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv); cb(Vcur, "Vcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); } else { // compute Q and K and RoPE them - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); @@ -2730,7 +2198,8 @@ struct llm_build_context { struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); cb(kq, "kq", il); - kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); + //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); + kq = lctx.build_soft_max_ext(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); cb(kq, "kq_soft_max_ext", il); struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); @@ -2747,7 +2216,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cur = build_lora_mm(model.layers[il].wo, cur); if (model.layers[il].bo) { cb(cur, "kqv_wo", il); } @@ -2768,11 +2237,11 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); // attention layer norm - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il); + cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il); if (model.layers[il].attn_norm_2 != nullptr) { cur = ggml_add(ctx0, cur, inpL); // re-add the layer input - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il); + cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il); } struct 
ggml_tensor * ffn_inp = cur; @@ -2780,21 +2249,21 @@ struct llm_build_context { // feed-forward network if (model.arch == LLM_ARCH_BERT) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, cb, il); } else { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2807,7 +2276,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); // output layer norm - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il); + cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il); // input for next layer inpL = cur; @@ -2832,27 +2301,26 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - inpL = llm_build_norm(ctx0, inpL, hparams, + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(inpL, "inp_norm", -1); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -2868,9 +2336,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2886,13 +2354,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -2902,20 +2370,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = 
llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2934,10 +2402,9 @@ struct llm_build_context { struct ggml_tensor * pos; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); if (model.pos_embd) { // inp_pos - contains the positions @@ -2952,17 +2419,17 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; - attn_norm = llm_build_norm(ctx0, inpL, hparams, + attn_norm = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(attn_norm, "attn_norm", il); // self-attention { cur = attn_norm; - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); if (model.layers[il].bqkv){ @@ -2985,30 +2452,30 @@ struct llm_build_context { // Q/K Layernorm if (model.layers[il].attn_q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } } @@ -3025,12 +2492,12 @@ struct llm_build_context { // feed forward { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -3040,7 +2507,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3049,13 +2516,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3072,22 +2539,21 @@ struct llm_build_context { 
struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); struct ggml_tensor * inpSA = cur; @@ -3095,21 +2561,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -3122,17 +2588,17 @@ struct llm_build_context { cb(Kcur, "Kcur", il); if (model.layers[il].attn_q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); } if (model.layers[il].attn_k_norm) { - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); } @@ -3151,9 +2617,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3170,16 +2636,16 @@ struct llm_build_context { // feed-forward network { if (model.layers[il].ffn_norm) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); } else { // parallel residual cur = inpSA; } - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3189,7 +2655,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3198,14 +2664,14 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = 
build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3222,25 +2688,24 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -3270,9 +2735,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3287,12 +2752,12 @@ struct llm_build_context { // feed-forward forward { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3302,7 +2767,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3311,13 +2776,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3335,37 +2800,36 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = 
llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -3384,9 +2848,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3400,12 +2864,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3414,7 +2878,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3423,13 +2887,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3446,7 +2910,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4); @@ -3454,8 +2918,8 @@ struct llm_build_context { ggml_set_input(lctx.inp_pos); struct ggml_tensor * inp_pos = lctx.inp_pos; - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -3463,25 +2927,25 @@ struct llm_build_context { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * 
Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -3502,9 +2966,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3518,12 +2982,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3532,7 +2996,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3541,13 +3005,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3568,37 +3032,36 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -3617,9 +3080,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, 
Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3634,13 +3097,13 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, lctx, cur, + build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -3655,14 +3118,14 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * cur_gate_inp = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_gate_inp_shexp, cur); + ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur); cb(cur_gate_inp, "ffn_shexp_gate_inp", il); // sigmoid ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); cb(cur_gate, "ffn_shexp_gate", il); - ggml_tensor * cur_ffn = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * cur_ffn = build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -3680,7 +3143,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3689,13 +3152,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3715,19 +3178,18 @@ struct llm_build_context { struct ggml_tensor * ffn_output; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - attn_norm_output = llm_build_norm(ctx0, inpL, hparams, + attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(attn_norm_output, "attn_norm", il); // self-attention @@ -3737,7 +3199,7 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output); + cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -3747,9 +3209,9 @@ struct llm_build_context { Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); } else { - Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, 
attn_norm_output), model.layers[il].bv); + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); } cb(Qcur, "Qcur", il); @@ -3776,9 +3238,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -3791,7 +3253,7 @@ struct llm_build_context { // FF { - ffn_output = llm_build_ffn(ctx0, lctx, attn_norm_output, + ffn_output = build_ffn(attn_norm_output, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -3802,20 +3264,20 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_output); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output_no_bias", -1); cur = ggml_add(ctx0, cur, model.output_b); @@ -3834,19 +3296,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = nullptr; - if (hparams.n_swa == 0) { - // Phi-4 doesn't use sliding window attention - KQ_mask = build_inp_KQ_mask(); - } else { - KQ_mask = build_inp_KQ_mask_swa(); - } + lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { auto residual = inpL; @@ -3854,12 +3310,12 @@ struct llm_build_context { // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); - struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams, + struct ggml_tensor* attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(attn_norm_output, "attn_norm", il); struct ggml_tensor * Qcur = nullptr; @@ -3867,16 +3323,16 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output); + cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); } else { - Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, 
model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); } cb(Qcur, "Qcur", il); @@ -3901,9 +3357,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -3916,14 +3372,14 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, residual); residual = cur; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3932,7 +3388,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); } else { // MoE branch - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -3947,20 +3403,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, residual, cur); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); if (model.output_b != nullptr) { cb(cur, "result_output_no_bias", -1); @@ -3984,20 +3440,19 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); struct ggml_tensor * attention_norm = cur; @@ -4005,13 +3460,13 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct 
ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -4026,9 +3481,9 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } struct ggml_tensor * sa_out = cur; @@ -4044,7 +3499,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4055,7 +3510,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4064,13 +3519,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4089,13 +3544,12 @@ struct llm_build_context { struct ggml_tensor * pos; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -4104,15 +3558,15 @@ struct llm_build_context { cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -4128,9 +3582,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4146,13 +3600,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -4162,20 +3616,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, 
ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4194,24 +3648,23 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -4239,9 +3692,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4257,13 +3710,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -4273,20 +3726,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4304,41 +3757,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + 
LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); // if (model.layers[il].bq) { // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); // cb(Qcur, "Qcur", il); // } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); // if (model.layers[il].bk) { // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); // cb(Kcur, "Kcur", il); // } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); // if (model.layers[il].bv) { // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -4359,9 +3811,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4375,12 +3827,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4389,7 +3841,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4398,13 +3850,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4422,41 +3874,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = 
llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -4477,9 +3928,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4493,12 +3944,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4507,7 +3958,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4516,13 +3967,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4546,7 +3997,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // scale the input embeddings inpL = ggml_scale(ctx0, inpL, scale_embd); @@ -4555,17 +4006,16 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention @@ -4575,9 +4025,9 @@ struct llm_build_context { q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); cb(q, "q", il); - q = llm_build_norm(ctx0, q, hparams, + q = build_norm(q, model.layers[il].attn_q_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(q, "q", il); // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} @@ -4616,9 +4066,9 @@ struct llm_build_context { cb(k_pe, "k_pe", il); kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not 
support non-contiguous norm - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + kv_compressed = build_norm(kv_compressed, model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(kv_compressed, "kv_compressed", il); // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} @@ -4670,9 +4120,9 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + k_states, v_states, q_states, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -4692,12 +4142,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4711,7 +4161,7 @@ struct llm_build_context { cb(cur, "hidden_scaled_ffn", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4720,9 +4170,9 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head scaling @@ -4731,7 +4181,7 @@ struct llm_build_context { cb(cur, "lmhead_scaling", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4747,7 +4197,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); cb(inpL, "inp_scaled", -1); @@ -4755,26 +4205,25 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -4792,9 +4241,9 @@ struct llm_build_context { ext_factor, 
attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -4807,14 +4256,14 @@ struct llm_build_context { struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); - cur = llm_build_norm(ctx0, sa_out, hparams, + cur = build_norm(sa_out, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4824,7 +4273,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, sa_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4833,13 +4282,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4855,7 +4304,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); cb(inpL, "inp_scaled", -1); @@ -4863,31 +4312,25 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - // gemma 2 requires different mask for layers using sliding window (SWA) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(true); - struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true); + lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { - // (il % 2) layers use SWA - struct ggml_tensor * KQ_mask_l = (il % 2 == 0) ? 
KQ_mask_swa : KQ_mask; - // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -4911,14 +4354,14 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); if (il == n_layer - 1) { @@ -4931,14 +4374,14 @@ struct llm_build_context { struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); - cur = llm_build_norm(ctx0, sa_out, hparams, + cur = build_norm(sa_out, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4947,13 +4390,13 @@ struct llm_build_context { cb(cur, "ffn_out", il); } - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "ffn_post_norm", -1); cur = ggml_add(ctx0, cur, sa_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4962,13 +4405,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // final logit soft-capping cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); @@ -4993,41 +4436,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE 
them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -5048,9 +4490,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5065,12 +4507,12 @@ struct llm_build_context { // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -5079,7 +4521,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5088,13 +4530,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5109,21 +4551,20 @@ struct llm_build_context { struct ggml_tensor * inpL; // {n_embd, n_tokens} - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = build_inp_s_copy(); - struct ggml_tensor * state_mask = build_inp_s_mask(); + struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); - cur = llm_build_mamba(ctx0, lctx, ubatch, gf, cur, - state_copy, state_mask, - kv_head, n_kv, cb, il); + //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il); + cur = lctx.build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); if (il == n_layer - 1) { // skip computing output for unused tokens @@ -5142,13 +4583,13 @@ struct llm_build_context { } // final rmsnorm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + 
LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5167,41 +4608,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); struct ggml_tensor * ffn_inp = cur; // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -5220,16 +4660,16 @@ struct llm_build_context { 0); cb(Kcur, "Kcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); } @@ -5247,9 +4687,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5264,7 +4704,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, ffn_inp, + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5276,7 +4716,7 @@ struct llm_build_context { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5285,13 +4725,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); if (f_logit_scale) { cur = 
ggml_scale(ctx0, cur, f_logit_scale); @@ -5315,15 +4755,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - // cohere2 requires different mask for layers using sliding window (SWA) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(); + lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); // sliding window switch pattern const int32_t sliding_window_pattern = 4; @@ -5331,35 +4768,34 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { // three layers sliding window attention (window size 4096) and ROPE // fourth layer uses global attention without positional embeddings - const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); - struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask; + const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); // norm - cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM, cb, il); + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il); cb(cur, "attn_norm", il); struct ggml_tensor * ffn_inp = cur; // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -5385,8 +4821,8 @@ struct llm_build_context { cb(Kcur, "Kcur", il); } - cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, - KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, + n_tokens, 1.0f / sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5401,7 +4837,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -5410,7 +4846,7 @@ struct llm_build_context { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); 
cb(cur, "l_out", il); // input for next layer @@ -5419,11 +4855,11 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM, cb, -1); + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); if (f_logit_scale) { cur = ggml_scale(ctx0, cur, f_logit_scale); @@ -5455,41 +4891,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, NULL, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (hparams.f_clamp_kqv > 0.0f) { Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (hparams.f_clamp_kqv > 0.0f) { Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (hparams.f_clamp_kqv > 0.0f) { Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -5510,9 +4945,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5527,12 +4962,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, NULL, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5543,7 +4978,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5552,13 +4987,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, NULL, NULL, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); 
ggml_build_forward_expand(gf, cur); @@ -5579,13 +5014,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5595,21 +5029,21 @@ struct llm_build_context { // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, cb, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, cb, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); @@ -5629,14 +5063,14 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); if (il == n_layer - 1) { @@ -5651,7 +5085,7 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_ffn(ctx0, lctx, ffn_inp, + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5659,15 +5093,15 @@ struct llm_build_context { LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "ffn_post_norm", -1); cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5676,13 +5110,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5707,41 +5141,40 @@ struct llm_build_context { struct ggml_tensor * cur; 
struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, cb, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, cb, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); @@ -5761,9 +5194,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5778,12 +5211,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -5797,7 +5230,7 @@ struct llm_build_context { cb(cur, "ffn_moe_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5806,13 +5239,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5828,13 +5261,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 
head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -5845,14 +5277,14 @@ struct llm_build_context { struct ggml_tensor * residual = cur; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); @@ -5866,14 +5298,14 @@ struct llm_build_context { struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); cb(Vcur, "Vcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(Qcur, "Qcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(Kcur, "Kcur", il); Qcur = ggml_rope_ext( @@ -5891,9 +5323,9 @@ struct llm_build_context { Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); cb(Qcur, "Vcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5908,12 +5340,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5923,7 +5355,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); inpL = cur; @@ -5932,12 +5364,12 @@ struct llm_build_context { cur = inpL; // norm - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5955,24 +5387,23 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // 
self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -6000,9 +5431,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -6019,13 +5450,13 @@ struct llm_build_context { struct ggml_tensor * attn_out = cur; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -6037,7 +5468,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6050,13 +5481,13 @@ struct llm_build_context { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -6065,7 +5496,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6073,13 +5504,13 @@ struct llm_build_context { } } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -6100,33 +5531,32 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * 
Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -6143,9 +5573,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -6160,12 +5590,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6177,12 +5607,12 @@ struct llm_build_context { cb(ffn_out, "ffn_out", il); // MoE - cur = llm_build_norm(ctx0, inpSA, hparams, + cur = build_norm(inpSA, model.layers[il].ffn_norm_exps, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm_exps", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -6198,7 +5628,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_out); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6207,13 +5637,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -6234,44 +5664,45 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -6292,9 +5723,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -6309,13 +5740,13 @@ struct llm_build_context { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6325,7 +5756,7 @@ struct llm_build_context { } else { // MoE branch ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, lctx, cur, + build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -6340,7 +5771,7 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -6354,7 +5785,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6363,13 +5794,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); @@ -6400,21 +5831,20 @@ struct llm_build_context { struct ggml_tensor * inpL; // {n_embd, n_tokens} - inpL = llm_build_inp_embd(ctx0, lctx, 
hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention @@ -6425,9 +5855,9 @@ struct llm_build_context { q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); cb(q, "q", il); - q = llm_build_norm(ctx0, q, hparams, + q = build_norm(q, model.layers[il].attn_q_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(q, "q", il); // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} @@ -6470,9 +5900,9 @@ struct llm_build_context { cb(k_pe, "k_pe", il); kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + kv_compressed = build_norm(kv_compressed, model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(kv_compressed, "kv_compressed", il); // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} @@ -6524,9 +5954,9 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + k_states, v_states, q_states, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -6540,13 +5970,13 @@ struct llm_build_context { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6556,7 +5986,7 @@ struct llm_build_context { } else { // MoE branch ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, lctx, cur, + build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -6571,7 +6001,7 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -6585,7 +6015,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6594,9 +6024,9 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, 
"result_norm", -1); // lm_head @@ -6617,26 +6047,25 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); if (model.layers[il].wq_scale) { Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale); } @@ -6647,7 +6076,7 @@ struct llm_build_context { } // B1.K - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); if (model.layers[il].wk_scale) { Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale); } @@ -6658,7 +6087,7 @@ struct llm_build_context { } // B1.V - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); if (model.layers[il].wv_scale) { Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale); } @@ -6682,16 +6111,16 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, NULL, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_sub_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_sub_norm", il); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cur = build_lora_mm(model.layers[il].wo, cur); if (model.layers[il].wo_scale) { cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale); } @@ -6712,12 +6141,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward forward - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, NULL, NULL, NULL, @@ -6725,12 +6154,12 @@ struct llm_build_context { LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_sub_out", il); - cur = llm_build_norm(ctx0, cur, hparams, - model.layers[il].ffn_sub_norm, NULL, - LLM_NORM_RMS, cb, il); + cur = build_norm(cur, + model.layers[il].ffn_sub_norm, NULL, + LLM_NORM_RMS, il); cb(cur, "ffn_sub_norm", il); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur); + cur = build_lora_mm(model.layers[il].ffn_down, cur); if (model.layers[il].ffn_down_scale) { cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale); } @@ -6745,356 +6174,356 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, 
hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head // FIXME: do not use model.tok_embd directly, duplicate as model.output - cur = llm_build_lora_mm(lctx, ctx0, model.tok_embd, cur); + cur = build_lora_mm(model.tok_embd, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); return gf; } - struct ggml_cgraph * build_t5_enc() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + //struct ggml_cgraph * build_t5_enc() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; + // // mutable variable, needed during the last layer of the computation to skip unused tokens + // int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + // const int64_t n_embd_head = hparams.n_embd_head_v; + // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + // inpL = build_inp_embd(model.tok_embd); - GGML_ASSERT(lctx.is_encoding); - struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); + // GGML_ASSERT(lctx.is_encoding); + // struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); + // // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + // struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + // for (int il = 0; il < n_layer; ++il) { + // struct ggml_tensor * inpSA = inpL; - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + // // norm + // cur = build_norm(inpL, + // model.layers[il].attn_norm_enc, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm", il); - // self-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur); - cb(Qcur, "Qcur", il); + // // self-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur); + // cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur); - cb(Kcur, "Kcur", il); + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur); + // cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur); - cb(Vcur, "Vcur", il); + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur); + // cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + 
// struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); - struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; - struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); - struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); + // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; + // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); + // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + // cb(kq_b, "kq_b", il); - kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); - cb(v, "v", il); + // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); + // cb(v, "v", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); + // cb(kqv, "kqv", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); - ggml_build_forward_expand(gf, cur); + // ggml_build_forward_expand(gf, cur); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur); - cb(cur, "kqv_out", il); - } + // cur = build_lora_mm(model.layers[il].wo_enc, cur); + // cb(cur, "kqv_out", il); + // } - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } + // if (il == n_layer - 1) { + // // skip computing output for unused tokens + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // n_tokens = n_outputs; + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + // } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + // cb(ffn_inp, "ffn_inp", il); - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + // // feed-forward network + // { + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm_enc, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - 
model.layers[il].ffn_up_enc, NULL, NULL, - model.layers[il].ffn_gate_enc, NULL, NULL, - model.layers[il].ffn_down_enc, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } + // // T5 uses relu, flan-T5 uses gelu-gated + // cur = build_ffn(cur, + // model.layers[il].ffn_up_enc, NULL, NULL, + // model.layers[il].ffn_gate_enc, NULL, NULL, + // model.layers[il].ffn_down_enc, NULL, NULL, + // NULL, + // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + // model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, + // cb, il); + // cb(cur, "ffn_out", il); + // } - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); + // cur = ggml_add(ctx0, cur, ffn_inp); + // cb(cur, "ffn_out", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); + // ggml_tensor * layer_dir = cvec.tensor_for(il); + // if (layer_dir != nullptr) { + // cur = ggml_add(ctx0, cur, layer_dir); + // } + // cb(cur, "l_out", il); - // input for next layer - inpL = cur; - } + // // input for next layer + // inpL = cur; + // } - cur = inpL; - cb(cur, "result_embd", -1); + // cur = inpL; + // cb(cur, "result_embd", -1); - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm_enc, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); + // cur = build_norm(cur, + // model.output_norm_enc, NULL, + // LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); - ggml_build_forward_expand(gf, cur); + // ggml_build_forward_expand(gf, cur); - return gf; - } + // return gf; + //} - struct ggml_cgraph * build_t5_dec() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + //struct ggml_cgraph * build_t5_dec() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; + // // mutable variable, needed during the last layer of the computation to skip unused tokens + // int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + // const int64_t n_embd_head = hparams.n_embd_head_v; + // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + // inpL = build_inp_embd(model.tok_embd); - GGML_ASSERT(!lctx.is_encoding); - GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); + // GGML_ASSERT(!lctx.is_encoding); + // GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); - struct ggml_tensor * embd_enc = build_inp_embd_enc(); - struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); + // struct ggml_tensor * embd_enc = build_inp_embd_enc(); + // struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); - struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); + // struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); + // struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); - for (int il = 0; il < n_layer; ++il) { - struct 
ggml_tensor * inpSA = inpL; + // for (int il = 0; il < n_layer; ++il) { + // struct ggml_tensor * inpSA = inpL; - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + // // norm + // cur = build_norm(inpL, + // model.layers[il].attn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm", il); - // self-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); + // // self-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + // cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + // cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + // cb(Vcur, "Vcur", il); - llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); + // build_kv_store(gf, Kcur, Vcur, il); - struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - 0); - cb(k, "k", il); + // struct ggml_tensor * k = + // ggml_view_3d(ctx0, kv_self.k_l[il], + // n_embd_head_k, n_kv, n_head_kv, + // ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + // ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + // 0); + // cb(k, "k", il); - struct ggml_tensor * v = - ggml_view_3d(ctx0, kv_self.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv_self.v_l[il])*n_ctx, - ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, - 0); - cb(v, "v", il); + // struct ggml_tensor * v = + // ggml_view_3d(ctx0, kv_self.v_l[il], + // n_kv, n_embd_head_v, n_head_kv, + // ggml_element_size(kv_self.v_l[il])*n_ctx, + // ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + // 0); + // cb(v, "v", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); - struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; - struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); - struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); + // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? 
model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; + // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); + // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + // cb(kq_b, "kq_b", il); - kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - cb(kqv, "kqv", il); + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + // cb(kqv, "kqv", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); - ggml_build_forward_expand(gf, cur); + // ggml_build_forward_expand(gf, cur); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); - cb(cur, "kqv_out", il); - } + // cur = build_lora_mm(model.layers[il].wo, cur); + // cb(cur, "kqv_out", il); + // } - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "cross_inp", il); + // cur = ggml_add(ctx0, cur, inpSA); + // cb(cur, "cross_inp", il); - struct ggml_tensor * inpCA = cur; + // struct ggml_tensor * inpCA = cur; - // norm - cur = llm_build_norm(ctx0, cur, hparams, - model.layers[il].attn_norm_cross, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm_cross", il); + // // norm + // cur = build_norm(cur, + // model.layers[il].attn_norm_cross, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm_cross", il); - // cross-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur); - cb(Qcur, "Qcur", il); + // // cross-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur); + // cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc); - cb(Kcur, "Kcur", il); + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc); + // cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc); - cb(Vcur, "Vcur", il); + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc); + // cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); - kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + // kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", 
il); - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); - cb(v, "v", il); + // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); + // cb(v, "v", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); + // cb(kqv, "kqv", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); - ggml_build_forward_expand(gf, cur); + // ggml_build_forward_expand(gf, cur); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur); - cb(cur, "kqv_out", il); - } + // cur = build_lora_mm(model.layers[il].wo_cross, cur); + // cb(cur, "kqv_out", il); + // } - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); - } + // if (il == n_layer - 1) { + // // skip computing output for unused tokens + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // n_tokens = n_outputs; + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + // inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); + // } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); - cb(ffn_inp, "ffn_inp", il); + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); + // cb(ffn_inp, "ffn_inp", il); - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + // // feed-forward network + // { + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } + // // T5 uses relu, flan-T5 uses gelu-gated + // cur = build_ffn(cur, + // model.layers[il].ffn_up, NULL, NULL, + // model.layers[il].ffn_gate, NULL, NULL, + // model.layers[il].ffn_down, NULL, NULL, + // NULL, + // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + // model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, + // cb, il); + // cb(cur, "ffn_out", il); + // } - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); + // cur = ggml_add(ctx0, cur, ffn_inp); + // cb(cur, "ffn_out", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); + // ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + // if (layer_dir != nullptr) { + // cur = ggml_add(ctx0, cur, layer_dir); + // } + // cb(cur, "l_out", il); - // input for next layer - inpL = cur; - } + // // input for next layer + // inpL = cur; + // } - cur = inpL; - cb(cur, "result_embd", -1); + // cur = inpL; + // cb(cur, "result_embd", -1); - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); + // cur = build_norm(cur, + // model.output_norm, NULL, + // LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); + + // // lm_head + // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); + + // ggml_build_forward_expand(gf, cur); - return gf; - } + // return gf; + //} struct ggml_cgraph * build_jais() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -7106,21 +6535,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -7136,9 +6564,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/float(n_embd_head), cb, il); } if (il == n_layer - 1) { @@ -7154,13 +6582,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -7173,13 +6601,13 @@ struct llm_build_context { cb(inpL, "l_out", il); } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); @@ -7198,21 +6626,20 
@@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention @@ -7221,7 +6648,7 @@ struct llm_build_context { struct ggml_tensor * Kcur = nullptr; struct ggml_tensor * Vcur = nullptr; - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -7249,9 +6676,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -7268,13 +6695,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7288,13 +6715,13 @@ struct llm_build_context { cb(inpL, "l_out", il); } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -7312,42 +6739,41 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); 
cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -7368,9 +6794,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -7384,13 +6810,13 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -7409,13 +6835,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -7436,44 +6862,43 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -7494,9 +6919,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, 
model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -7511,12 +6936,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7536,13 +6961,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -7550,230 +6975,232 @@ struct llm_build_context { return gf; } - ggml_cgraph * build_rwkv6() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + //ggml_cgraph * build_rwkv6() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // Token shift state dimensions should be 2 * n_emb - GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); + // // Token shift state dimensions should be 2 * n_emb + // GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs); - GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + // const int64_t n_seqs = ubatch.n_seqs; + // const int64_t n_seq_tokens = ubatch.n_seq_tokens; + // const int64_t n_tokens = ubatch.n_tokens; + // GGML_ASSERT(n_seqs != 0); + // GGML_ASSERT(ubatch.equal_seqs); + // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - struct ggml_tensor * state_copy = build_inp_s_copy(); - struct ggml_tensor * state_mask = build_inp_s_mask(); + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; + // struct ggml_tensor * state_copy = build_inp_s_copy(); + // struct ggml_tensor * state_mask = build_inp_s_mask(); - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); - inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); + // inpL = build_inp_embd(model.tok_embd); + // inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - for (int il = 0; il < n_layer; ++il) { - const llama_layer * layer = &model.layers[il]; + // for (int il = 0; il < n_layer; ++il) { + // const llama_layer * layer = &model.layers[il]; - // (ab)using the KV cache to store the states - struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0, - gf, kv_self.k_l[il], state_copy, state_mask, - hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs); - struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0, - gf, kv_self.v_l[il], state_copy, state_mask, - hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs); + // // (ab)using the KV cache to store the states + // struct ggml_tensor * token_shift = build_copy_mask_state( + // gf, kv_self.k_l[il], state_copy, state_mask, + // 
hparams.n_embd_k_s(), n_seqs); - cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs); + // struct ggml_tensor * wkv_states = build_copy_mask_state( + // gf, kv_self.v_l[il], state_copy, state_mask, + // hparams.n_embd_v_s(), n_seqs); - struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); - struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); + // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); + // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs); - struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il); - struct ggml_tensor * x_prev = ggml_concat( - ctx0, - att_shift, - ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - 1 - ); + // struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); + // struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); - cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size)); - ggml_build_forward_expand(gf, cur); - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - wkv_states, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) - ) - ); + // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); + // struct ggml_tensor * x_prev = ggml_concat( + // ctx0, + // att_shift, + // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), + // 1 + // ); - struct ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il); - x_prev = ggml_concat( - ctx0, - ffn_shift, - ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0), - 1 - ); - cur = ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev)); - ggml_build_forward_expand(gf, cur); + // cur = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size)); + // ggml_build_forward_expand(gf, cur); + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // wkv_states, + // ggml_view_1d( + // ctx0, + // kv_self.v_l[il], + // hparams.n_embd_v_s() * n_seqs, + // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + // ) + // ) + // ); - struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); - struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn)); + // struct ggml_tensor * x_norm_ffn = build_norm(cur, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); + // x_prev = ggml_concat( + // ctx0, + // ffn_shift, + // 
ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0), + // 1 + // ); + // cur = ggml_add(ctx0, cur, build_rwkv6_channel_mix(layer, x_norm_ffn, x_prev)); + // ggml_build_forward_expand(gf, cur); - token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1); + // struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); + // struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn)); - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0), - ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) - ) - ); + // token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1); - if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { - cur = ggml_scale(ctx0, cur, 0.5F); - } + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0), + // ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) + // ) + // ); - cur = lctx.cvec.apply_to(ctx0, cur, il); - cb(cur, "l_out", il); + // if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { + // cur = ggml_scale(ctx0, cur, 0.5F); + // } - // input for next layer - inpL = cur; - } + // cur = lctx.cvec.apply_to(ctx0, cur, il); + // cb(cur, "l_out", il); - cur = inpL; - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // // input for next layer + // inpL = cur; + // } - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); - cb(cur, "result_norm", -1); + // cur = inpL; + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output", -1); + // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + // cb(cur, "result_norm", -1); - ggml_build_forward_expand(gf, cur); + // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); - return gf; - } + // ggml_build_forward_expand(gf, cur); + + // return gf; + //} // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py - ggml_cgraph * build_rwkv6qwen2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + //ggml_cgraph * build_rwkv6qwen2() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - GGML_ASSERT(n_embd == hparams.n_embd_k_s()); + // GGML_ASSERT(n_embd == hparams.n_embd_k_s()); - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs); - GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + // const int64_t n_seqs = ubatch.n_seqs; + // const int64_t n_seq_tokens = ubatch.n_seq_tokens; + // const int64_t n_tokens = 
ubatch.n_tokens; + // GGML_ASSERT(n_seqs != 0); + // GGML_ASSERT(ubatch.equal_seqs); + // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - struct ggml_tensor * state_copy = build_inp_s_copy(); - struct ggml_tensor * state_mask = build_inp_s_mask(); + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; + // struct ggml_tensor * state_copy = build_inp_s_copy(); + // struct ggml_tensor * state_mask = build_inp_s_mask(); - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + // inpL = build_inp_embd(model.tok_embd); - for (int il = 0; il < n_layer; ++il) { - const llama_layer * layer = &model.layers[il]; + // for (int il = 0; il < n_layer; ++il) { + // const llama_layer * layer = &model.layers[il]; - // (ab)using the KV cache to store the states - struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0, - gf, kv_self.k_l[il], state_copy, state_mask, - hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs); - struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0, - gf, kv_self.v_l[il], state_copy, state_mask, - hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs); + // // (ab)using the KV cache to store the states + // struct ggml_tensor * token_shift = build_copy_mask_state( + // gf, kv_self.k_l[il], state_copy, state_mask, + // hparams.n_embd_k_s(), n_seqs); - cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs); + // struct ggml_tensor * wkv_states = build_copy_mask_state( + // gf, kv_self.v_l[il], state_copy, state_mask, + // hparams.n_embd_v_s(), n_seqs); - struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, cb, il); - struct ggml_tensor * x_prev = ggml_concat( - ctx0, - token_shift, - ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - 1 - ); + // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); + // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs); - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - wkv_states, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) - ) - ); + // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); + // struct ggml_tensor * x_prev = ggml_concat( + // ctx0, + // token_shift, + // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), + // 1 + // ); - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv())); - ggml_build_forward_expand(gf, ffn_inp); - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - wkv_states, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) - ) - ); + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // wkv_states, + // ggml_view_1d( + // ctx0, + // kv_self.v_l[il], + // hparams.n_embd_v_s() * n_seqs, + // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + // ) + // ) + // ); - cb(ffn_inp, "ffn_inp", il); + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, 
hparams.wkv_head_size, hparams.n_head_kv())); + // ggml_build_forward_expand(gf, ffn_inp); + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // wkv_states, + // ggml_view_1d( + // ctx0, + // kv_self.v_l[il], + // hparams.n_embd_v_s() * n_seqs, + // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + // ) + // ) + // ); - // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + // cb(ffn_inp, "ffn_inp", il); - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); + // // feed-forward network + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); - cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); - cb(cur, "l_out", il); + // cur = build_ffn(cur, + // model.layers[il].ffn_up, NULL, NULL, + // model.layers[il].ffn_gate, NULL, NULL, + // model.layers[il].ffn_down, NULL, NULL, + // NULL, + // LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + // cb(cur, "ffn_out", il); - // input for next layer - inpL = cur; - } + // cur = ggml_add(ctx0, cur, ffn_inp); + // cur = lctx.cvec.apply_to(ctx0, cur, il); + // cb(cur, "l_out", il); - cur = inpL; - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // // input for next layer + // inpL = cur; + // } - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); + // cur = inpL; + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output", -1); + // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); - ggml_build_forward_expand(gf, cur); + // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); - return gf; - } + // ggml_build_forward_expand(gf, cur); + + // return gf; + //} // ref: https://github.com/facebookresearch/chameleon // based on the original build_llama() function, changes: @@ -7794,13 +7221,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7809,22 +7235,22 @@ struct llm_build_context { if (hparams.swin_norm) { cur = inpL; } else { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); } // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); 
cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].attn_q_norm) { @@ -7834,10 +7260,10 @@ struct llm_build_context { 0); cb(Qcur, "Qcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); } @@ -7848,10 +7274,10 @@ struct llm_build_context { 0); cb(Kcur, "Kcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); } @@ -7869,14 +7295,14 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); if (hparams.swin_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); } } @@ -7893,13 +7319,13 @@ struct llm_build_context { // feed-forward network if (!hparams.swin_norm) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); } - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7908,9 +7334,9 @@ struct llm_build_context { cb(cur, "ffn_out", il); if (hparams.swin_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); } @@ -7926,13 +7352,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output_with_img_logits", -1); // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs. 
@@ -7959,7 +7385,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); @@ -7978,20 +7404,20 @@ struct llm_build_context { case 3: case 4: { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm1, layer.norm1_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1); cur = ggml_add(ctx0, cur, layer.conv1_b); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm2, layer.norm2_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); @@ -8002,10 +7428,10 @@ struct llm_build_context { } break; case 2: { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.attn_norm, layer.attn_norm_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); struct ggml_tensor * q; struct ggml_tensor * k; @@ -8035,10 +7461,10 @@ struct llm_build_context { } break; case 5: { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm, layer.norm_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); } break; default: GGML_ABORT("unknown posnet layer"); }; @@ -8046,10 +7472,10 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.tok_norm, model.tok_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); @@ -8066,12 +7492,12 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm, layer.norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, layer.pw1, layer.pw1_b, NULL, NULL, NULL, NULL, layer.pw2, layer.pw2_b, NULL, @@ -8089,13 +7515,13 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cur = ggml_add(ctx0, cur, model.output_b); cb(cur, "result_embd", -1); @@ -8106,7 +7532,7 @@ struct llm_build_context { } }; -static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { +static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx) { llama_ubatch dummy = {}; dummy.equal_seqs = true; @@ -8116,7 +7542,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const llm.init(); - struct ggml_cgraph * result = llm.build_defrag(ids); + struct ggml_cgraph * result = llm.build_defrag(); llm.free(); @@ -8356,18 +7782,18 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_bitnet(); } break; - case LLM_ARCH_T5: - { - if (lctx.is_encoding) { - result = llm.build_t5_enc(); - } else { - result = llm.build_t5_dec(); - } - } break; - case LLM_ARCH_T5ENCODER: - { - result = llm.build_t5_enc(); - } break; + //case LLM_ARCH_T5: + // { + // if (lctx.is_encoding) { + // result = llm.build_t5_enc(); + // } else { + // result = llm.build_t5_dec(); + // } + // } break; + //case LLM_ARCH_T5ENCODER: + // { + // result = llm.build_t5_enc(); + // } break; case 
LLM_ARCH_JAIS: { result = llm.build_jais(); @@ -8380,14 +7806,14 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_exaone(); } break; - case LLM_ARCH_RWKV6: - { - result = llm.build_rwkv6(); - } break; - case LLM_ARCH_RWKV6QWEN2: - { - result = llm.build_rwkv6qwen2(); - } break; + //case LLM_ARCH_RWKV6: + // { + // result = llm.build_rwkv6(); + // } break; + //case LLM_ARCH_RWKV6QWEN2: + // { + // result = llm.build_rwkv6qwen2(); + // } break; case LLM_ARCH_CHAMELEON: { result = llm.build_chameleon(); @@ -8543,6 +7969,7 @@ static int llama_decode_impl( } else { ubatch = lctx.sbatch.split_simple(n_ubatch); } + const uint32_t n_tokens = ubatch.n_tokens; // count the outputs in this u_batch @@ -8567,6 +7994,8 @@ static int llama_decode_impl( GGML_ASSERT(n_threads > 0); + lctx.prepare_decode(ubatch); + // non-causal masks do not use the KV cache if (hparams.causal_attn) { llama_kv_self_update(&lctx); // TODO: lctx->kv_self_update() @@ -8600,6 +8029,12 @@ static int llama_decode_impl( ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); + + lctx.set_inputs(ubatch); + // the output is always the last tensor in the graph struct ggml_tensor * res = ggml_graph_node(gf, -1); struct ggml_tensor * embd = ggml_graph_node(gf, -2); @@ -8623,12 +8058,6 @@ static int llama_decode_impl( GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); } - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - llama_set_inputs(lctx, ubatch); - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); if (compute_status != GGML_STATUS_SUCCESS) { kv_slot_restorer.restore(kv_self); @@ -8850,11 +8279,17 @@ static int llama_encode_impl( GGML_ASSERT(n_threads > 0); + lctx.prepare_decode(ubatch); + ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); + ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); + + lctx.set_inputs(ubatch); + // the output embeddings after the final encoder normalization struct ggml_tensor * embd = nullptr; @@ -8875,10 +8310,6 @@ static int llama_encode_impl( } } - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - llama_set_inputs(lctx, ubatch); - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); switch (compute_status) { case GGML_STATUS_SUCCESS: @@ -8966,227 +8397,6 @@ static int llama_encode_impl( return 0; } -// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache -static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { - auto & kv_self = lctx.kv_self; - - const auto & hparams = lctx.model.hparams; - - const uint32_t n_layer = hparams.n_layer; - - const uint32_t n_kv = kv_self.cell_max(); - const uint32_t n_used = kv_self.used; - - assert(n_used <= n_kv); - - //const int64_t t_start = ggml_time_us(); - - // number of cells moved - uint32_t n_moves = 0; - - // each move requires 6*n_layer tensors (see build_defrag) - // - source view, destination view, copy operation - // - x2 for keys and values - //const uint32_t max_moves = 
model.max_nodes()/(6*n_layer); - // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 - const uint32_t max_moves = (lctx.model.max_nodes() - 2*n_layer)/(6*n_layer); - - // determine which KV cells to move where - // - // cell i moves to ids[i] - // - // if ids[i] == i || ids[i] == n_kv, then cell i is not moved - // - std::vector ids(n_kv, n_kv); - - for (uint32_t i0 = 0; i0 < n_used; ++i0) { - const auto & cell0 = kv_self.cells[i0]; - - if (!cell0.is_empty()) { - ids[i0] = i0; - - continue; - } - - // found a hole - fill it with data from the end of the cache - - uint32_t nh = 1; - - // determine the size of the hole - while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { - nh++; - } - - uint32_t nf = 0; - uint32_t is = n_kv - 1; - - // starting from the end, find nh non-empty cells - for (; is > i0; --is) { - const auto & cell1 = kv_self.cells[is]; - - if (cell1.is_empty() || ids[is] != n_kv) { - continue; - } - - // non-empty cell which is not yet moved - nf++; - - if (nf == nh) { - break; - } - } - - // this can only happen if `n_used` is not accurate, which would be a bug - GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); - - nf = 0; - - uint32_t i1 = is; - - // are we moving a continuous block of memory? - bool cont = false; - - // should we stop searching for the next move? - bool stop = false; - - // go back and move the nf cells to the hole - for (; i1 < n_kv; ++i1) { - auto & cell1 = kv_self.cells[i1]; - - if (cell1.is_empty() || ids[i1] != n_kv) { - if (n_moves == max_moves) { - stop = true; - break; - } - - cont = false; - continue; - } - - // this cell goes to (i0 + nf) - ids[i1] = i0 + nf; - - // move the cell meta data - kv_self.cells[i0 + nf] = cell1; - - // clear the old cell and move the head there - cell1 = llama_kv_cell(); - kv_self.head = n_used; - - if (!cont) { - n_moves++; - cont = true; - } - - nf++; - - if (nf == nh) { - break; - } - } - - if (stop || n_moves == max_moves) { - break; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); - - i0 += nh - 1; - } - - if (n_moves == 0) { - return; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); - - //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); - -#if 0 - // CPU defrag - // - // TODO: optimizations are possible: - // - multiple threads - // - avoid copying to the host memory when already there - // - // likely not worth the effort, as we have ggml_graph based defrag - // - - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - - const uint32_t kv_size = kv_self.size; - - std::vector buf_k; - std::vector buf_v; - - for (uint32_t il = 0; il < n_layer; ++il) { - const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); - const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); - - const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); - const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size); - - buf_k.resize(k_size); - buf_v.resize(v_size); - - ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); - - // batch move [i, i+nm) to [id, id+nm) - // note: cells can move only to a lower index - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == n_kv) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < n_kv && 
ids[i + nm] == id + nm) { - nm++; - } - - // move keys - { - const int64_t os = i*k_size_row; - const int64_t od = id*k_size_row; - - memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); - } - - // move values (note: they are transposed) - { - const int64_t os = i; - const int64_t od = id; - - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); - } - } - - i += nm - 1; - } - - ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); - } -#else - // ggml_graph defrag - - ggml_backend_sched_reset(lctx.sched.get()); - - ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids); - - llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); -#endif - - //const int64_t t_end = ggml_time_us(); - - //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); -} - // TODO: move to llama_context static void llama_kv_self_update_impl(llama_context & lctx) { bool need_reserve = false; @@ -9200,13 +8410,15 @@ static void llama_kv_self_update_impl(llama_context & lctx) { // apply K-shift if needed if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { + lctx.prepare_k_shift(); + ggml_backend_sched_reset(lctx.sched.get()); ggml_cgraph * gf = llama_build_graph_k_shift(lctx); ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - lctx.set_k_shift(kv); + lctx.set_inputs({}); llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); @@ -9224,7 +8436,13 @@ static void llama_kv_self_update_impl(llama_context & lctx) { // defragment the KV cache if needed if (kv.do_defrag) { - llama_kv_cache_defrag_impl(lctx); + lctx.prepare_defrag(); + + ggml_backend_sched_reset(lctx.sched.get()); + + ggml_cgraph * gf = llama_build_graph_defrag(lctx); + + llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); need_reserve = true; @@ -9253,16 +8471,16 @@ int32_t llama_set_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter, float scale) { - ctx->lora[adapter] = scale; + ctx->loras[adapter] = scale; return 0; } int32_t llama_rm_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter) { - auto pos = ctx->lora.find(adapter); - if (pos != ctx->lora.end()) { - ctx->lora.erase(pos); + auto pos = ctx->loras.find(adapter); + if (pos != ctx->loras.end()) { + ctx->loras.erase(pos); return 0; } @@ -9270,7 +8488,7 @@ int32_t llama_rm_adapter_lora( } void llama_clear_adapter_lora(struct llama_context * ctx) { - ctx->lora.clear(); + ctx->loras.clear(); } int32_t llama_apply_adapter_cvec( From b4ec1d44294b628a811cc97367bb7ace0a32c9fd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 16 Jan 2025 21:55:12 +0200 Subject: [PATCH 12/28] cont : move kv_self update to llama_context ggml-ci --- src/llama-context.cpp | 119 +++++++++++++++++++++++++++ src/llama-context.h | 10 +++ src/llama.cpp | 182 +++++++----------------------------------- 3 files changed, 157 insertions(+), 154 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 910e2243d..daea125fe 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -32,6 +32,38 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +enum ggml_status llama_context::compute_graph( + ggml_cgraph * graph, + bool batched) { + int n_threads = batched ? 
cparams.n_threads_batch : cparams.n_threads; + ggml_threadpool_t tp = batched ? threadpool_batch : threadpool; + + if (backend_cpu != nullptr) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); + auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); + set_threadpool_fn(backend_cpu, tp); + } + + // set the number of threads for all the backends + for (const auto & set_n_threads_fn : set_n_threads_fns) { + set_n_threads_fn.second(set_n_threads_fn.first, n_threads); + } + + auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); + if (status != GGML_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); + } + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); + + return status; +} + + +llama_pos llama_context::pos_max() const { + return kv_self.pos_max(); +} + // TODO: improve void llama_context::reset() { inp_tokens = nullptr; @@ -540,6 +572,93 @@ ggml_tensor * llama_context::build_lora_mm_id( return res; } +bool llama_context::kv_self_update() { + bool need_reserve = false; + + auto & kv = kv_self; + + if (kv.has_shift) { + if (!kv.can_shift) { + GGML_ABORT("The current context does not support K-shift"); + } + + // apply K-shift if needed + if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { + prepare_k_shift(); + + ggml_backend_sched_reset(sched.get()); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ggml_context * ctx0 = ggml_init(params); + + reset(); + + ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + build_k_shift(ctx0, gf); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + set_inputs({}); + + compute_graph(gf, false); + + ggml_free(ctx0); + + need_reserve = true; + } + + { + kv.has_shift = false; + + for (uint32_t i = 0; i < kv.size; ++i) { + kv.cells[i].delta = 0; + } + } + } + + // defragment the KV cache if needed + if (kv.do_defrag) { + prepare_defrag(); + + ggml_backend_sched_reset(sched.get()); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ggml_context * ctx0 = ggml_init(params); + + reset(); + + ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + build_defrag(ctx0, gf); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + // no input + //set_inputs({}); + + compute_graph(gf, false); + + ggml_free(ctx0); + + need_reserve = true; + + kv.do_defrag = false; + } + + return need_reserve; +} + void llama_context::build_attn_inp( ggml_context * ctx0, int32_t n_tokens, diff --git a/src/llama-context.h b/src/llama-context.h index a2f41b5c8..bc33fc6ef 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -79,6 +79,13 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; + // returns the result of ggml_backend_sched_graph_compute_async execution + enum ggml_status compute_graph( + ggml_cgraph * graph, + bool batched); + + llama_pos pos_max() const; + void reset(); void prepare_k_shift(); @@ -129,6 +136,9 @@ struct llama_context { struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] struct ggml_tensor * inp_K_shift; // I32 [kv_size] + // return true if need to reserve new 
worst-case graph + bool kv_self_update(); + void build_attn_inp( ggml_context * ctx0, int32_t n_tokens, diff --git a/src/llama.cpp b/src/llama.cpp index a2e5e0bea..6e2faa71c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -110,7 +110,6 @@ struct llm_build_context { const llama_hparams & hparams; const llama_cparams & cparams; const llama_ubatch & ubatch; - //const llama_kv_cache & kv_self; const llama_adapter_cvec & cvec; const llama_loras & loras; @@ -137,8 +136,6 @@ struct llm_build_context { const float norm_rms_eps; const int32_t n_tokens; - //const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size) - //const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_outputs; const int32_t n_outputs_enc; const int32_t n_ctx_orig; @@ -166,7 +163,6 @@ struct llm_build_context { hparams (model.hparams), cparams (lctx.cparams), ubatch (ubatch), - //kv_self (lctx.kv_self), cvec (lctx.cvec), loras (lctx.loras), n_embd (hparams.n_embd), @@ -190,8 +186,6 @@ struct llm_build_context { norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (ubatch.n_tokens), - //n_kv (worst_case ? kv_self.size : kv_self.n), - //kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head), n_outputs (worst_case ? n_tokens : lctx.n_outputs), n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd), n_ctx_orig (cparams.n_ctx_orig_yarn), @@ -7532,40 +7526,6 @@ struct llm_build_context { } }; -static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx) { - llama_ubatch dummy = {}; - dummy.equal_seqs = true; - - llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; - - struct llm_build_context llm(lctx, dummy, cb, false); - - llm.init(); - - struct ggml_cgraph * result = llm.build_defrag(); - - llm.free(); - - return result; -} - -static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { - llama_ubatch dummy = {}; - dummy.equal_seqs = true; - - llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; - - struct llm_build_context llm(lctx, dummy, cb, false); - - llm.init(); - - struct ggml_cgraph * result = llm.build_k_shift(); - - llm.free(); - - return result; -} - static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_ubatch & ubatch, @@ -7836,33 +7796,6 @@ static struct ggml_cgraph * llama_build_graph( return result; } -// returns the result of ggml_backend_sched_graph_compute_async execution -static enum ggml_status llama_graph_compute( - llama_context & lctx, - ggml_cgraph * gf, - int n_threads, - ggml_threadpool * threadpool) { - if (lctx.backend_cpu != nullptr) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu)); - auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); - set_threadpool_fn(lctx.backend_cpu, threadpool); - } - - // set the number of threads for all the backends - for (const auto & set_n_threads_fn : lctx.set_n_threads_fns) { - set_n_threads_fn.second(set_n_threads_fn.first, n_threads); - } - - auto status = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf); - if (status != GGML_STATUS_SUCCESS) { - LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); - } - - // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); - - return status; -} - // decode a batch of tokens by evaluating 
the transformer // in case of unsuccessful decoding (error or warning), // the kv_cache state will be returned to its original state @@ -7887,7 +7820,7 @@ static int llama_decode_impl( } // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const uint32_t n_tokens_all = batch.n_tokens; @@ -7989,16 +7922,11 @@ static int llama_decode_impl( lctx.n_outputs = n_outputs_new; } - int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; - ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch; - - GGML_ASSERT(n_threads > 0); - lctx.prepare_decode(ubatch); // non-causal masks do not use the KV cache if (hparams.causal_attn) { - llama_kv_self_update(&lctx); // TODO: lctx->kv_self_update() + llama_kv_self_update(&lctx); // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -8058,7 +7986,7 @@ static int llama_decode_impl( GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); } - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); + const auto compute_status = lctx.compute_graph(gf, n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { kv_slot_restorer.restore(kv_self); switch (compute_status) { @@ -8226,7 +8154,7 @@ static int llama_encode_impl( } // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const uint32_t n_tokens = batch.n_tokens; @@ -8274,11 +8202,6 @@ static int llama_encode_impl( lctx.inp_embd_enc = NULL; lctx.n_outputs = n_tokens; - int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; - ggml_threadpool_t threadpool = n_tokens == 1 ? 
lctx.threadpool : lctx.threadpool_batch; - - GGML_ASSERT(n_threads > 0); - lctx.prepare_decode(ubatch); ggml_backend_sched_reset(lctx.sched.get()); @@ -8310,7 +8233,7 @@ static int llama_encode_impl( } } - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); + const auto compute_status = lctx.compute_graph(gf, n_tokens > 1); switch (compute_status) { case GGML_STATUS_SUCCESS: break; @@ -8397,76 +8320,6 @@ static int llama_encode_impl( return 0; } -// TODO: move to llama_context -static void llama_kv_self_update_impl(llama_context & lctx) { - bool need_reserve = false; - - auto & kv = lctx.kv_self; - - if (kv.has_shift) { - if (!kv.can_shift) { - GGML_ABORT("The current context does not support K-shift"); - } - - // apply K-shift if needed - if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { - lctx.prepare_k_shift(); - - ggml_backend_sched_reset(lctx.sched.get()); - - ggml_cgraph * gf = llama_build_graph_k_shift(lctx); - - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - lctx.set_inputs({}); - - llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); - - need_reserve = true; - } - - { - kv.has_shift = false; - - for (uint32_t i = 0; i < kv.size; ++i) { - kv.cells[i].delta = 0; - } - } - } - - // defragment the KV cache if needed - if (kv.do_defrag) { - lctx.prepare_defrag(); - - ggml_backend_sched_reset(lctx.sched.get()); - - ggml_cgraph * gf = llama_build_graph_defrag(lctx); - - llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); - - need_reserve = true; - - kv.do_defrag = false; - } - - // reserve a worst case graph again - if (need_reserve) { - // TODO: extract to a function - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch); - llama_token token = lctx.model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched.get()); - if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - } -} - int32_t llama_set_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter, @@ -9224,9 +9077,30 @@ void llama_kv_cache_update(llama_context * ctx) { llama_kv_self_update(ctx); } -// TODO: move to llama-context void llama_kv_self_update(llama_context * ctx) { - llama_kv_self_update_impl(*ctx); + const bool need_reserve = ctx->kv_self_update(); + + // reserve a worst case graph again + if (need_reserve) { + // TODO: extract to a function + const auto & cparams = ctx->cparams; + const auto & model = ctx->model; + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + ggml_cgraph * gf = llama_build_graph(*ctx, ubatch, true); + + // initialize scheduler with the worst-case graph + 
ggml_backend_sched_reset(ctx->sched.get()); + if (!ggml_backend_sched_reserve(ctx->sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + } } /// From f0713498fd05afe117647c76f536866640b77b90 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 11:51:35 +0200 Subject: [PATCH 13/28] context : add get_ctx_padding() ggml-ci --- src/llama-context.cpp | 4 ++++ src/llama-context.h | 3 +++ src/llama.cpp | 4 +++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index daea125fe..6a73659d0 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -64,6 +64,10 @@ llama_pos llama_context::pos_max() const { return kv_self.pos_max(); } +uint32_t llama_context::get_ctx_padding(const llama_cparams & cparams) const { + return kv_self.get_padding(cparams); +} + // TODO: improve void llama_context::reset() { inp_tokens = nullptr; diff --git a/src/llama-context.h b/src/llama-context.h index bc33fc6ef..45eaafaad 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -84,8 +84,11 @@ struct llama_context { ggml_cgraph * graph, bool batched); + // max token position across all sequences in the current context llama_pos pos_max() const; + uint32_t get_ctx_padding(const llama_cparams & cparams) const; + void reset(); void prepare_k_shift(); diff --git a/src/llama.cpp b/src/llama.cpp index 6e2faa71c..569c67c02 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7820,6 +7820,7 @@ static int llama_decode_impl( } // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; @@ -8154,6 +8155,7 @@ static int llama_encode_impl( } // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; @@ -8629,7 +8631,7 @@ struct llama_context * llama_init_from_model( cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; // this is necessary due to kv_self.n being padded later during inference - cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->kv_self.get_padding(cparams)); + cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->get_ctx_padding(cparams)); // with causal attention, the batch size is limited by the context size cparams.n_batch = hparams.causal_attn ? 
std::min(cparams.n_ctx, params.n_batch) : params.n_batch; From c75ba6851e1f6079ff7c823672908a2e5767418a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 12:41:16 +0200 Subject: [PATCH 14/28] context : move adapter code in the implementation [no ci] --- src/llama-context.cpp | 37 +++++++++++++++++++++++++++++++++++++ src/llama.cpp | 40 +++++----------------------------------- 2 files changed, 42 insertions(+), 35 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 6a73659d0..5cb31abc0 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1788,6 +1788,43 @@ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id return it->second.data(); } +// llama adapter API + +int32_t llama_set_adapter_lora( + struct llama_context * ctx, + struct llama_adapter_lora * adapter, + float scale) { + ctx->loras[adapter] = scale; + return 0; +} + +int32_t llama_rm_adapter_lora( + struct llama_context * ctx, + struct llama_adapter_lora * adapter) { + auto pos = ctx->loras.find(adapter); + if (pos != ctx->loras.end()) { + ctx->loras.erase(pos); + return 0; + } + + return -1; +} + +void llama_clear_adapter_lora(struct llama_context * ctx) { + ctx->loras.clear(); +} + +int32_t llama_apply_adapter_cvec( + struct llama_context * ctx, + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end) { + return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); +} + + // llama state API // deprecated diff --git a/src/llama.cpp b/src/llama.cpp index 569c67c02..b80b1c4d1 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8322,40 +8322,6 @@ static int llama_encode_impl( return 0; } -int32_t llama_set_adapter_lora( - struct llama_context * ctx, - struct llama_adapter_lora * adapter, - float scale) { - ctx->loras[adapter] = scale; - return 0; -} - -int32_t llama_rm_adapter_lora( - struct llama_context * ctx, - struct llama_adapter_lora * adapter) { - auto pos = ctx->loras.find(adapter); - if (pos != ctx->loras.end()) { - ctx->loras.erase(pos); - return 0; - } - - return -1; -} - -void llama_clear_adapter_lora(struct llama_context * ctx) { - ctx->loras.clear(); -} - -int32_t llama_apply_adapter_cvec( - struct llama_context * ctx, - const float * data, - size_t len, - int32_t n_embd, - int32_t il_start, - int32_t il_end) { - return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); -} - // // interface implementation // @@ -8924,7 +8890,7 @@ struct llama_context * llama_new_context_with_model( } // -// kv cache +// kv cache view // struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { @@ -8935,6 +8901,10 @@ void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * llama_kv_cache_view_update(view, ctx->kv_self); } +// +// kv cache +// + // deprecated int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { return llama_kv_self_n_tokens(ctx); From 133ad6a7232914459afc902107a53342d3abfb3b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 14:42:09 +0200 Subject: [PATCH 15/28] context : initial need_reserve logic ggml-ci --- src/llama-context.cpp | 173 +++++++++++++++++++++- src/llama-context.h | 4 +- src/llama.cpp | 337 +++++++++++++----------------------------- 3 files changed, 269 insertions(+), 245 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 5cb31abc0..d696090cc 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -576,9 +576,7 @@ ggml_tensor * 
llama_context::build_lora_mm_id( return res; } -bool llama_context::kv_self_update() { - bool need_reserve = false; - +void llama_context::kv_self_update() { auto & kv = kv_self; if (kv.has_shift) { @@ -655,12 +653,14 @@ bool llama_context::kv_self_update() { ggml_free(ctx0); - need_reserve = true; - kv.do_defrag = false; - } - return need_reserve; + need_reserve = true; + } +} + +void llama_kv_self_update(llama_context * ctx) { + ctx->kv_self_update(); } void llama_context::build_attn_inp( @@ -1824,6 +1824,165 @@ int32_t llama_apply_adapter_cvec( return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); } +// +// kv cache view +// + +struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { + return llama_kv_cache_view_init(ctx->kv_self, n_seq_max); +} + +void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { + llama_kv_cache_view_update(view, ctx->kv_self); +} + +// +// kv cache +// + +// deprecated +int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { + return llama_kv_self_n_tokens(ctx); +} + +int32_t llama_kv_self_n_tokens(const llama_context * ctx) { + return llama_kv_cache_n_tokens(&ctx->kv_self); +} + +// deprecated +int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { + return llama_kv_self_used_cells(ctx); +} + +int32_t llama_kv_self_used_cells(const llama_context * ctx) { + return llama_kv_cache_used_cells(&ctx->kv_self); +} + +// deprecated +void llama_kv_cache_clear(llama_context * ctx) { + llama_kv_self_clear(ctx); +} + +void llama_kv_self_clear(llama_context * ctx) { + llama_kv_cache_clear(&ctx->kv_self); +} + +// deprecated +bool llama_kv_cache_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_rm(ctx, seq_id, p0, p1); +} + +bool llama_kv_self_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_rm(&ctx->kv_self, seq_id, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); +} + +void llama_kv_self_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_cp(&ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_keep( + llama_context * ctx, + llama_seq_id seq_id) { + return llama_kv_self_seq_keep(ctx, seq_id); +} + +void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_keep(&ctx->kv_self, seq_id); +} + +// deprecated +void llama_kv_cache_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); +} + +void llama_kv_self_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_cache_seq_add(&ctx->kv_self, seq_id, p0, p1, delta); +} + +// deprecated +void llama_kv_cache_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); +} + +void llama_kv_self_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_cache_seq_div(&ctx->kv_self, seq_id, p0, p1, d); +} 
+ +// deprecated +llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_self_seq_pos_max(ctx, seq_id); +} + +llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_pos_max(&ctx->kv_self, seq_id); +} + +// deprecated +void llama_kv_cache_defrag(llama_context * ctx) { + return llama_kv_self_defrag(ctx); +} + +void llama_kv_self_defrag(llama_context * ctx) { + return llama_kv_cache_defrag(&ctx->kv_self); +} + +// deprecated +bool llama_kv_cache_can_shift(const llama_context * ctx) { + return llama_kv_self_can_shift(ctx); +} + +bool llama_kv_self_can_shift(const llama_context * ctx) { + return llama_kv_cache_can_shift(&ctx->kv_self); +} + +// deprecated +void llama_kv_cache_update(llama_context * ctx) { + llama_kv_self_update(ctx); +} // llama state API diff --git a/src/llama-context.h b/src/llama-context.h index 45eaafaad..eb9a17391 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -62,6 +62,7 @@ struct llama_context { int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch bool logits_all = false; + bool need_reserve = false; // embeddings output (2-dimensional array: [n_outputs][n_embd]) // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE @@ -87,6 +88,7 @@ struct llama_context { // max token position across all sequences in the current context llama_pos pos_max() const; + // certain implementations could require a padding for the context size uint32_t get_ctx_padding(const llama_cparams & cparams) const; void reset(); @@ -140,7 +142,7 @@ struct llama_context { struct ggml_tensor * inp_K_shift; // I32 [kv_size] // return true if need to reserve new worst-case graph - bool kv_self_update(); + void kv_self_update(); void build_attn_inp( ggml_context * ctx0, diff --git a/src/llama.cpp b/src/llama.cpp index b80b1c4d1..5807fa388 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -28,57 +28,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback -static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { - // loading time will be recalculated after the first eval, so - // we take page faults deferred by mmap() into consideration - model.t_load_us = 0; - time_meas tm(model.t_load_us); - - model.t_start_us = tm.t_start_us; - - try { - llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides); - - ml.print_info(); - - model.hparams.vocab_only = params.vocab_only; - - try { - model.load_arch(ml); - } catch(const std::exception & e) { - throw std::runtime_error("error loading model architecture: " + std::string(e.what())); - } - try { - model.load_hparams(ml); - } catch(const std::exception & e) { - throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); - } - try { - model.load_vocab(ml); - } catch(const std::exception & e) { - throw std::runtime_error("error loading model vocabulary: " + std::string(e.what())); - } - - model.load_stats(ml); - model.print_info(); - - if (params.vocab_only) { - LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); - return 0; - } - - if (!model.load_tensors(ml)) { - return -2; - } - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); - return -1; - } - - return 0; -} - // // llm_build // 
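For readers tracking the rename above: the deprecated llama_kv_cache_* entry points now simply forward to the new llama_kv_self_* functions, so migrating caller code is a one-to-one rename with identical arguments. A minimal sketch of the usual context-shift idiom under that assumption (it presumes the llama_kv_self_* declarations are exposed in llama.h at this point in the series; ctx, n_keep and n_discard are hypothetical caller-side values, not part of the patch):

    #include "llama.h"

    // old names: deprecated wrappers that forward to the llama_kv_self_* functions
    static void shift_context_old(llama_context * ctx, llama_pos n_keep, llama_pos n_discard) {
        llama_kv_cache_seq_rm (ctx, 0, n_keep, n_keep + n_discard);         // drop positions [n_keep, n_keep + n_discard)
        llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, -1, -n_discard); // shift the remaining positions back
    }

    // new names: same call shape, operating on the context's own KV cache
    static void shift_context_new(llama_context * ctx, llama_pos n_keep, llama_pos n_discard) {
        llama_kv_self_seq_rm (ctx, 0, n_keep, n_keep + n_discard);
        llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, -1, -n_discard);
    }
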
@@ -7951,6 +7900,30 @@ static int llama_decode_impl( } } + // reserve a worst case graph if needed + // TODO: extract to a function + if (lctx.need_reserve) { + const auto & cparams = lctx.cparams; + const auto & model = lctx.model; + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(lctx.sched.get()); + if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + + lctx.need_reserve = false; + } + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); ggml_backend_sched_reset(lctx.sched.get()); @@ -8206,6 +8179,31 @@ static int llama_encode_impl( lctx.prepare_decode(ubatch); + // reserve a worst case graph if needed + // TODO: extract to a function + if (lctx.need_reserve) { + // TODO: extract to a function + const auto & cparams = lctx.cparams; + const auto & model = lctx.model; + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(lctx.sched.get()); + if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + + lctx.need_reserve = false; + } + ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -8419,6 +8417,57 @@ int64_t llama_time_us(void) { return ggml_time_us(); } +// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback +static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { + // loading time will be recalculated after the first eval, so + // we take page faults deferred by mmap() into consideration + model.t_load_us = 0; + time_meas tm(model.t_load_us); + + model.t_start_us = tm.t_start_us; + + try { + llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides); + + ml.print_info(); + + model.hparams.vocab_only = params.vocab_only; + + try { + model.load_arch(ml); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model architecture: " + std::string(e.what())); + } + try { + model.load_hparams(ml); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); + } + try { + model.load_vocab(ml); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model vocabulary: " + 
std::string(e.what())); + } + + model.load_stats(ml); + model.print_info(); + + if (params.vocab_only) { + LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); + return 0; + } + + if (!model.load_tensors(ml)) { + return -2; + } + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); + return -1; + } + + return 0; +} + static struct llama_model * llama_model_load_from_file_impl( const std::string & path_model, std::vector & splits, @@ -8889,192 +8938,6 @@ struct llama_context * llama_new_context_with_model( return llama_init_from_model(model, params); } -// -// kv cache view -// - -struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { - return llama_kv_cache_view_init(ctx->kv_self, n_seq_max); -} - -void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { - llama_kv_cache_view_update(view, ctx->kv_self); -} - -// -// kv cache -// - -// deprecated -int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { - return llama_kv_self_n_tokens(ctx); -} - -int32_t llama_kv_self_n_tokens(const llama_context * ctx) { - return llama_kv_cache_n_tokens(&ctx->kv_self); -} - -// deprecated -int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { - return llama_kv_self_used_cells(ctx); -} - -int32_t llama_kv_self_used_cells(const llama_context * ctx) { - return llama_kv_cache_used_cells(&ctx->kv_self); -} - -// deprecated -void llama_kv_cache_clear(llama_context * ctx) { - llama_kv_self_clear(ctx); -} - -void llama_kv_self_clear(llama_context * ctx) { - llama_kv_cache_clear(&ctx->kv_self); -} - -// deprecated -bool llama_kv_cache_seq_rm( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1) { - return llama_kv_self_seq_rm(ctx, seq_id, p0, p1); -} - -bool llama_kv_self_seq_rm( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1) { - return llama_kv_cache_seq_rm(&ctx->kv_self, seq_id, p0, p1); -} - -// deprecated -void llama_kv_cache_seq_cp( - llama_context * ctx, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { - return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); -} - -void llama_kv_self_seq_cp( - llama_context * ctx, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { - return llama_kv_cache_seq_cp(&ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); -} - -// deprecated -void llama_kv_cache_seq_keep( - llama_context * ctx, - llama_seq_id seq_id) { - return llama_kv_self_seq_keep(ctx, seq_id); -} - -void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_cache_seq_keep(&ctx->kv_self, seq_id); -} - -// deprecated -void llama_kv_cache_seq_add( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta) { - return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); -} - -void llama_kv_self_seq_add( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta) { - return llama_kv_cache_seq_add(&ctx->kv_self, seq_id, p0, p1, delta); -} - -// deprecated -void llama_kv_cache_seq_div( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d) { - return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); -} - -void llama_kv_self_seq_div( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d) { - return llama_kv_cache_seq_div(&ctx->kv_self, seq_id, p0, p1, d); 
-} - -// deprecated -llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_self_seq_pos_max(ctx, seq_id); -} - -llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_cache_seq_pos_max(&ctx->kv_self, seq_id); -} - -// deprecated -void llama_kv_cache_defrag(llama_context * ctx) { - return llama_kv_self_defrag(ctx); -} - -void llama_kv_self_defrag(llama_context * ctx) { - return llama_kv_cache_defrag(&ctx->kv_self); -} - -// deprecated -bool llama_kv_cache_can_shift(const llama_context * ctx) { - return llama_kv_self_can_shift(ctx); -} - -bool llama_kv_self_can_shift(const llama_context * ctx) { - return llama_kv_cache_can_shift(&ctx->kv_self); -} - -// deprecated -void llama_kv_cache_update(llama_context * ctx) { - llama_kv_self_update(ctx); -} - -void llama_kv_self_update(llama_context * ctx) { - const bool need_reserve = ctx->kv_self_update(); - - // reserve a worst case graph again - if (need_reserve) { - // TODO: extract to a function - const auto & cparams = ctx->cparams; - const auto & model = ctx->model; - - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - ggml_cgraph * gf = llama_build_graph(*ctx, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(ctx->sched.get()); - if (!ggml_backend_sched_reserve(ctx->sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - } -} - /// int32_t llama_encode( From cb8f2095c6f74d9fbb9bdfbb2ae1bf6178472150 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 19:37:52 +0200 Subject: [PATCH 16/28] wip --- src/llama.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 5807fa388..6c8df8a11 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7773,6 +7773,7 @@ static int llama_decode_impl( llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; + const uint32_t n_tokens_all = batch.n_tokens; const auto & model = lctx.model; @@ -7800,9 +7801,6 @@ static int llama_decode_impl( } lctx.n_queued_tokens += n_tokens_all; - auto & kv_self = lctx.kv_self; - llama_kv_slot_restorer kv_slot_restorer(kv_self); - const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = vocab.n_tokens(); @@ -7828,16 +7826,19 @@ static int llama_decode_impl( n_outputs = 1; } - lctx.sbatch.from_batch(batch, n_embd, - /* simple_split */ !kv_self.recurrent, - /* logits_all */ n_outputs == n_tokens_all); - // reserve output buffer if (llama_output_reserve(lctx, n_outputs) < n_outputs) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs); return -2; }; + auto & kv_self = lctx.kv_self; + llama_kv_slot_restorer kv_slot_restorer(kv_self); + + lctx.sbatch.from_batch(batch, n_embd, + /* simple_split */ !kv_self.recurrent, + /* logits_all */ n_outputs == n_tokens_all); + while (lctx.sbatch.n_tokens > 0) { llama_ubatch ubatch; if (kv_self.recurrent) { @@ -8645,7 +8646,6 @@ struct llama_context * llama_init_from_model( cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; - // this is necessary due to kv_self.n being padded later during inference cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->get_ctx_padding(cparams)); // with causal attention, the batch size is limited by the context size From 99422dfa3f0c686d89492958946a9b2ca91012da Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 20:30:16 +0200 Subject: [PATCH 17/28] context : introduce llama_batch_manager ggml-ci --- src/llama-context.cpp | 130 ++++++++++++++++++++++++++++++++++++++++-- src/llama-context.h | 18 +++++- src/llama.cpp | 87 ++++++---------------------- 3 files changed, 162 insertions(+), 73 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index d696090cc..de54321df 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -32,6 +32,132 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +struct llama_batch_manager : public llama_batch_manager_i { + llama_batch_manager(llama_context & lctx, const llama_batch & batch, bool logits_all) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { + const auto & hparams = lctx.model.hparams; + const auto & n_embd = hparams.n_embd; + + const auto & kv_self = lctx.kv_self; + + lctx.sbatch.from_batch(batch, n_embd, + /* simple_split */ !kv_self.recurrent, + /* logits_all */ logits_all); + } + + ~llama_batch_manager() override { + } + + virtual llama_ubatch next() override { + ubatch = llama_ubatch(); + + const auto & cparams = lctx.cparams; + const auto & kv_self = lctx.kv_self; + + const auto & n_ubatch = cparams.n_ubatch; + + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + + if (kv_self.recurrent) { + if (embd_pooled) { + // Pooled embeddings cannot be split across ubatches (yet) + ubatch = lctx.sbatch.split_seq(n_ubatch); + } else { + // recurrent model architectures are easier to implement + // with equal-length sequences + ubatch = lctx.sbatch.split_equal(n_ubatch); + } + } else { + ubatch = lctx.sbatch.split_simple(n_ubatch); + } + + return ubatch; + } + + virtual bool prepare() override { + const auto & cparams = 
lctx.cparams; + const auto & hparams = lctx.model.hparams; + + auto & kv_self = lctx.kv_self; + + // non-causal masks do not use the KV cache + if (hparams.causal_attn) { + llama_kv_self_update(&lctx); + + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) { + kv_self.head = 0; + } + + const auto slot_info = kv_self.find_slot(ubatch); + if (!slot_info) { + return false; + } + + kv_slot_restorer.save(slot_info); + + if (!kv_self.recurrent) { + // a heuristic, to avoid attending the full cache if it is not yet utilized + // after enough generations, the benefit from this heuristic disappears + // if we start defragmenting the cache, the benefit from this will be more important + const uint32_t pad = kv_self.get_padding(cparams); + kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); + //kv_self.n = llama_kv_cache_cell_max(kv_self); + } + } + + return true; + } + + virtual void restore() override { + kv_slot_restorer.restore(lctx.kv_self); + } + + virtual void update() override { + auto & kv_self = lctx.kv_self; + + // update the kv ring buffer + { + kv_self.head += ubatch.n_tokens; + + // Ensure kv cache head points to a valid index. + if (kv_self.head >= kv_self.size) { + kv_self.head = 0; + } + } + } + + virtual void finalize() override { + const auto & cparams = lctx.cparams; + + auto & kv_self = lctx.kv_self; + + // decide if we need to defrag the kv cache + if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) { + const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f; + + // queue defragmentation for next llama_kv_cache_update + if (fragmentation > cparams.defrag_thold) { + //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); + + kv_self.defrag(); + } + } + } + + llama_context & lctx; + + const llama_batch & batch; + + llama_ubatch ubatch; + + llama_kv_slot_restorer kv_slot_restorer; +}; + +std::unique_ptr llama_context::prepare_batch(const llama_batch & batch, bool logits_all) { + return std::make_unique(*this, batch, logits_all); +} + enum ggml_status llama_context::compute_graph( ggml_cgraph * graph, bool batched) { @@ -59,7 +185,6 @@ enum ggml_status llama_context::compute_graph( return status; } - llama_pos llama_context::pos_max() const { return kv_self.pos_max(); } @@ -94,9 +219,6 @@ void llama_context::prepare_k_shift() { void llama_context::prepare_defrag() { } -void llama_context::prepare_decode(const llama_ubatch & /*ubatch*/) { -} - // llama input void llama_context::set_inputs(const llama_ubatch & ubatch) { diff --git a/src/llama-context.h b/src/llama-context.h index eb9a17391..47233f4f5 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -16,6 +16,20 @@ using llama_loras = std::unordered_map; +// TODO: this is very WIP - improve +struct llama_batch_manager_i { + virtual ~llama_batch_manager_i() = default; + + //bool is_done() const; + + virtual llama_ubatch next() = 0; + + virtual bool prepare() = 0; + virtual void restore() = 0; + virtual void update() = 0; + virtual void finalize() = 0; +}; + struct llama_context { llama_context(const llama_model & model) : model(model) @@ -80,6 +94,9 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; + // TODO: do not pass logits_all explicitly + std::unique_ptr prepare_batch(const llama_batch & batch, bool logits_all); + // returns the 
result of ggml_backend_sched_graph_compute_async execution enum ggml_status compute_graph( ggml_cgraph * graph, @@ -95,7 +112,6 @@ struct llama_context { void prepare_k_shift(); void prepare_defrag(); - void prepare_decode(const llama_ubatch & ubatch); void set_inputs(const llama_ubatch & ubatch); diff --git a/src/llama.cpp b/src/llama.cpp index 6c8df8a11..8f6de199a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7807,8 +7807,6 @@ static int llama_decode_impl( uint32_t n_outputs = 0; uint32_t n_outputs_prev = 0; - const auto n_ubatch = cparams.n_ubatch; - // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; @@ -7832,27 +7830,19 @@ static int llama_decode_impl( return -2; }; - auto & kv_self = lctx.kv_self; - llama_kv_slot_restorer kv_slot_restorer(kv_self); + const bool logits_all = n_outputs == n_tokens_all; - lctx.sbatch.from_batch(batch, n_embd, - /* simple_split */ !kv_self.recurrent, - /* logits_all */ n_outputs == n_tokens_all); + //auto & kv_self = lctx.kv_self; + //llama_kv_slot_restorer kv_slot_restorer(kv_self); + + //lctx.sbatch.from_batch(batch, n_embd, + // /* simple_split */ !kv_self.recurrent, + // /* logits_all */ logits_all); + + auto batch_manager = lctx.prepare_batch(batch, logits_all); while (lctx.sbatch.n_tokens > 0) { - llama_ubatch ubatch; - if (kv_self.recurrent) { - if (embd_pooled) { - // Pooled embeddings cannot be split across ubatches (yet) - ubatch = lctx.sbatch.split_seq(n_ubatch); - } else { - // recurrent model architectures are easier to implement - // with equal-length sequences - ubatch = lctx.sbatch.split_equal(n_ubatch); - } - } else { - ubatch = lctx.sbatch.split_simple(n_ubatch); - } + llama_ubatch ubatch = batch_manager->next(); const uint32_t n_tokens = ubatch.n_tokens; @@ -7873,32 +7863,10 @@ static int llama_decode_impl( lctx.n_outputs = n_outputs_new; } - lctx.prepare_decode(ubatch); - - // non-causal masks do not use the KV cache - if (hparams.causal_attn) { - llama_kv_self_update(&lctx); - - // if we have enough unused cells before the current head -> - // better to start searching from the beginning of the cache, hoping to fill it - if (kv_self.head > kv_self.used + 2*n_tokens) { - kv_self.head = 0; - } - - const auto slot_info = kv_self.find_slot(ubatch); - if (!slot_info) { - return 1; - } - kv_slot_restorer.save(slot_info); - - if (!kv_self.recurrent) { - // a heuristic, to avoid attending the full cache if it is not yet utilized - // after enough generations, the benefit from this heuristic disappears - // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = kv_self.get_padding(cparams); - kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); - //kv_self.n = llama_kv_cache_cell_max(kv_self); - } + if (!batch_manager->prepare()) { + LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); + batch_manager->restore(); + return -3; } // reserve a worst case graph if needed @@ -7963,7 +7931,7 @@ static int llama_decode_impl( const auto compute_status = lctx.compute_graph(gf, n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { - kv_slot_restorer.restore(kv_self); + batch_manager->restore(); switch (compute_status) { case GGML_STATUS_ABORTED: return 2; @@ -7975,15 +7943,7 @@ static int llama_decode_impl( } } - // update the kv ring buffer - { - kv_self.head += n_tokens; - - // Ensure kv cache head points to a 
valid index. - if (kv_self.head >= kv_self.size) { - kv_self.head = 0; - } - } + batch_manager->update(); // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { @@ -8061,6 +8021,7 @@ static int llama_decode_impl( } } } + n_outputs_prev += lctx.n_outputs; } @@ -8089,17 +8050,7 @@ static int llama_decode_impl( // wait for the computation to finish (automatically done when obtaining the model output) //llama_synchronize(&lctx); - // decide if we need to defrag the kv cache - if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) { - const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f; - - // queue defragmentation for next llama_kv_cache_update - if (fragmentation > cparams.defrag_thold) { - //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); - - kv_self.defrag(); - } - } + batch_manager->finalize(); // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. @@ -8178,7 +8129,7 @@ static int llama_encode_impl( lctx.inp_embd_enc = NULL; lctx.n_outputs = n_tokens; - lctx.prepare_decode(ubatch); + //batch_manager->prepare(ubatch); // reserve a worst case graph if needed // TODO: extract to a function From a0c500b4dc91b87acba2529d2db7a2d28f1c3bb6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 21:11:03 +0200 Subject: [PATCH 18/28] context : prepare for abstraction ggml-ci --- src/llama-context.cpp | 307 ++++++++++++++++++++++++++++++++++++++++- src/llama-context.h | 11 +- src/llama.cpp | 314 ++---------------------------------------- 3 files changed, 323 insertions(+), 309 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index de54321df..4e6033ff1 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -32,6 +32,309 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +llama_context::llama_context(const llama_model & model, const llama_context_params & params, std::function fn_build_graph_worst) : + model(model), + t_start_us(model.t_start_us), + t_load_us (model.t_load_us) { + + const auto & hparams = model.hparams; + + cparams.n_seq_max = std::max(1u, params.n_seq_max); + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.defrag_thold = params.defrag_thold; + cparams.embeddings = params.embeddings; + cparams.offload_kqv = params.offload_kqv; + cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; + cparams.pooling_type = params.pooling_type; + + cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; + + cparams.n_ctx = GGML_PAD(cparams.n_ctx, get_ctx_padding(cparams)); + + // with causal attention, the batch size is limited by the context size + cparams.n_batch = hparams.causal_attn ? 
std::min(cparams.n_ctx, params.n_batch) : params.n_batch; + + // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask + // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) + // ref: https://github.com/ggerganov/llama.cpp/pull/5021 + if (cparams.n_batch < GGML_KQ_MASK_PAD) { + LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); + cparams.n_batch = GGML_KQ_MASK_PAD; + } + + cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); + + cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : + hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn : + hparams.n_ctx_train; + + cparams.cb_eval = params.cb_eval; + cparams.cb_eval_user_data = params.cb_eval_user_data; + + auto rope_scaling_type = params.rope_scaling_type; + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { + rope_scaling_type = hparams.rope_scaling_type_train; + } + + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { + cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none + } + + if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; + } + + cparams.yarn_attn_factor *= hparams.rope_attn_factor; + + if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; + } else { + cparams.pooling_type = hparams.pooling_type; + } + } + + if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) { + cparams.causal_attn = hparams.causal_attn; + } else { + cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; + } + + const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; + + LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); + LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); + LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); + LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); + LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); + LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); + LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); + LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); + + if (n_ctx_per_seq < hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } + + if (n_ctx_per_seq > hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } + + logits_all = params.logits_all; + + // build worst-case graph for encoder if a model contains encoder + is_encoding = llama_model_has_encoder(&model); // TODO: model.has_encoder() + + uint32_t kv_size = cparams.n_ctx; + ggml_type type_k = params.type_k; + ggml_type type_v = params.type_v; + + // Mamba only needs a constant number of KV cache cells per sequence + if (llama_model_is_recurrent(&model)) { + // Mamba needs at least as many KV cells as there are sequences kept at any time + kv_size = std::max((uint32_t) 1, params.n_seq_max); + // it's probably best to keep as much precision as possible for the 
states + type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states + type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states + } + + GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); + GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); + + if (!hparams.vocab_only) { + // GPU backends + for (auto * dev : model.devices) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + throw std::runtime_error("failed to initialize backend"); + } + backends.emplace_back(backend); + } + + // add ACCEL backends (such as BLAS) + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + throw std::runtime_error("failed to initialize backend"); + } + backends.emplace_back(backend); + } + } + + // add CPU backend + backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + if (backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); + throw std::runtime_error("failed to initialize CPU backend"); + } + backends.emplace_back(backend_cpu); + + // create a list of the set_n_threads functions in the backends + for (auto & backend : backends) { + ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); + ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); + } + } + } + + llama_set_abort_callback(this, params.abort_callback, params.abort_callback_data); + + if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { + LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); + throw std::runtime_error("failed to initialize self-attention cache"); + } + + { + const size_t memory_size_k = kv_self.size_k_bytes(); + const size_t memory_size_v = kv_self.size_v_bytes(); + + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + } + + // graph outputs buffer + { + // resized during inference when a batch uses more outputs + if (llama_output_reserve(*this, params.n_seq_max) < params.n_seq_max) { + LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); + throw std::runtime_error("failed to reserve initial output buffer"); + } + + LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, + ggml_backend_buffer_name (buf_output.get()), + ggml_backend_buffer_get_size(buf_output.get()) / 1024.0 / 1024.0); + } + + // scheduler and compute buffers + { + // buffer types used for the compute buffer of each backend + std::vector backend_buft; + std::vector backend_ptrs; + for (auto & backend : backends) { + auto * buft = 
ggml_backend_get_default_buffer_type(backend.get()); + auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) { + // use the host buffer of the first device CPU for faster transfer of the intermediate state + auto * dev = model.devices[0]; + auto * host_buft = ggml_backend_dev_host_buffer_type(dev); + if (host_buft) { + buft = host_buft; + } + } + backend_buft.push_back(buft); + backend_ptrs.push_back(backend.get()); + } + + const size_t max_nodes = model.max_nodes(); + + // buffer used to store the computation graph and the tensor meta data + buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + + // TODO: move these checks to ggml_backend_sched + // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary + bool pipeline_parallel = + model.n_devices() > 1 && + model.params.n_gpu_layers > (int) model.hparams.n_layer && + model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && + params.offload_kqv; + + // pipeline parallelism requires support for async compute and events in all devices + if (pipeline_parallel) { + for (auto & backend : backends) { + auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + // ignore CPU backend + continue; + } + auto * dev = ggml_backend_get_device(backend.get()); + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + if (!props.caps.async || !props.caps.events) { + // device does not support async compute or events + pipeline_parallel = false; + break; + } + } + } + + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); + + if (pipeline_parallel) { + LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); + } + + // initialize scheduler with the worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + + llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + ggml_cgraph * gf_pp = fn_build_graph_worst(*this, ubatch_pp); + + // reserve pp graph first so that buffers are only allocated once + ggml_backend_sched_reserve(sched.get(), gf_pp); + int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); + int n_nodes_pp = ggml_graph_n_nodes(gf_pp); + + // reserve with tg graph to get the number of splits and nodes + llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + ggml_cgraph * gf_tg = fn_build_graph_worst(*this, ubatch_tg); + ggml_backend_sched_reserve(sched.get(), gf_tg); + int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); + int n_nodes_tg = ggml_graph_n_nodes(gf_tg); + + // reserve again with pp graph to avoid ggml-alloc reallocations during inference + gf_pp = fn_build_graph_worst(*this, ubatch_pp); + if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } + + for (size_t i = 0; i < backend_ptrs.size(); 
++i) { + ggml_backend_t backend = backend_ptrs[i]; + ggml_backend_buffer_type_t buft = backend_buft[i]; + size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); + if (size > 1) { + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); + } + } + + if (n_nodes_pp == n_nodes_tg) { + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); + } else { + LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); + } + if (n_splits_pp == n_splits_tg) { + LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); + } else { + LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); + } + } + } + +} + struct llama_batch_manager : public llama_batch_manager_i { llama_batch_manager(llama_context & lctx, const llama_batch & batch, bool logits_all) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { const auto & hparams = lctx.model.hparams; @@ -81,7 +384,7 @@ struct llama_batch_manager : public llama_batch_manager_i { // non-causal masks do not use the KV cache if (hparams.causal_attn) { - llama_kv_self_update(&lctx); + lctx.kv_self_update(); // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -106,6 +409,8 @@ struct llama_batch_manager : public llama_batch_manager_i { } } + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + return true; } diff --git a/src/llama-context.h b/src/llama-context.h index 47233f4f5..d0356e3ed 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -30,11 +30,14 @@ struct llama_batch_manager_i { virtual void finalize() = 0; }; +// TODO: make implementation details private +// TODO: become abstract base class, split the current implementation into different child classes struct llama_context { - llama_context(const llama_model & model) - : model(model) - , t_start_us(model.t_start_us) - , t_load_us (model.t_load_us) {} + // TODO: store the worst-case graph build function and reuse it later + llama_context( + const llama_model & model, + const llama_context_params & params, + std::function fn_build_graph_worst); const struct llama_model & model; diff --git a/src/llama.cpp b/src/llama.cpp index 8f6de199a..408bd9030 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7893,8 +7893,6 @@ static int llama_decode_impl( lctx.need_reserve = false; } - //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -8574,309 +8572,17 @@ struct llama_context * llama_init_from_model( return nullptr; } - llama_context * ctx = new llama_context(*model); + llama_context * ctx = nullptr; - const auto & hparams = model->hparams; - auto & cparams = ctx->cparams; - - cparams.n_seq_max = std::max(1u, params.n_seq_max); - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch; - cparams.yarn_ext_factor = params.yarn_ext_factor; - cparams.yarn_attn_factor = params.yarn_attn_factor; - cparams.yarn_beta_fast = params.yarn_beta_fast; - cparams.yarn_beta_slow = params.yarn_beta_slow; - cparams.defrag_thold = params.defrag_thold; - cparams.embeddings = params.embeddings; - 
cparams.offload_kqv = params.offload_kqv; - cparams.flash_attn = params.flash_attn; - cparams.no_perf = params.no_perf; - cparams.pooling_type = params.pooling_type; - - cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; - cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; - - cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->get_ctx_padding(cparams)); - - // with causal attention, the batch size is limited by the context size - cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch; - - // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask - // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) - // ref: https://github.com/ggerganov/llama.cpp/pull/5021 - if (cparams.n_batch < GGML_KQ_MASK_PAD) { - LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); - cparams.n_batch = GGML_KQ_MASK_PAD; - } - - cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - - cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : - hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn : - hparams.n_ctx_train; - - cparams.cb_eval = params.cb_eval; - cparams.cb_eval_user_data = params.cb_eval_user_data; - - auto rope_scaling_type = params.rope_scaling_type; - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { - rope_scaling_type = hparams.rope_scaling_type_train; - } - - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { - cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none - } - - if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' - cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 
1.0f : 0.0f; - } - - cparams.yarn_attn_factor *= hparams.rope_attn_factor; - - if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { - if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { - cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; - } else { - cparams.pooling_type = hparams.pooling_type; - } - } - - if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) { - cparams.causal_attn = hparams.causal_attn; - } else { - cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; - } - - const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; - - LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); - LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); - LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); - LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); - LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); - LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); - LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); - - if (n_ctx_per_seq < hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); - } - - if (n_ctx_per_seq > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); - } - - ctx->logits_all = params.logits_all; - - // build worst-case graph for encoder if a model contains encoder - ctx->is_encoding = llama_model_has_encoder(model); - - uint32_t kv_size = cparams.n_ctx; - ggml_type type_k = params.type_k; - ggml_type type_v = params.type_v; - - // Mamba only needs a constant number of KV cache cells per sequence - if (llama_model_is_recurrent(model)) { - // Mamba needs at least as many KV cells as there are sequences kept at any time - kv_size = std::max((uint32_t) 1, params.n_seq_max); - // it's probably best to keep as much precision as possible for the states - type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states - type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states - } - - GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); - GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); - - if (!hparams.vocab_only) { - // GPU backends - for (auto * dev : model->devices) { - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - llama_free(ctx); - return nullptr; - } - ctx->backends.emplace_back(backend); - } - - // add ACCEL backends (such as BLAS) - for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { - ggml_backend_dev_t dev = ggml_backend_dev_get(i); - if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) { - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - llama_free(ctx); - return nullptr; - } - ctx->backends.emplace_back(backend); - } - } - - // add CPU backend - ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - if (ctx->backend_cpu == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CPU 
backend\n", __func__); - llama_free(ctx); - return nullptr; - } - ctx->backends.emplace_back(ctx->backend_cpu); - - // create a list of the set_n_threads functions in the backends - for (auto & backend : ctx->backends) { - ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); - ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; - if (reg) { - auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); - if (ggml_backend_set_n_threads_fn) { - ctx->set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); - } - } - } - - llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data); - - if (!ctx->kv_self.init(ctx->model, ctx->cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { - LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); - llama_free(ctx); - return nullptr; - } - - { - const size_t memory_size_k = ctx->kv_self.size_k_bytes(); - const size_t memory_size_v = ctx->kv_self.size_v_bytes(); - - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, - (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), - ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); - } - - // graph outputs buffer - { - // resized during inference when a batch uses more outputs - if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) { - LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); - llama_free(ctx); - return nullptr; - } - - LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, - ggml_backend_buffer_name(ctx->buf_output.get()), - ggml_backend_buffer_get_size(ctx->buf_output.get()) / 1024.0 / 1024.0); - } - - // scheduler and compute buffers - { - // buffer types used for the compute buffer of each backend - std::vector backend_buft; - std::vector backend_ptrs; - for (auto & backend : ctx->backends) { - auto * buft = ggml_backend_get_default_buffer_type(backend.get()); - auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) { - // use the host buffer of the first device CPU for faster transfer of the intermediate state - auto * dev = model->devices[0]; - auto * host_buft = ggml_backend_dev_host_buffer_type(dev); - if (host_buft) { - buft = host_buft; - } - } - backend_buft.push_back(buft); - backend_ptrs.push_back(backend.get()); - } - - const size_t max_nodes = model->max_nodes(); - - // buffer used to store the computation graph and the tensor meta data - ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); - - // TODO: move these checks to ggml_backend_sched - // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary - bool pipeline_parallel = - model->n_devices() > 1 && - model->params.n_gpu_layers > (int)model->hparams.n_layer && - model->params.split_mode == LLAMA_SPLIT_MODE_LAYER && - params.offload_kqv; - - // pipeline parallelism requires support for async compute and events in all devices - if (pipeline_parallel) { - for (auto & backend : ctx->backends) { - auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { - // ignore CPU 
backend - continue; - } - auto * dev = ggml_backend_get_device(backend.get()); - ggml_backend_dev_props props; - ggml_backend_dev_get_props(dev, &props); - if (!props.caps.async || !props.caps.events) { - // device does not support async compute or events - pipeline_parallel = false; - break; - } - } - } - - ctx->sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); - - if (pipeline_parallel) { - LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched.get())); - } - - // initialize scheduler with the worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - llama_token token = ctx->model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - - llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true); - - // reserve pp graph first so that buffers are only allocated once - ggml_backend_sched_reserve(ctx->sched.get(), gf_pp); - int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched.get()); - int n_nodes_pp = ggml_graph_n_nodes(gf_pp); - - // reserve with tg graph to get the number of splits and nodes - llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true); - ggml_backend_sched_reserve(ctx->sched.get(), gf_tg); - int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched.get()); - int n_nodes_tg = ggml_graph_n_nodes(gf_tg); - - // reserve again with pp graph to avoid ggml-alloc reallocations during inference - gf_pp = llama_build_graph(*ctx, ubatch_pp, true); - if (!ggml_backend_sched_reserve(ctx->sched.get(), gf_pp)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - llama_free(ctx); - return nullptr; - } - - for (size_t i = 0; i < backend_ptrs.size(); ++i) { - ggml_backend_t backend = backend_ptrs[i]; - ggml_backend_buffer_type_t buft = backend_buft[i]; - size_t size = ggml_backend_sched_get_buffer_size(ctx->sched.get(), backend); - if (size > 1) { - LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); - } - } - - if (n_nodes_pp == n_nodes_tg) { - LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); - } else { - LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); - } - if (n_splits_pp == n_splits_tg) { - LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); - } else { - LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); - } - } + try { + // TODO: add logic which llama_context implementation to construct + ctx = new llama_context(*model, params, + [](llama_context & lctx, const llama_ubatch & ubatch) { + return llama_build_graph(lctx, ubatch, true); + }); + } catch (const std::exception & e) { + LLAMA_LOG_ERROR("%s: failed to initialize context: %s\n", __func__, e.what()); + return nullptr; } return ctx; From 918885697e4409208b8157ffd18a6c347ca5b04d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 29 Jan 2025 14:45:04 +0200 Subject: [PATCH 19/28] llama : resolve rwkv conflict ggml-ci --- 
src/llama.cpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index f410f7a2f..0ca8070cd 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7076,19 +7076,13 @@ struct llm_build_context { // 1 // ); + // struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); // ggml_build_forward_expand( // gf, // ggml_cpy( // ctx0, - // wkv_states, - // ggml_view_1d( - // ctx0, - // kv_self.v_l[il], - // hparams.n_embd_v_s() * n_seqs, - // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - // ) - // ) - // ); + // ggml_view_1d(ctx0, last_norm_att, n_embd * n_seqs, 0), + // ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv())); // ggml_build_forward_expand(gf, ffn_inp); From 3e23be7911704f8474e7dcb32424bb043be63b06 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 2 Feb 2025 10:17:42 +0200 Subject: [PATCH 20/28] context : store graph build function callback ggml-ci --- src/llama-context.cpp | 37 +++++++++++++++++++++++++++++++++---- src/llama-context.h | 8 ++++++-- src/llama.cpp | 4 ++-- 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 37e43213a..1cd168db2 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -33,8 +33,12 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } -llama_context::llama_context(const llama_model & model, const llama_context_params & params, std::function fn_build_graph_worst) : +llama_context::llama_context( + const llama_model & model, + const llama_context_params & params, + build_graph_callback && cb_build_graph) : model(model), + cb_build_graph(std::move(cb_build_graph)), t_start_us(model.t_start_us), t_load_us (model.t_load_us) { @@ -289,7 +293,7 @@ llama_context::llama_context(const llama_model & model, const llama_context_para llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_pp = fn_build_graph_worst(*this, ubatch_pp); + ggml_cgraph * gf_pp = this->cb_build_graph(*this, ubatch_pp, true); // reserve pp graph first so that buffers are only allocated once ggml_backend_sched_reserve(sched.get(), gf_pp); @@ -298,13 +302,13 @@ llama_context::llama_context(const llama_model & model, const llama_context_para // reserve with tg graph to get the number of splits and nodes llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_tg = fn_build_graph_worst(*this, ubatch_tg); + ggml_cgraph * gf_tg = this->cb_build_graph(*this, ubatch_tg, true); ggml_backend_sched_reserve(sched.get(), gf_tg); int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); int n_nodes_tg = ggml_graph_n_nodes(gf_tg); // reserve again with pp graph to avoid ggml-alloc reallocations during inference - gf_pp = fn_build_graph_worst(*this, ubatch_pp); + gf_pp = this->cb_build_graph(*this, ubatch_pp, true); if 
(!ggml_backend_sched_reserve(sched.get(), gf_pp)) { LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); throw std::runtime_error("failed to allocate compute buffers"); @@ -475,6 +479,31 @@ struct llama_batch_manager : public llama_batch_manager_i { //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + // reserve a worst case graph if needed + if (lctx.need_reserve) { + LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); + + const auto & cparams = lctx.cparams; + const auto & model = lctx.model; + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + ggml_cgraph * gf = lctx.cb_build_graph(lctx, ubatch, true); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(lctx.sched.get()); + if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + + lctx.need_reserve = false; + } + return true; } diff --git a/src/llama-context.h b/src/llama-context.h index 1277645de..5958deaef 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -36,11 +36,13 @@ struct llama_batch_manager_i { // TODO: make implementation details private // TODO: become abstract base class, split the current implementation into different child classes struct llama_context { - // TODO: store the worst-case graph build function and reuse it later + // TODO: tmp until llama-model starts implementing the graph build function + typedef std::function build_graph_callback; + llama_context( const llama_model & model, const llama_context_params & params, - std::function fn_build_graph_worst); + build_graph_callback && cb_build_graph); const struct llama_model & model; @@ -49,6 +51,8 @@ struct llama_context { llama_adapter_cvec cvec; llama_loras loras; + build_graph_callback cb_build_graph; + std::vector backends; std::vector> set_n_threads_fns; diff --git a/src/llama.cpp b/src/llama.cpp index 0ca8070cd..6268249f2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8508,8 +8508,8 @@ struct llama_context * llama_init_from_model( try { // TODO: add logic which llama_context implementation to construct ctx = new llama_context(*model, params, - [](llama_context & lctx, const llama_ubatch & ubatch) { - return llama_build_graph(lctx, ubatch, true); + [](llama_context & lctx, const llama_ubatch & ubatch, bool worst_case) { + return llama_build_graph(lctx, ubatch, worst_case); }); } catch (const std::exception & e) { LLAMA_LOG_ERROR("%s: failed to initialize context: %s\n", __func__, e.what()); From 1eca8916b51a6952a304e68f312b63649a6cead9 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Mon, 3 Feb 2025 20:17:50 +0800 Subject: [PATCH 21/28] llama : fix rwkv inference (#11618) Signed-off-by: Molly Sophia --- src/llama-context.cpp | 222 ++++++++++++++++++ src/llama-context.h | 27 +++ src/llama.cpp | 509 +++++++++++++----------------------------- 3 files changed, 409 insertions(+), 349 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 1cd168db2..3bc0513ca 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1970,6 +1970,228 @@ 
ggml_tensor * llama_context::build_mamba_layer( } +ggml_tensor * llama_context::build_rwkv_token_shift_load( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto token_shift_count = hparams.token_shift_count; + + const auto & n_tokens = ubatch.n_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + struct ggml_tensor * token_shift_all = kv_self.k_l[il]; + + struct ggml_tensor * token_shift = build_copy_mask_state( + ctx0, graph, token_shift_all, state_copy, state_mask, + n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); + + token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs); + + return token_shift; +} + + +ggml_tensor * llama_context::build_rwkv_token_shift_store( + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto token_shift_count = hparams.token_shift_count; + const auto n_embd = hparams.n_embd; + + const auto & n_tokens = ubatch.n_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + + return ggml_cpy( + ctx0, + ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * token_shift_count, 0), + ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) + ); +} + + +ggml_tensor * llama_context::build_rwkv6_time_mix( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * x_prev, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto n_tokens = ubatch.n_tokens; + const auto n_seqs = ubatch.n_seqs; + const auto n_embd = hparams.n_embd; + const auto head_size = hparams.wkv_head_size; + const auto n_head = n_embd / head_size; + const auto n_head_kv = hparams.n_head_kv(il); + + const auto kv_head = worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head; + + const auto layer = &model.layers[il]; + + bool is_qrwkv = layer->time_mix_first == nullptr; + + struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); + + xxx = ggml_reshape_4d( + ctx0, + ggml_tanh( + ctx0, + ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) + ), + layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens + ); + + xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); + + xxx = ggml_mul_mat( + ctx0, + ggml_reshape_4d( + ctx0, + layer->time_mix_w2, + layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 + ), + xxx + ); + + struct ggml_tensor *xw, *xk, *xv, *xr, *xg; + if (layer->time_mix_lerp_fused) { + // fusing these weights makes some performance improvement + sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); + cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); + xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + } else { + // for backward compatibility + xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + + xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); + xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); + xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); + xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); + xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); + } + + struct ggml_tensor * r = build_lora_mm(ctx0, layer->time_mix_receptance, xr); + struct ggml_tensor * k = build_lora_mm(ctx0, layer->time_mix_key, xk); + struct ggml_tensor * v = build_lora_mm(ctx0, layer->time_mix_value, xv); + if (layer->time_mix_receptance_b) { + r = ggml_add(ctx0, r, layer->time_mix_receptance_b); + } + if (layer->time_mix_key_b) { + k = ggml_add(ctx0, k, layer->time_mix_key_b); + } + if (layer->time_mix_value_b) { + v = ggml_add(ctx0, v, layer->time_mix_value_b); + } + + struct ggml_tensor * g = build_lora_mm(ctx0, layer->time_mix_gate, xg); + if (is_qrwkv) { + g = ggml_sigmoid(ctx0, g); + } else { + g = ggml_silu(ctx0, g); + } + + if (n_head_kv != 0 && n_head_kv != n_head) { + GGML_ASSERT(n_head % n_head_kv == 0); + k = ggml_reshape_4d(ctx0, k, head_size, 1, n_head_kv, n_tokens); + v = ggml_reshape_4d(ctx0, v, head_size, 1, n_head_kv, n_tokens); + struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, n_head / n_head_kv, n_head_kv, n_tokens); + k = ggml_repeat(ctx0, k, tmp); + v = ggml_repeat(ctx0, v, tmp); + } + + k = 
ggml_reshape_3d(ctx0, k, head_size, n_head, n_tokens); + v = ggml_reshape_3d(ctx0, v, head_size, n_head, n_tokens); + r = ggml_reshape_3d(ctx0, r, head_size, n_head, n_tokens); + + struct ggml_tensor * w = ggml_mul_mat( + ctx0, + layer->time_mix_decay_w2, + ggml_tanh( + ctx0, + ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) + ) + ); + + w = ggml_add(ctx0, w, layer->time_mix_decay); + w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); + w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens); + + if (is_qrwkv) { + // k = k * (1 - w) + k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w)); + } + + struct ggml_tensor * wkv_state = build_copy_mask_state( + ctx0, graph, kv_self.v_l[il], state_copy, state_mask, + n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); + + struct ggml_tensor * wkv_output; + if (is_qrwkv) { + wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f)); + } else { + wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer->time_mix_first, w, wkv_state); + } + cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); + wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); + + ggml_build_forward_expand( + graph, + ggml_cpy( + ctx0, + wkv_state, + ggml_view_1d( + ctx0, + kv_self.v_l[il], + hparams.n_embd_v_s() * n_seqs, + hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + ) + ) + ); + + if (!is_qrwkv) { + // group norm with head_count groups + cur = ggml_reshape_3d(ctx0, cur, n_embd / n_head, n_head, n_tokens); + cur = ggml_norm(ctx0, cur, 64e-5f); + + // Convert back to regular vectors. + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); + } else { + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + } + + cur = ggml_mul(ctx0, cur, g); + cur = build_lora_mm(ctx0, layer->time_mix_output, cur); + + return cur; +} + // llama output size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { diff --git a/src/llama-context.h b/src/llama-context.h index 5958deaef..4cf4a6312 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -248,6 +248,33 @@ struct llama_context { int il, bool worst_case); + ggml_tensor * build_rwkv_token_shift_load( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case); + + ggml_tensor * build_rwkv_token_shift_store( + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il, + bool worst_case); + + ggml_tensor * build_rwkv6_time_mix( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * x_prev, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case); + struct ggml_tensor * inp_s_copy; // I32 [kv_size] struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] diff --git a/src/llama.cpp b/src/llama.cpp index 64a5efd2d..171ea2017 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -574,175 +574,34 @@ struct llm_build_context { return cur; } - //struct ggml_tensor * build_rwkv6_time_mix( - // const struct llama_layer * layer, - // struct ggml_tensor * cur, - // struct ggml_tensor * x_prev, - // struct ggml_tensor ** wkv_state, - // size_t wkv_head_size, - // size_t head_count_kv) { - // size_t n_embd = cur->ne[0]; - // size_t n_seq_tokens = cur->ne[1]; - // size_t n_seqs = cur->ne[2]; + struct ggml_tensor * build_rwkv_channel_mix( + 
const struct llama_layer * layer, + struct ggml_tensor * cur, + struct ggml_tensor * x_prev, + const llm_arch arch) { + struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + switch (arch) { + case LLM_ARCH_RWKV6: + { + struct ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); + struct ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); - // size_t head_size = wkv_head_size; - // size_t head_count = n_embd / head_size; + struct ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); + struct ggml_tensor * k = ggml_sqr( + ctx0, + ggml_relu( + ctx0, + build_lora_mm(layer->channel_mix_key, xk) + ) + ); + cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); + } break; + default: + GGML_ABORT("fatal error"); + } - // size_t n_tokens = n_seqs * n_seq_tokens; - - // bool is_qrwkv = layer->time_mix_first == nullptr; - - // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); - - // sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens); - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - - // struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); - - // xxx = ggml_reshape_4d( - // ctx0, - // ggml_tanh( - // ctx0, - // ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) - // ), - // layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens - // ); - - // xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); - - // xxx = ggml_mul_mat( - // ctx0, - // ggml_reshape_4d( - // ctx0, - // layer->time_mix_w2, - // layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 - // ), - // xxx - // ); - - // struct ggml_tensor *xw, *xk, *xv, *xr, *xg; - // if (layer->time_mix_lerp_fused) { - // // fusing these weights makes some performance improvement - // sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); - // cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); - // xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); - // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); - // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - // } else { - // // for backward compatibility - // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); - // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - - // xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); - // xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); - // xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); - // xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); - // xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); - // } - - // struct ggml_tensor * r = 
build_lora_mm(layer->time_mix_receptance, xr); - // struct ggml_tensor * k = build_lora_mm(layer->time_mix_key, xk); - // struct ggml_tensor * v = build_lora_mm(layer->time_mix_value, xv); - // if (layer->time_mix_receptance_b) { - // r = ggml_add(ctx0, r, layer->time_mix_receptance_b); - // } - // if (layer->time_mix_key_b) { - // k = ggml_add(ctx0, k, layer->time_mix_key_b); - // } - // if (layer->time_mix_value_b) { - // v = ggml_add(ctx0, v, layer->time_mix_value_b); - // } - - // struct ggml_tensor * g = build_lora_mm(layer->time_mix_gate, xg); - // if (is_qrwkv) { - // g = ggml_sigmoid(ctx0, g); - // } else { - // g = ggml_silu(ctx0, g); - // } - - // if (head_count_kv != head_count) { - // GGML_ASSERT(head_count % head_count_kv == 0); - // k = ggml_reshape_4d(ctx0, k, head_size, 1, head_count_kv, n_tokens); - // v = ggml_reshape_4d(ctx0, v, head_size, 1, head_count_kv, n_tokens); - // struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens); - // k = ggml_repeat(ctx0, k, tmp); - // v = ggml_repeat(ctx0, v, tmp); - // } - - // k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens); - // v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens); - // r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens); - - // struct ggml_tensor * w = ggml_mul_mat( - // ctx0, - // layer->time_mix_decay_w2, - // ggml_tanh( - // ctx0, - // ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) - // ) - // ); - - // w = ggml_add(ctx0, w, layer->time_mix_decay); - // w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); - // w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens); - - // if (is_qrwkv) { - // // k = k * (1 - w) - // k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w)); - // } - - // struct ggml_tensor * wkv_output; - // if (!layer->time_mix_first) { - // wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, *wkv_state, pow(head_size, -0.5f)); - // } else { - // wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer->time_mix_first, w, *wkv_state); - // } - // cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); - // *wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); - - // if (!is_qrwkv) { - // // group norm with head_count groups - // cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens); - // cur = ggml_norm(ctx0, cur, 64e-5f); - - // // Convert back to regular vectors. 
- // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - // cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); - // } else { - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - // } - - // cur = ggml_mul(ctx0, cur, g); - // cur = build_lora_mm(layer->time_mix_output, cur); - - // return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs); - //} - - //struct ggml_tensor * build_rwkv6_channel_mix( - // const struct llama_layer * layer, - // struct ggml_tensor * cur, - // struct ggml_tensor * x_prev) { - // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); - // struct ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); - // struct ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); - - // struct ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); - // struct ggml_tensor * k = ggml_sqr( - // ctx0, - // ggml_relu( - // ctx0, - // build_lora_mm(layer->channel_mix_key, xk) - // ) - // ); - - // return ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); - //} + return cur; + } struct ggml_cgraph * build_k_shift() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -6935,226 +6794,178 @@ struct llm_build_context { return gf; } - //ggml_cgraph * build_rwkv6() { - // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + ggml_cgraph * build_rwkv6() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // // Token shift state dimensions should be 2 * n_emb - // GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); + GGML_ASSERT(hparams.token_shift_count == 2); - // const int64_t n_seqs = ubatch.n_seqs; - // const int64_t n_seq_tokens = ubatch.n_seq_tokens; - // const int64_t n_tokens = ubatch.n_tokens; - // GGML_ASSERT(n_seqs != 0); - // GGML_ASSERT(ubatch.equal_seqs); - // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - // struct ggml_tensor * cur; - // struct ggml_tensor * inpL; - // struct ggml_tensor * state_copy = build_inp_s_copy(); - // struct ggml_tensor * state_mask = build_inp_s_mask(); + inpL = build_inp_embd(model.tok_embd); + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - // inpL = build_inp_embd(model.tok_embd); - // inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); + struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); - // for (int il = 0; il < n_layer; ++il) { - // const llama_layer * layer = &model.layers[il]; + const auto n_embd = hparams.n_embd; + const auto n_seq_tokens = ubatch.n_seq_tokens; + const auto n_seqs = ubatch.n_seqs; - // // (ab)using the KV cache to store the states - // struct ggml_tensor * token_shift = build_copy_mask_state( - // gf, kv_self.k_l[il], state_copy, state_mask, - // hparams.n_embd_k_s(), n_seqs); + for (int il = 0; il < n_layer; ++il) { + const llama_layer * layer = &model.layers[il]; - // struct ggml_tensor * wkv_states = build_copy_mask_state( - // gf, kv_self.v_l[il], state_copy, state_mask, - // hparams.n_embd_v_s(), n_seqs); + struct ggml_tensor * token_shift = lctx.build_rwkv_token_shift_load( + ctx0, gf, state_copy, state_mask, ubatch, il, worst_case + ); - // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 
2, n_seqs); + struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); + struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); - // struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); - // struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); + struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); + cb(att_norm, "attn_norm", il); - // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); - // struct ggml_tensor * x_prev = ggml_concat( - // ctx0, - // att_shift, - // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - // 1 - // ); + struct ggml_tensor * x_prev = ggml_concat( + ctx0, + att_shift, + ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), + 1 + ); - // cur = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size)); - // ggml_build_forward_expand(gf, cur); - // ggml_build_forward_expand( - // gf, - // ggml_cpy( - // ctx0, - // wkv_states, - // ggml_view_1d( - // ctx0, - // kv_self.v_l[il], - // hparams.n_embd_v_s() * n_seqs, - // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - // ) - // ) - // ); + cur = lctx.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); - // struct ggml_tensor * x_norm_ffn = build_norm(cur, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); - // x_prev = ggml_concat( - // ctx0, - // ffn_shift, - // ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0), - // 1 - // ); - // cur = ggml_add(ctx0, cur, build_rwkv6_channel_mix(layer, x_norm_ffn, x_prev)); - // ggml_build_forward_expand(gf, cur); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); - // struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); - // struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn)); + struct ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); + cb(ffn_norm, "ffn_norm", il); - // token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1); + x_prev = ggml_concat( + ctx0, + ffn_shift, + ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0), + 1 + ); - // ggml_build_forward_expand( - // gf, - // ggml_cpy( - // ctx0, - // ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0), - // ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) - // ) - // ); + cur = build_rwkv_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6); + cur = ggml_add(ctx0, cur, ffn_inp); - // if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { - 
// cur = ggml_scale(ctx0, cur, 0.5F); - // } + token_shift = ggml_concat(ctx0, + ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)), + ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), + 1 + ); + ggml_build_forward_expand(gf, lctx.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); - // cur = lctx.cvec.apply_to(ctx0, cur, il); - // cb(cur, "l_out", il); + if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { + cur = ggml_scale(ctx0, cur, 0.5F); + } - // // input for next layer - // inpL = cur; - // } + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); - // cur = inpL; - // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // input for next layer + inpL = cur; + } - // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); - // cb(cur, "result_norm", -1); + cur = inpL; + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); - // cur = build_lora_mm(model.output, cur); - // cb(cur, "result_output", -1); + cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); - // ggml_build_forward_expand(gf, cur); + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); - // return gf; - //} + ggml_build_forward_expand(gf, cur); + + return gf; + } // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py - //ggml_cgraph * build_rwkv6qwen2() { - // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + ggml_cgraph * build_rwkv6qwen2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // GGML_ASSERT(n_embd == hparams.n_embd_k_s()); + GGML_ASSERT(n_embd == hparams.n_embd_k_s()); - // const int64_t n_seqs = ubatch.n_seqs; - // const int64_t n_seq_tokens = ubatch.n_seq_tokens; - // const int64_t n_tokens = ubatch.n_tokens; - // GGML_ASSERT(n_seqs != 0); - // GGML_ASSERT(ubatch.equal_seqs); - // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - // struct ggml_tensor * cur; - // struct ggml_tensor * inpL; - // struct ggml_tensor * state_copy = build_inp_s_copy(); - // struct ggml_tensor * state_mask = build_inp_s_mask(); + inpL = build_inp_embd(model.tok_embd); - // inpL = build_inp_embd(model.tok_embd); + struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); - // for (int il = 0; il < n_layer; ++il) { - // const llama_layer * layer = &model.layers[il]; + const auto n_embd = hparams.n_embd; + const auto n_seq_tokens = ubatch.n_seq_tokens; + const auto n_seqs = ubatch.n_seqs; - // // (ab)using the KV cache to store the states - // struct ggml_tensor * token_shift = build_copy_mask_state( - // gf, kv_self.k_l[il], state_copy, state_mask, - // hparams.n_embd_k_s(), n_seqs); + inpL = build_inp_embd(model.tok_embd); - // struct ggml_tensor * wkv_states = build_copy_mask_state( - // gf, kv_self.v_l[il], state_copy, state_mask, - // hparams.n_embd_v_s(), n_seqs); + for (int il = 0; il < n_layer; ++il) { + const 
llama_layer * layer = &model.layers[il]; - // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs); + struct ggml_tensor * token_shift = lctx.build_rwkv_token_shift_load( + ctx0, gf, state_copy, state_mask, ubatch, il, worst_case + ); - // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); - // struct ggml_tensor * x_prev = ggml_concat( - // ctx0, - // token_shift, - // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - // 1 - // ); + struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); + cb(att_norm, "attn_norm", il); - // struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); - // ggml_build_forward_expand( - // gf, - // ggml_cpy( - // ctx0, - // ggml_view_1d(ctx0, last_norm_att, n_embd * n_seqs, 0), - // ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) + struct ggml_tensor * x_prev = ggml_concat( + ctx0, + token_shift, + ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), + 1 + ); - // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv())); - // ggml_build_forward_expand(gf, ffn_inp); - // ggml_build_forward_expand( - // gf, - // ggml_cpy( - // ctx0, - // wkv_states, - // ggml_view_1d( - // ctx0, - // kv_self.v_l[il], - // hparams.n_embd_v_s() * n_seqs, - // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - // ) - // ) - // ); + cur = lctx.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); - // cb(ffn_inp, "ffn_inp", il); + token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); + ggml_build_forward_expand(gf, lctx.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); - // // feed-forward network - // cur = build_norm(ffn_inp, - // model.layers[il].ffn_norm, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "ffn_norm", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); - // cur = build_ffn(cur, - // model.layers[il].ffn_up, NULL, NULL, - // model.layers[il].ffn_gate, NULL, NULL, - // model.layers[il].ffn_down, NULL, NULL, - // NULL, - // LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - // cb(cur, "ffn_out", il); + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); - // cur = ggml_add(ctx0, cur, ffn_inp); - // cur = lctx.cvec.apply_to(ctx0, cur, il); - // cb(cur, "l_out", il); + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); - // // input for next layer - // inpL = cur; - // } + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); - // cur = inpL; - // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - // cur = 
ggml_get_rows(ctx0, cur, inp_out_ids); + // input for next layer + inpL = cur; + } - // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); - // cb(cur, "result_norm", -1); + cur = inpL; + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); - // cur = build_lora_mm(model.output, cur); - // cb(cur, "result_output", -1); + cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); - // ggml_build_forward_expand(gf, cur); + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); - // return gf; - //} + ggml_build_forward_expand(gf, cur); + + return gf; + } // ref: https://github.com/facebookresearch/chameleon // based on the original build_llama() function, changes: @@ -7726,14 +7537,14 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_exaone(); } break; - //case LLM_ARCH_RWKV6: - // { - // result = llm.build_rwkv6(); - // } break; - //case LLM_ARCH_RWKV6QWEN2: - // { - // result = llm.build_rwkv6qwen2(); - // } break; + case LLM_ARCH_RWKV6: + { + result = llm.build_rwkv6(); + } break; + case LLM_ARCH_RWKV6QWEN2: + { + result = llm.build_rwkv6qwen2(); + } break; case LLM_ARCH_CHAMELEON: { result = llm.build_chameleon(); From e0d913fccbffe7913b2fa6a00590ca68800c9b59 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Feb 2025 10:02:50 +0200 Subject: [PATCH 22/28] llama : clear whitespaces --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 171ea2017..f03386af4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6850,7 +6850,7 @@ struct llm_build_context { cur = build_rwkv_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6); cur = ggml_add(ctx0, cur, ffn_inp); - token_shift = ggml_concat(ctx0, + token_shift = ggml_concat(ctx0, ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)), ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), 1 From b15fede7a9a044d0a15da03b9ceb08f7007bfc95 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Feb 2025 14:34:45 +0200 Subject: [PATCH 23/28] kv-cache : fix defrag condition ggml-ci --- src/llama-context.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 3bc0513ca..719622eaa 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -532,11 +532,13 @@ struct llama_batch_manager : public llama_batch_manager_i { // decide if we need to defrag the kv cache if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) { - const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f; + // - do not defrag small contexts (i.e. < 2048 tokens) + // - count the padding towards the number of used tokens + const float fragmentation = kv_self.n >= 2048 ? 
1.0f - float(kv_self.used + lctx.get_ctx_padding(cparams))/float(kv_self.n) : 0.0f; // queue defragmentation for next llama_kv_cache_update if (fragmentation > cparams.defrag_thold) { - //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); + LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); kv_self.defrag(); } From f9971ef2e1754f8dde65d5fc0602b7719a0c5326 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 14:59:51 +0200 Subject: [PATCH 24/28] llama : dedup reserve code --- src/llama.cpp | 50 ++------------------------------------------------ 1 file changed, 2 insertions(+), 48 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 465938cf0..e89e70bbe 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7629,30 +7629,6 @@ static int llama_decode_impl( return -3; } - // reserve a worst case graph if needed - // TODO: extract to a function - if (lctx.need_reserve) { - const auto & cparams = lctx.cparams; - const auto & model = lctx.model; - - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched.get()); - if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - - lctx.need_reserve = false; - } - ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -7889,30 +7865,8 @@ static int llama_encode_impl( //batch_manager->prepare(ubatch); - // reserve a worst case graph if needed - // TODO: extract to a function - if (lctx.need_reserve) { - // TODO: extract to a function - const auto & cparams = lctx.cparams; - const auto & model = lctx.model; - - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched.get()); - if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - - lctx.need_reserve = false; - } + // TODO: do reserve + GGML_ASSERT(lctx.need_reserve == false); ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); From 879ba82777b93f30c32eca731d0bf03e7fd20be7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 15:00:02 +0200 Subject: [PATCH 25/28] server : increase context size for the tests ggml-ci --- examples/server/tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/examples/server/tests/utils.py b/examples/server/tests/utils.py index ce0680662..97d650a9c 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -280,7 +280,7 @@ class ServerPreset: server.model_hf_repo = "ggml-org/models" server.model_hf_file = "tinyllamas/stories260K.gguf" server.model_alias = "tinyllama-2" - server.n_ctx = 256 + server.n_ctx = 512 server.n_batch = 32 server.n_slots = 2 server.n_predict = 64 From ef358ee78f08e4d7af3916e0d101925c5bc6e122 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 16:11:17 +0200 Subject: [PATCH 26/28] context : add decode/encode ggml-ci --- src/llama-context.cpp | 630 ++++++++++++++++++++++++++++++++++-------- src/llama-context.h | 32 +-- src/llama.cpp | 386 +------------------------- 3 files changed, 526 insertions(+), 522 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7705d583b..5d21dd5ef 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -9,6 +9,121 @@ #include #include +// llama output (TMP) + +// Make sure enough space is available for outputs. +// Returns max number of outputs for which space was reserved. +static size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { + const auto & cparams = lctx.cparams; + const auto & hparams = lctx.model.hparams; + const auto & vocab = lctx.model.vocab; + + const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); + + const auto n_batch = cparams.n_batch; + const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; + + // TODO: use a per-batch flag for logits presence instead + const bool has_logits = !cparams.embeddings; + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + + const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; + const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0; + + if (lctx.output_ids.empty()) { + // init, never resized afterwards + lctx.output_ids.resize(n_batch); + } + + const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; + const size_t new_size = (logits_size + embd_size) * sizeof(float); + + // alloc only when more than the current capacity is required + // TODO: also consider shrinking the buffer + if (!lctx.buf_output || prev_size < new_size) { + if (lctx.buf_output) { +#ifndef NDEBUG + // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) + LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + lctx.buf_output = nullptr; + lctx.logits = nullptr; + lctx.embd = nullptr; + } + + auto * buft = ggml_backend_cpu_buffer_type(); + // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory + auto * output_dev = lctx.model.dev_output(); + auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; + if (output_dev_host_buft) { + buft = output_dev_host_buft; + } + lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); + if (lctx.buf_output == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); + return 0; + } + } + + float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get()); + + lctx.logits = has_logits ? output_base : nullptr; + lctx.embd = has_embd ? 
output_base + logits_size : nullptr; + + lctx.output_size = n_outputs_max; + lctx.logits_size = logits_size; + lctx.embd_size = embd_size; + + // set all ids as invalid (negative) + std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1); + + ggml_backend_buffer_clear(lctx.buf_output.get(), 0); + + lctx.n_outputs = 0; + + return n_outputs_max; +} + +// make the outputs have the same order they had in the user-provided batch +static void llama_output_reorder(struct llama_context & ctx) { + std::vector & out_ids = ctx.sbatch.out_ids; + if (!out_ids.empty()) { + const uint32_t n_vocab = ctx.model.vocab.n_tokens(); + const uint32_t n_embd = ctx.model.hparams.n_embd; + + const int32_t n_outputs = ctx.n_outputs; + GGML_ASSERT((size_t) n_outputs == out_ids.size()); + + // TODO: is there something more efficient which also minimizes swaps? + // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) + for (int32_t i = 0; i < n_outputs - 1; ++i) { + int32_t j_min = i; + for (int32_t j = i + 1; j < n_outputs; ++j) { + if (out_ids[j] < out_ids[j_min]) { + j_min = j; + } + } + if (j_min == i) { continue; } + std::swap(out_ids[i], out_ids[j_min]); + if (ctx.logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(ctx.logits[i*n_vocab + k], ctx.logits[j_min*n_vocab + k]); + } + } + if (ctx.embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(ctx.embd[i*n_embd + k], ctx.embd[j_min*n_embd + k]); + } + } + } + std::fill(ctx.output_ids.begin(), ctx.output_ids.end(), -1); + for (int32_t i = 0; i < n_outputs; ++i) { + ctx.output_ids[out_ids[i]] = i; + } + out_ids.clear(); + } +} static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { // TODO move to hparams if a T5 variant appears that uses a different value const int64_t max_distance = 128; @@ -340,6 +455,20 @@ llama_context::llama_context( } +struct llama_batch_manager_i { + virtual ~llama_batch_manager_i() = default; + + virtual bool is_done() const = 0; + virtual llama_ubatch next() = 0; + virtual bool prepare() = 0; + virtual void restore() = 0; + virtual void update() = 0; + virtual void finalize() = 0; + + // TODO: might be temporary + int64_t n_outputs_all = 0; +}; + struct llama_batch_manager : public llama_batch_manager_i { llama_batch_manager(llama_context & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { const auto & model = lctx.model; @@ -398,6 +527,10 @@ struct llama_batch_manager : public llama_batch_manager_i { ~llama_batch_manager() override { } + virtual bool is_done() const override { + return lctx.sbatch.n_tokens == 0; + } + virtual llama_ubatch next() override { ubatch = llama_ubatch(); @@ -558,6 +691,390 @@ std::unique_ptr llama_context::prepare_batch(const llama_ return std::make_unique(*this, batch); } +int llama_context::decode(llama_batch & inp_batch) { + is_encoding = false; + + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); + + const llama_batch & batch = batch_allocr.batch; + + const auto & vocab = model.vocab; + const auto & hparams = model.hparams; + + const int32_t n_vocab = vocab.n_tokens(); + const int64_t n_embd = hparams.n_embd; + + // TODO: try catch + auto bman = prepare_batch(batch); + + const auto n_outputs_all = bman->n_outputs_all; + + // reserve output buffer + // TODO: move to batch manager? + if (llama_output_reserve(*this, bman->n_outputs_all) < (size_t) n_outputs_all) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + return -2; + }; + + int64_t n_outputs_prev = 0; + + while (!bman->is_done()) { + llama_ubatch ubatch = bman->next(); + + if (!bman->prepare()) { + LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); + bman->restore(); + return -3; + } + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + ggml_cgraph * gf = cb_build_graph(*this, ubatch, false); + + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + set_inputs(ubatch); + + // the output is always the last tensor in the graph + struct ggml_tensor * t_logits = ggml_graph_node(gf, -1); + struct ggml_tensor * t_embd = ggml_graph_node(gf, -2); + + if (n_outputs == 0) { + // no output + t_logits = nullptr; + t_embd = nullptr; + } else if (cparams.embeddings) { + t_logits = nullptr; // do not extract logits for embedding case + t_embd = nullptr; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { + t_embd = ggml_graph_node(gf, i); + break; + } + } + GGML_ASSERT(t_embd != nullptr && "missing embeddings tensor"); + } else { + t_embd = nullptr; // do not extract embeddings when not needed + GGML_ASSERT(strcmp(t_logits->name, "result_output") == 0 && "missing result_output tensor"); + } + + const auto compute_status = compute_graph(gf, ubatch.n_tokens > 1); + if (compute_status != GGML_STATUS_SUCCESS) { + bman->restore(); + switch (compute_status) { + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + } + + bman->update(); + + // plot the computation graph in dot format (for debugging purposes) + //if (n_past%100 == 0) { + // ggml_graph_dump_dot(gf, NULL, "llama.dot"); + //} + + // extract logits + if (t_logits) { + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); + GGML_ASSERT(backend_res != nullptr); + GGML_ASSERT(logits != nullptr); + + float * logits_out = logits + n_outputs_prev*n_vocab; + const int32_t n_outputs_new = n_outputs; + + if (n_outputs_new) { + GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) logits_size); + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs_new*n_vocab*sizeof(float)); + } + } + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd + n_outputs_prev*n_embd; + const int32_t n_outputs_new = n_outputs; + 
+ if (n_outputs_new) { + GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings (cleared before processing each batch) + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // extract the rerank score - a single float per sequence + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(1); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + + n_outputs_prev += n_outputs; + } + + // set output mappings + { + bool sorted_output = true; + + GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); + + for (size_t i = 0; i < (size_t) n_outputs_all; ++i) { + size_t out_id = sbatch.out_ids[i]; + output_ids[out_id] = i; + if (out_id != i) { + sorted_output = false; + } + } + + if (sorted_output) { + sbatch.out_ids.clear(); + } + } + + // set to total number of outputs in the batch, for use in llama_get_logits_ith + n_outputs = n_outputs_all; + + // wait for the computation to finish (automatically done when obtaining the model output) + //llama_synchronize(&; + + bman->finalize(); + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); + + return 0; +} + +int llama_context::encode(llama_batch & inp_batch) { + is_encoding = true; + + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); + + const llama_batch & batch = batch_allocr.batch; + const uint32_t n_tokens = batch.n_tokens; + + const auto & hparams = model.hparams; + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (uint32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return -1; + } + } + } + + // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot + GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); + + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } + + n_queued_tokens += n_tokens; + + const int64_t n_embd = hparams.n_embd; + + sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); + + const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + + // reserve output buffer + if (llama_output_reserve(*this, n_tokens) < n_tokens) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + return -2; + }; + + for (uint32_t i = 0; i < n_tokens; ++i) { + output_ids[i] = i; + } + + inp_embd_enc = NULL; + n_outputs = n_tokens; + + //batch_manager->prepare(ubatch); + + // TODO: do reserve + GGML_ASSERT(need_reserve == false); + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + ggml_cgraph * gf = cb_build_graph(*this, ubatch, false); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + set_inputs(ubatch); + + // the output embeddings after the final encoder normalization + struct ggml_tensor * t_embd = nullptr; + + // there are two cases here + if (llama_model_has_decoder(&model)) { + // first case is an encoder-decoder T5 model where embeddings are passed to decoder + t_embd = ggml_graph_node(gf, -1); + GGML_ASSERT(strcmp(t_embd->name, "result_norm") == 0 && "missing result_output tensor"); + } else { + // second case is an encoder-only T5 model + if (cparams.embeddings) { + // only output embeddings if required + t_embd = ggml_graph_node(gf, -1); + if (strcmp(t_embd->name, "result_embd_pooled") != 0) { + t_embd = ggml_graph_node(gf, -2); + } + GGML_ASSERT(strcmp(t_embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); + } + } + + const auto compute_status = compute_graph(gf, n_tokens > 1); + switch (compute_status) { + case GGML_STATUS_SUCCESS: + break; + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + if (llama_model_has_decoder(&model)) { + embd_enc.resize(n_tokens*n_embd); + float * embd_out = embd_enc.data(); + + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + // remember the sequence ids used during the encoding - needed for cross attention later + seq_ids_enc.resize(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + for (int s = 0; s < ubatch.n_seq_id[i]; s++) { + llama_seq_id seq_id = ubatch.seq_id[i][s]; + seq_ids_enc[i].insert(seq_id); + } + } + } else { + GGML_ASSERT(embd != nullptr); + + switch 
(cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd; + + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = embd_seq; + embd_seq_out.clear(); + + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + for (uint32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // TODO: this likely should be the same logic as in llama_decoder_internal, but better to + // wait for an encoder model that requires this pooling type in order to test it + // https://github.com/ggerganov/llama.cpp/pull/9510 + GGML_ABORT("RANK pooling not implemented yet"); + } + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + } + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); + + return 0; +} + enum ggml_status llama_context::compute_graph( ggml_cgraph * graph, bool batched) { @@ -2194,119 +2711,6 @@ ggml_tensor * llama_context::build_rwkv6_time_mix( return cur; } -// llama output - -size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - const auto & vocab = lctx.model.vocab; - - const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); - - const auto n_batch = cparams.n_batch; - const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; - - // TODO: use a per-batch flag for logits presence instead - const bool has_logits = !cparams.embeddings; - const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); - - const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; - const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0; - - if (lctx.output_ids.empty()) { - // init, never resized afterwards - lctx.output_ids.resize(n_batch); - } - - const size_t prev_size = lctx.buf_output ? 
ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; - const size_t new_size = (logits_size + embd_size) * sizeof(float); - - // alloc only when more than the current capacity is required - // TODO: also consider shrinking the buffer - if (!lctx.buf_output || prev_size < new_size) { - if (lctx.buf_output) { -#ifndef NDEBUG - // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) - LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); -#endif - lctx.buf_output = nullptr; - lctx.logits = nullptr; - lctx.embd = nullptr; - } - - auto * buft = ggml_backend_cpu_buffer_type(); - // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory - auto * output_dev = lctx.model.dev_output(); - auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; - if (output_dev_host_buft) { - buft = output_dev_host_buft; - } - lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); - if (lctx.buf_output == nullptr) { - LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); - return 0; - } - } - - float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get()); - - lctx.logits = has_logits ? output_base : nullptr; - lctx.embd = has_embd ? output_base + logits_size : nullptr; - - lctx.output_size = n_outputs_max; - lctx.logits_size = logits_size; - lctx.embd_size = embd_size; - - // set all ids as invalid (negative) - std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1); - - ggml_backend_buffer_clear(lctx.buf_output.get(), 0); - - lctx.n_outputs = 0; - - return n_outputs_max; -} - -void llama_output_reorder(struct llama_context & ctx) { - std::vector & out_ids = ctx.sbatch.out_ids; - if (!out_ids.empty()) { - const uint32_t n_vocab = ctx.model.vocab.n_tokens(); - const uint32_t n_embd = ctx.model.hparams.n_embd; - - const int32_t n_outputs = ctx.n_outputs; - GGML_ASSERT((size_t) n_outputs == out_ids.size()); - - // TODO: is there something more efficient which also minimizes swaps? 
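[Editor's aside - not part of the patch] The selection sort below keeps out_ids and the corresponding logits/embd rows in sync, then rebuilds output_ids so a batch position can be mapped back to an output row. A minimal standalone sketch of that contract, assuming toy values and a single float per row in place of full logit/embedding rows:

    #include <cstdio>
    #include <cstdint>
    #include <utility>
    #include <vector>

    int main() {
        // out_ids[i] = batch position that produced output row i (toy values)
        std::vector<size_t> out_ids = {2, 0, 3, 1};
        std::vector<float>  rows    = {20.f, 0.f, 30.f, 10.f}; // one value per row for brevity

        // selection sort by batch position, swapping the data rows alongside the ids
        for (size_t i = 0; i + 1 < out_ids.size(); ++i) {
            size_t j_min = i;
            for (size_t j = i + 1; j < out_ids.size(); ++j) {
                if (out_ids[j] < out_ids[j_min]) {
                    j_min = j;
                }
            }
            if (j_min != i) {
                std::swap(out_ids[i], out_ids[j_min]);
                std::swap(rows[i], rows[j_min]);
            }
        }

        // rebuild the batch-position -> row mapping
        std::vector<int32_t> output_ids(4, -1);
        for (int32_t i = 0; i < (int32_t) out_ids.size(); ++i) {
            output_ids[out_ids[i]] = i;
        }

        // rows is now {0, 10, 20, 30}, i.e. batch order
        for (size_t i = 0; i < rows.size(); ++i) {
            std::printf("row %zu: out_id=%zu value=%.0f\n", i, out_ids[i], rows[i]);
        }
        for (size_t i = 0; i < output_ids.size(); ++i) {
            std::printf("batch pos %zu -> row %d\n", i, output_ids[i]);
        }
        return 0;
    }

Selection sort is used here because it minimizes row swaps, and each real swap moves n_vocab or n_embd floats.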
- // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { - if (out_ids[j] < out_ids[j_min]) { - j_min = j; - } - } - if (j_min == i) { continue; } - std::swap(out_ids[i], out_ids[j_min]); - if (ctx.logits_size > 0) { - for (uint32_t k = 0; k < n_vocab; k++) { - std::swap(ctx.logits[i*n_vocab + k], ctx.logits[j_min*n_vocab + k]); - } - } - if (ctx.embd_size > 0) { - for (uint32_t k = 0; k < n_embd; k++) { - std::swap(ctx.embd[i*n_embd + k], ctx.embd[j_min*n_embd + k]); - } - } - } - std::fill(ctx.output_ids.begin(), ctx.output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { - ctx.output_ids[out_ids[i]] = i; - } - out_ids.clear(); - } -} - // // interface implementation // diff --git a/src/llama-context.h b/src/llama-context.h index 4cf4a6312..f6d63eb3c 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -16,22 +16,7 @@ using llama_loras = std::unordered_map; -// TODO: this is very WIP - improve -struct llama_batch_manager_i { - virtual ~llama_batch_manager_i() = default; - - //bool is_done() const; - - virtual llama_ubatch next() = 0; - - virtual bool prepare() = 0; - virtual void restore() = 0; - virtual void update() = 0; - virtual void finalize() = 0; - - // TODO: might be temporary - int64_t n_outputs_all = 0; -}; +struct llama_batch_manager_i; // TODO: make implementation details private // TODO: become abstract base class, split the current implementation into different child classes @@ -44,6 +29,8 @@ struct llama_context { const llama_context_params & params, build_graph_callback && cb_build_graph); + virtual ~llama_context() = default; + const struct llama_model & model; llama_cparams cparams; @@ -104,8 +91,10 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; - // TODO: do not pass logits_all explicitly - std::unique_ptr prepare_batch(const llama_batch & batch); + virtual std::unique_ptr prepare_batch(const llama_batch & batch); + + virtual int decode(llama_batch & inp_batch); + virtual int encode(llama_batch & inp_batch); // returns the result of ggml_backend_sched_graph_compute_async execution enum ggml_status compute_graph( @@ -286,13 +275,6 @@ struct llama_context { int n_pos_per_token = 1; }; -// Make sure enough space is available for outputs. -// Returns max number of outputs for which space was reserved. -size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs); - -// make the outputs have the same order they had in the user-provided batch -void llama_output_reorder(struct llama_context & ctx); - // For internal test use // TODO: remove const std::vector> & llama_internal_get_tensor_map(struct llama_context * ctx); diff --git a/src/llama.cpp b/src/llama.cpp index e89e70bbe..ed5e1e525 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7584,213 +7584,7 @@ static struct ggml_cgraph * llama_build_graph( static int llama_decode_impl( llama_context & lctx, llama_batch inp_batch) { - - lctx.is_encoding = false; - - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } - - // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : lctx.pos_max() + 1); - - const llama_batch & batch = batch_allocr.batch; - - const auto & model = lctx.model; - const auto & vocab = model.vocab; - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - - const int32_t n_vocab = vocab.n_tokens(); - const int64_t n_embd = hparams.n_embd; - - // TODO: try catch - auto bman = lctx.prepare_batch(batch); - - const auto n_outputs_all = bman->n_outputs_all; - - // reserve output buffer - // TODO: move to batch manager? - if (llama_output_reserve(lctx, bman->n_outputs_all) < (size_t) n_outputs_all) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); - return -2; - }; - - int64_t n_outputs_prev = 0; - - while (lctx.sbatch.n_tokens > 0) { - llama_ubatch ubatch = bman->next(); - - if (!bman->prepare()) { - LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); - bman->restore(); - return -3; - } - - ggml_backend_sched_reset(lctx.sched.get()); - ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); - - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); - - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - lctx.set_inputs(ubatch); - - // the output is always the last tensor in the graph - struct ggml_tensor * res = ggml_graph_node(gf, -1); - struct ggml_tensor * embd = ggml_graph_node(gf, -2); - - if (lctx.n_outputs == 0) { - // no output - res = nullptr; - embd = nullptr; - } else if (cparams.embeddings) { - res = nullptr; // do not extract logits for embedding case - embd = nullptr; - for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { - if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { - embd = ggml_graph_node(gf, i); - break; - } - } - GGML_ASSERT(embd != nullptr && "missing embeddings tensor"); - } else { - embd = nullptr; // do not extract embeddings when not needed - GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); - } - - const auto compute_status = lctx.compute_graph(gf, ubatch.n_tokens > 1); - if (compute_status != GGML_STATUS_SUCCESS) { - bman->restore(); - switch (compute_status) { - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; - } - } - - bman->update(); - - // plot the computation graph in dot format (for debugging purposes) - //if (n_past%100 == 0) { - // ggml_graph_dump_dot(gf, NULL, "llama.dot"); - //} - - // extract logits - if (res) { - ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), res); - GGML_ASSERT(backend_res != nullptr); - GGML_ASSERT(lctx.logits != nullptr); - - float * logits_out = lctx.logits + n_outputs_prev*n_vocab; - const int32_t n_outputs_new = lctx.n_outputs; - - if (n_outputs_new) { - GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size); - ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float)); - } - } - - // extract embeddings - if (embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd); - GGML_ASSERT(backend_embd != nullptr); - - switch (cparams.pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // extract token embeddings - 
GGML_ASSERT(lctx.embd != nullptr); - float * embd_out = lctx.embd + n_outputs_prev*n_embd; - const int32_t n_outputs_new = lctx.n_outputs; - - if (n_outputs_new) { - GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size); - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_MEAN: - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - // extract sequence embeddings (cleared before processing each batch) - auto & embd_seq_out = lctx.embd_seq; - - for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_RANK: - { - // extract the rerank score - a single float per sequence - auto & embd_seq_out = lctx.embd_seq; - - for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(1); - ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ABORT("unknown pooling type"); - } - } - } - - n_outputs_prev += lctx.n_outputs; - } - - // set output mappings - { - bool sorted_output = true; - - GGML_ASSERT(lctx.sbatch.out_ids.size() == (size_t) n_outputs_all); - - for (size_t i = 0; i < (size_t) n_outputs_all; ++i) { - size_t out_id = lctx.sbatch.out_ids[i]; - lctx.output_ids[out_id] = i; - if (out_id != i) { - sorted_output = false; - } - } - - if (sorted_output) { - lctx.sbatch.out_ids.clear(); - } - } - - // set to total number of outputs in the batch, for use in llama_get_logits_ith - lctx.n_outputs = n_outputs_all; - - // wait for the computation to finish (automatically done when obtaining the model output) - //llama_synchronize(&lctx); - - bman->finalize(); - - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - ggml_backend_sched_reset(lctx.sched.get()); - - return 0; + return lctx.decode(inp_batch); } // encode a batch of tokens by evaluating the encoder part of the transformer @@ -7805,183 +7599,7 @@ static int llama_decode_impl( static int llama_encode_impl( llama_context & lctx, llama_batch inp_batch) { - - lctx.is_encoding = true; - - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } - - // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : lctx.pos_max() + 1); - - const llama_batch & batch = batch_allocr.batch; - const uint32_t n_tokens = batch.n_tokens; - - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT - - if (batch.token) { - for (uint32_t i = 0; i < n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); - return -1; - } - } - } - - // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot - GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); - - if (lctx.t_compute_start_us == 0) { - lctx.t_compute_start_us = ggml_time_us(); - } - - lctx.n_queued_tokens += n_tokens; - - const int64_t n_embd = hparams.n_embd; - - lctx.sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); - - const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens); - - // reserve output buffer - if (llama_output_reserve(lctx, n_tokens) < n_tokens) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); - return -2; - }; - - for (uint32_t i = 0; i < n_tokens; ++i) { - lctx.output_ids[i] = i; - } - - lctx.inp_embd_enc = NULL; - lctx.n_outputs = n_tokens; - - //batch_manager->prepare(ubatch); - - // TODO: do reserve - GGML_ASSERT(lctx.need_reserve == false); - - ggml_backend_sched_reset(lctx.sched.get()); - ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); - - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); - - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - lctx.set_inputs(ubatch); - - // the output embeddings after the final encoder normalization - struct ggml_tensor * embd = nullptr; - - // there are two cases here - if (llama_model_has_decoder(&lctx.model)) { - // first case is an encoder-decoder T5 model where embeddings are passed to decoder - embd = ggml_graph_node(gf, -1); - GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor"); - } else { - // second case is an encoder-only T5 model - if (cparams.embeddings) { - // only output embeddings if required - embd = ggml_graph_node(gf, -1); - if (strcmp(embd->name, "result_embd_pooled") != 0) { - embd = ggml_graph_node(gf, -2); - } - GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); - } - } - - const auto compute_status = lctx.compute_graph(gf, n_tokens > 1); - switch (compute_status) { - case GGML_STATUS_SUCCESS: - break; - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; - } - - // extract embeddings - if (embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd); - GGML_ASSERT(backend_embd != nullptr); - - if (llama_model_has_decoder(&lctx.model)) { - lctx.embd_enc.resize(n_tokens*n_embd); - float * embd_out = lctx.embd_enc.data(); - - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - - // remember the sequence ids used during the encoding - needed for cross attention later - lctx.seq_ids_enc.resize(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - for (int s = 0; s < 
ubatch.n_seq_id[i]; s++) { - llama_seq_id seq_id = ubatch.seq_id[i][s]; - lctx.seq_ids_enc[i].insert(seq_id); - } - } - } else { - GGML_ASSERT(lctx.embd != nullptr); - - switch (cparams.pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // extract token embeddings - GGML_ASSERT(lctx.embd != nullptr); - float * embd_out = lctx.embd; - - GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size); - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - } break; - case LLAMA_POOLING_TYPE_MEAN: - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - // extract sequence embeddings - auto & embd_seq_out = lctx.embd_seq; - embd_seq_out.clear(); - - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - - for (uint32_t i = 0; i < n_tokens; i++) { - const llama_seq_id seq_id = ubatch.seq_id[i][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_RANK: - { - // TODO: this likely should be the same logic as in llama_decoder_internal, but better to - // wait for an encoder model that requires this pooling type in order to test it - // https://github.com/ggerganov/llama.cpp/pull/9510 - GGML_ABORT("RANK pooling not implemented yet"); - } - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ABORT("unknown pooling type"); - } - } - } - } - - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - ggml_backend_sched_reset(lctx.sched.get()); - - return 0; + return lctx.encode(inp_batch); } // From d1d8d530083a9bf3ada2427bf59e97fa58667365 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 16:50:14 +0200 Subject: [PATCH 27/28] bman : remove ubatch member ggml-ci --- src/llama-context.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 5d21dd5ef..4387128fe 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -460,9 +460,9 @@ struct llama_batch_manager_i { virtual bool is_done() const = 0; virtual llama_ubatch next() = 0; - virtual bool prepare() = 0; + virtual bool prepare(const llama_ubatch & ubatch) = 0; virtual void restore() = 0; - virtual void update() = 0; + virtual void update(const llama_ubatch & ubatch) = 0; virtual void finalize() = 0; // TODO: might be temporary @@ -532,7 +532,7 @@ struct llama_batch_manager : public llama_batch_manager_i { } virtual llama_ubatch next() override { - ubatch = llama_ubatch(); + llama_ubatch ubatch = llama_ubatch(); const auto & cparams = lctx.cparams; const auto & kv_self = lctx.kv_self; @@ -557,7 +557,7 @@ struct llama_batch_manager : public llama_batch_manager_i { return ubatch; } - virtual bool prepare() override { + virtual bool prepare(const llama_ubatch & ubatch) override { const auto & cparams = lctx.cparams; const auto & hparams = lctx.model.hparams; const auto & batch = lctx.sbatch.batch; @@ -644,7 +644,7 @@ struct llama_batch_manager : public llama_batch_manager_i { kv_slot_restorer.restore(lctx.kv_self); } - virtual void update() override { + virtual void update(const llama_ubatch & ubatch) override { auto & kv_self = lctx.kv_self; // update the kv ring buffer @@ -682,8 +682,6 @@ struct llama_batch_manager : public llama_batch_manager_i { const llama_batch & 
batch; - llama_ubatch ubatch; - llama_kv_slot_restorer kv_slot_restorer; }; @@ -728,7 +726,7 @@ int llama_context::decode(llama_batch & inp_batch) { while (!bman->is_done()) { llama_ubatch ubatch = bman->next(); - if (!bman->prepare()) { + if (!bman->prepare(ubatch)) { LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); bman->restore(); return -3; @@ -782,7 +780,7 @@ int llama_context::decode(llama_batch & inp_batch) { } } - bman->update(); + bman->update(ubatch); // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { From 2cd8a903c84b9fbf91f256a6349e05e492a47421 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 17:01:27 +0200 Subject: [PATCH 28/28] context : make output functions members ggml-ci --- src/llama-context.cpp | 238 ++++++++++++++++++++---------------------- src/llama-context.h | 8 ++ 2 files changed, 122 insertions(+), 124 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4387128fe..87d6642da 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -9,121 +9,6 @@ #include #include -// llama output (TMP) - -// Make sure enough space is available for outputs. -// Returns max number of outputs for which space was reserved. -static size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - const auto & vocab = lctx.model.vocab; - - const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); - - const auto n_batch = cparams.n_batch; - const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; - - // TODO: use a per-batch flag for logits presence instead - const bool has_logits = !cparams.embeddings; - const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); - - const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; - const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0; - - if (lctx.output_ids.empty()) { - // init, never resized afterwards - lctx.output_ids.resize(n_batch); - } - - const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; - const size_t new_size = (logits_size + embd_size) * sizeof(float); - - // alloc only when more than the current capacity is required - // TODO: also consider shrinking the buffer - if (!lctx.buf_output || prev_size < new_size) { - if (lctx.buf_output) { -#ifndef NDEBUG - // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) - LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); -#endif - lctx.buf_output = nullptr; - lctx.logits = nullptr; - lctx.embd = nullptr; - } - - auto * buft = ggml_backend_cpu_buffer_type(); - // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory - auto * output_dev = lctx.model.dev_output(); - auto * output_dev_host_buft = output_dev ? 
ggml_backend_dev_host_buffer_type(output_dev) : nullptr; - if (output_dev_host_buft) { - buft = output_dev_host_buft; - } - lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); - if (lctx.buf_output == nullptr) { - LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); - return 0; - } - } - - float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get()); - - lctx.logits = has_logits ? output_base : nullptr; - lctx.embd = has_embd ? output_base + logits_size : nullptr; - - lctx.output_size = n_outputs_max; - lctx.logits_size = logits_size; - lctx.embd_size = embd_size; - - // set all ids as invalid (negative) - std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1); - - ggml_backend_buffer_clear(lctx.buf_output.get(), 0); - - lctx.n_outputs = 0; - - return n_outputs_max; -} - -// make the outputs have the same order they had in the user-provided batch -static void llama_output_reorder(struct llama_context & ctx) { - std::vector & out_ids = ctx.sbatch.out_ids; - if (!out_ids.empty()) { - const uint32_t n_vocab = ctx.model.vocab.n_tokens(); - const uint32_t n_embd = ctx.model.hparams.n_embd; - - const int32_t n_outputs = ctx.n_outputs; - GGML_ASSERT((size_t) n_outputs == out_ids.size()); - - // TODO: is there something more efficient which also minimizes swaps? - // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { - if (out_ids[j] < out_ids[j_min]) { - j_min = j; - } - } - if (j_min == i) { continue; } - std::swap(out_ids[i], out_ids[j_min]); - if (ctx.logits_size > 0) { - for (uint32_t k = 0; k < n_vocab; k++) { - std::swap(ctx.logits[i*n_vocab + k], ctx.logits[j_min*n_vocab + k]); - } - } - if (ctx.embd_size > 0) { - for (uint32_t k = 0; k < n_embd; k++) { - std::swap(ctx.embd[i*n_embd + k], ctx.embd[j_min*n_embd + k]); - } - } - } - std::fill(ctx.output_ids.begin(), ctx.output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { - ctx.output_ids[out_ids[i]] = i; - } - out_ids.clear(); - } -} static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { // TODO move to hparams if a T5 variant appears that uses a different value const int64_t max_distance = 128; @@ -334,7 +219,7 @@ llama_context::llama_context( // graph outputs buffer { // resized during inference when a batch uses more outputs - if (llama_output_reserve(*this, params.n_seq_max) < params.n_seq_max) { + if (reserve_outputs(params.n_seq_max) < params.n_seq_max) { LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); throw std::runtime_error("failed to reserve initial output buffer"); } @@ -716,7 +601,7 @@ int llama_context::decode(llama_batch & inp_batch) { // reserve output buffer // TODO: move to batch manager? 
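[Editor's aside - not part of the patch] The hunk below only renames the call (llama_output_reserve becomes the reserve_outputs member), but the layout it relies on is worth spelling out: logits and per-token embeddings share one flat host buffer, logits first, embeddings immediately after. A minimal sketch under assumed sizes, with a plain std::vector standing in for the ggml backend buffer:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
        // hypothetical sizes, for illustration only
        const size_t n_vocab       = 32000;
        const size_t n_embd        = 4096;
        const size_t n_outputs_max = 8;

        const bool embeddings   = false;  // cparams.embeddings
        const bool pooling_none = true;   // cparams.pooling_type == LLAMA_POOLING_TYPE_NONE

        // same sizing rule as reserve_outputs(): logits when not in embeddings mode,
        // per-token embeddings only for non-pooled embedding extraction
        const size_t logits_size = !embeddings                  ? n_vocab * n_outputs_max : 0;
        const size_t embd_size   = (embeddings && pooling_none) ? n_embd  * n_outputs_max : 0;

        // one flat float buffer holds both views: logits first, embeddings right after
        std::vector<float> buf_output(logits_size + embd_size);

        float * logits = logits_size ? buf_output.data()               : nullptr;
        float * embd   = embd_size   ? buf_output.data() + logits_size : nullptr;

        std::printf("output buffer: %.2f MiB (logits %zu floats, embd %zu floats)\n",
                    buf_output.size() * sizeof(float) / (1024.0 * 1024.0), logits_size, embd_size);
        (void) logits; (void) embd;
        return 0;
    }

In the patch the buffer is only reallocated when the required size grows, and a host buffer of the output device is preferred so the device-to-host copies of logits and embeddings are faster.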
- if (llama_output_reserve(*this, bman->n_outputs_all) < (size_t) n_outputs_all) { + if (reserve_outputs(bman->n_outputs_all) < (size_t) n_outputs_all) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); return -2; }; @@ -940,7 +825,7 @@ int llama_context::encode(llama_batch & inp_batch) { const llama_ubatch ubatch = sbatch.split_simple(n_tokens); // reserve output buffer - if (llama_output_reserve(*this, n_tokens) < n_tokens) { + if (reserve_outputs(n_tokens) < n_tokens) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); return -2; }; @@ -1555,6 +1440,113 @@ void llama_context::set_inputs(const llama_ubatch & ubatch) { } } +void llama_context::reorder_outputs() { + std::vector & out_ids = sbatch.out_ids; + if (!out_ids.empty()) { + const uint32_t n_vocab = model.vocab.n_tokens(); + const uint32_t n_embd = model.hparams.n_embd; + + GGML_ASSERT((size_t) n_outputs == out_ids.size()); + + // TODO: is there something more efficient which also minimizes swaps? + // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) + for (int32_t i = 0; i < n_outputs - 1; ++i) { + int32_t j_min = i; + for (int32_t j = i + 1; j < n_outputs; ++j) { + if (out_ids[j] < out_ids[j_min]) { + j_min = j; + } + } + if (j_min == i) { continue; } + std::swap(out_ids[i], out_ids[j_min]); + if (logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); + } + } + if (embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); + } + } + } + std::fill(output_ids.begin(), output_ids.end(), -1); + for (int32_t i = 0; i < n_outputs; ++i) { + output_ids[out_ids[i]] = i; + } + out_ids.clear(); + } +} + +size_t llama_context::reserve_outputs(size_t n_outputs) { + const auto & hparams = model.hparams; + const auto & vocab = model.vocab; + + const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); + + const auto n_batch = cparams.n_batch; + const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; + + // TODO: use a per-batch flag for logits presence instead + const bool has_logits = !cparams.embeddings; + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + + logits_size = has_logits ? n_vocab*n_outputs_max : 0; + embd_size = has_embd ? n_embd*n_outputs_max : 0; + + if (output_ids.empty()) { + // init, never resized afterwards + output_ids.resize(n_batch); + } + + const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; + const size_t new_size = (logits_size + embd_size) * sizeof(float); + + // alloc only when more than the current capacity is required + // TODO: also consider shrinking the buffer + if (!buf_output || prev_size < new_size) { + if (buf_output) { +#ifndef NDEBUG + // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) + LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + buf_output = nullptr; + logits = nullptr; + embd = nullptr; + } + + auto * buft = ggml_backend_cpu_buffer_type(); + // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory + auto * output_dev = model.dev_output(); + auto * output_dev_host_buft = output_dev ? 
ggml_backend_dev_host_buffer_type(output_dev) : nullptr; + if (output_dev_host_buft) { + buft = output_dev_host_buft; + } + buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); + if (buf_output == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); + return 0; + } + } + + float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get()); + + logits = has_logits ? output_base : nullptr; + embd = has_embd ? output_base + logits_size : nullptr; + + output_size = n_outputs_max; + + // set all ids as invalid (negative) + std::fill(output_ids.begin(), output_ids.end(), -1); + + ggml_backend_buffer_clear(buf_output.get(), 0); + + n_outputs = 0; + + return n_outputs_max; +} + // do mat_mul, while optionally apply lora ggml_tensor * llama_context::build_lora_mm( ggml_context * ctx0, @@ -2827,8 +2819,7 @@ float * llama_get_logits(struct llama_context * ctx) { llama_synchronize(ctx); // reorder logits for backward compatibility - // TODO: maybe deprecate this - llama_output_reorder(*ctx); + ctx->reorder_outputs(); return ctx->logits; } @@ -2877,8 +2868,7 @@ float * llama_get_embeddings(struct llama_context * ctx) { llama_synchronize(ctx); // reorder embeddings for backward compatibility - // TODO: maybe deprecate this - llama_output_reorder(*ctx); + ctx->reorder_outputs(); return ctx->embd; } @@ -3187,7 +3177,7 @@ struct llama_data_write { //} void write_output_ids(struct llama_context * ctx) { - llama_output_reorder(*ctx); + ctx->reorder_outputs(); const uint32_t n_outputs = ctx->n_outputs; @@ -3281,7 +3271,7 @@ struct llama_data_read { uint32_t n_outputs; read_to(&n_outputs, sizeof(n_outputs)); - if (n_outputs > llama_output_reserve(*ctx, n_outputs)) { + if (n_outputs > ctx->reserve_outputs(n_outputs)) { throw std::runtime_error("could not reserve outputs"); } diff --git a/src/llama-context.h b/src/llama-context.h index f6d63eb3c..8f22fd3b1 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -114,6 +114,14 @@ struct llama_context { void set_inputs(const llama_ubatch & ubatch); + // make the outputs have the same order they had in the user-provided batch + // TODO: maybe deprecate this + void reorder_outputs(); + + // Make sure enough space is available for outputs. + // Returns max number of outputs for which space was reserved. + size_t reserve_outputs(size_t n_outputs); + ggml_tensor * build_lora_mm( ggml_context * ctx0, ggml_tensor * w,