diff --git a/src/llama.cpp b/src/llama.cpp index 1813dd29b..6411598ff 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -10030,7 +10030,7 @@ struct llm_build_context { llama_context & lctx; const llama_hparams & hparams; const llama_cparams & cparams; - const llama_ubatch & batch; + const llama_ubatch & ubatch; const llama_kv_cache & kv_self; const int64_t n_embd; @@ -10076,14 +10076,14 @@ struct llm_build_context { // TODO: consider making the entire interface noexcept llm_build_context( llama_context & lctx, - const llama_ubatch & batch, + const llama_ubatch & ubatch, const llm_build_cb & cb, bool worst_case) : model (lctx.model), lctx (lctx), hparams (model.hparams), cparams (lctx.cparams), - batch (batch), + ubatch (ubatch), kv_self (lctx.kv_self), n_embd (hparams.n_embd), n_layer (hparams.n_layer), @@ -10105,7 +10105,7 @@ struct llm_build_context { beta_slow (cparams.yarn_beta_slow), norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), - n_tokens (batch.n_tokens), + n_tokens (ubatch.n_tokens), n_kv (worst_case ? kv_self.size : kv_self.n), n_outputs (worst_case ? n_tokens : lctx.n_outputs), n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd), @@ -10474,7 +10474,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -10634,7 +10634,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr; @@ -10749,7 +10749,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -10853,7 +10853,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -10975,7 +10975,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // multiply by embedding_multiplier_scale of 78.38367176906169 inpL = ggml_scale(ctx0, inpL, 78.38367176906169f); @@ -11133,7 +11133,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -11255,7 +11255,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -11358,7 +11358,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); @@ -11460,7 +11460,7 @@ struct llm_build_context { } // construct input embeddings (token, type, position) - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // token types are hardcoded to zero ("Sentence A") struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); @@ -11647,7 +11647,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); @@ -11749,7 +11749,7 @@ struct llm_build_context { struct ggml_tensor * pos; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); @@ -11887,7 +11887,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -12037,7 +12037,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -12150,7 +12150,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -12265,7 +12265,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -12410,7 +12410,7 @@ struct llm_build_context { struct ggml_tensor * ffn_output; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -12529,7 +12529,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -12657,7 +12657,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -12762,7 +12762,7 @@ struct llm_build_context { struct ggml_tensor * pos; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -12867,7 +12867,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -12977,7 +12977,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -13095,7 +13095,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -13222,7 +13222,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // scale the input embeddings inpL = ggml_scale(ctx0, inpL, scale_embd); @@ -13366,7 +13366,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // scale the input embeddings inpL = ggml_scale(ctx0, inpL, scale_embd); @@ -13567,7 +13567,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); cb(inpL, "inp_scaled", -1); @@ -13675,7 +13675,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); cb(inpL, "inp_scaled", -1); @@ -13813,7 +13813,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -13929,7 +13929,7 @@ struct llm_build_context { struct ggml_tensor * inpL; // {n_embd, n_tokens} - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); struct ggml_tensor * state_copy = build_inp_s_copy(); struct ggml_tensor * state_mask = build_inp_s_mask(); @@ -13941,7 +13941,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "attn_norm", il); - cur = llm_build_mamba(ctx0, lctx, batch, gf, cur, + cur = llm_build_mamba(ctx0, lctx, ubatch, gf, cur, state_copy, state_mask, kv_head, n_kv, cb, il); @@ -13987,7 +13987,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -14144,7 +14144,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -14272,7 +14272,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -14391,7 +14391,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -14518,7 +14518,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -14663,7 +14663,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -14804,7 +14804,7 @@ struct llm_build_context { struct ggml_tensor * inpL; // {n_embd, n_tokens} - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -15019,7 +15019,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -15173,7 +15173,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); GGML_ASSERT(lctx.is_encoding); struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false); @@ -15305,7 +15305,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); GGML_ASSERT(!lctx.is_encoding); GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); @@ -15507,7 +15507,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); @@ -15599,7 +15599,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -15713,7 +15713,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -15837,7 +15837,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -15957,11 +15957,11 @@ struct llm_build_context { // Token shift state dimensions should be 2 * n_emb GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); - const int64_t n_seqs = batch.n_seqs; - const int64_t n_seq_tokens = batch.n_seq_tokens; - const int64_t n_tokens = batch.n_tokens; + const int64_t n_seqs = ubatch.n_seqs; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_tokens = ubatch.n_tokens; GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(batch.equal_seqs); + GGML_ASSERT(ubatch.equal_seqs); GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); struct ggml_tensor * cur; @@ -15969,7 +15969,7 @@ struct llm_build_context { struct ggml_tensor * state_copy = build_inp_s_copy(); struct ggml_tensor * state_mask = build_inp_s_mask(); - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); for (int il = 0; il < n_layer; ++il) { @@ -16083,7 +16083,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); @@ -16279,7 +16279,7 @@ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { static struct ggml_cgraph * llama_build_graph( llama_context & lctx, - const llama_ubatch & batch, + const llama_ubatch & ubatch, bool worst_case) { const auto & model = lctx.model; @@ -16301,7 +16301,7 @@ static struct ggml_cgraph * llama_build_graph( // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer; - if (batch.n_tokens < 32 || full_offload) { + if (ubatch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { for (auto * backend : lctx.backends) { if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) && @@ -16316,7 +16316,7 @@ static struct ggml_cgraph * llama_build_graph( struct ggml_cgraph * result = NULL; - struct llm_build_context llm(lctx, batch, cb, worst_case); + struct llm_build_context llm(lctx, ubatch, cb, worst_case); llm.init(); @@ -16567,7 +16567,7 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } -static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { +static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { // // set input data // @@ -16576,28 +16576,28 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { const auto & cparams = lctx.cparams; const auto & kv_self = lctx.kv_self; - if (batch.token) { - const int64_t n_tokens = batch.n_tokens; + if (ubatch.token) { + const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens)); + ggml_backend_tensor_set(lctx.inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens)); } - if (batch.embd) { + if (ubatch.embd) { const int64_t n_embd = hparams.n_embd; - const int64_t n_tokens = batch.n_tokens; + const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd)); + ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd)); } - if (batch.pos && lctx.inp_pos) { - const int64_t n_tokens = batch.n_tokens; + if (ubatch.pos && lctx.inp_pos) { + const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos)); + ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos)); } if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs"); - const int64_t n_tokens = batch.n_tokens; + const int64_t n_tokens = ubatch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer)); int32_t * data = (int32_t *) lctx.inp_out_ids->data; @@ -16606,10 +16606,10 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { for (int i = 0; i < n_tokens; ++i) { data[i] = i; } - } else if (batch.output) { + } else if (ubatch.output) { int32_t n_outputs = 0; for (int i = 0; i < n_tokens; ++i) { - if (batch.output[i]) { + if (ubatch.output[i]) { data[n_outputs++] = i; } } @@ -16634,9 +16634,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. if (cparams.causal_attn && !lctx.is_encoding) { const int64_t n_kv = kv_self.n; - const int64_t n_tokens = batch.n_tokens; - const int64_t n_seq_tokens = batch.n_seq_tokens; - const int64_t n_seqs = batch.n_seqs; + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; float * data = nullptr; @@ -16653,14 +16653,14 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { } // For causal attention, use only the previous KV cells - // of the correct sequence for each token of the batch. + // of the correct sequence for each token of the ubatch. // It's assumed that if a token in the batch has multiple sequences, they are equivalent. for (int h = 0; h < 1; ++h) { for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = batch.seq_id[s][0]; + const llama_seq_id seq_id = ubatch.seq_id[s][0]; for (int j = 0; j < n_seq_tokens; ++j) { - const llama_pos pos = batch.pos[s*n_seq_tokens + j]; + const llama_pos pos = ubatch.pos[s*n_seq_tokens + j]; for (int i = 0; i < n_kv; ++i) { float f; @@ -16706,9 +16706,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { } } } else { - const int64_t n_tokens = batch.n_tokens; - const int64_t n_seq_tokens = batch.n_seq_tokens; - const int64_t n_seqs = batch.n_seqs; + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; // when using kv cache, the mask needs to match the kv cache size const int64_t n_stride = hparams.causal_attn && !lctx.is_encoding ? kv_self.n : n_tokens; @@ -16718,7 +16718,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { for (int h = 0; h < 1; ++h) { for (int s1 = 0; s1 < n_seqs; ++s1) { - const llama_seq_id seq_id = batch.seq_id[s1][0]; + const llama_seq_id seq_id = ubatch.seq_id[s1][0]; for (int j = 0; j < n_seq_tokens; ++j) { const int32_t tj = s1*n_seq_tokens + j; @@ -16728,10 +16728,10 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { const int32_t ti = s0*n_seq_tokens + i; float f = -INFINITY; - for (int s = 0; s < batch.n_seq_id[s0]; ++s) { - if (batch.seq_id[s0][s] == seq_id) { + for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) { + if (ubatch.seq_id[s0][s] == seq_id) { if (hparams.use_alibi) { - f = -std::abs(batch.pos[ti] - batch.pos[tj]); + f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]); } else { f = 0.0f; } @@ -16753,9 +16753,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { } if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { - const int64_t n_tokens = batch.n_tokens; - const int64_t n_seq_tokens = batch.n_seq_tokens; - const int64_t n_seqs = batch.n_seqs; + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; GGML_ASSERT(lctx.inp_mean); GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); @@ -16766,12 +16766,12 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { std::vector sum(n_tokens, 0); for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = batch.seq_id[s][0]; + const llama_seq_id seq_id = ubatch.seq_id[s][0]; - // TODO: adapt limits to n_seqs when batch.equal_seqs is true + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); - sum[seq_id] += batch.n_seq_tokens; + sum[seq_id] += ubatch.n_seq_tokens; } std::vector div(n_tokens, 0.0f); @@ -16783,7 +16783,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { } for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = batch.seq_id[s][0]; + const llama_seq_id seq_id = ubatch.seq_id[s][0]; for (int i = 0; i < n_seq_tokens; ++i) { data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; @@ -16794,9 +16794,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { if (cparams.embeddings && ( cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { - const int64_t n_tokens = batch.n_tokens; - const int64_t n_seq_tokens = batch.n_seq_tokens; - const int64_t n_seqs = batch.n_seqs; + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; GGML_ASSERT(lctx.inp_cls); GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); @@ -16805,13 +16805,13 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls)); for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = batch.seq_id[s][0]; + const llama_seq_id seq_id = ubatch.seq_id[s][0]; - // TODO: adapt limits to n_seqs when batch.equal_seqs is true + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = batch.pos[s*n_seq_tokens + i]; + const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; if (pos == 0) { data[seq_id] = s*n_seq_tokens + i; @@ -16821,9 +16821,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { } if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { - const int64_t n_tokens = batch.n_tokens; - const int64_t n_seq_tokens = batch.n_seq_tokens; - const int64_t n_seqs = batch.n_seqs; + const int64_t n_tokens = ubatch.n_tokens; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; GGML_ASSERT(lctx.inp_cls); GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); @@ -16835,13 +16835,13 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { std::vector last_row(n_tokens, -1); for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = batch.seq_id[s][0]; + const llama_seq_id seq_id = ubatch.seq_id[s][0]; - // TODO: adapt limits to n_seqs when batch.equal_seqs is true + // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = batch.pos[s*n_seq_tokens + i]; + const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; if (pos >= last_pos[seq_id]) { last_pos[seq_id] = pos; @@ -16903,10 +16903,10 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { } if (lctx.inp_pos_bucket) { - const int64_t n_tokens = batch.n_tokens; + const int64_t n_tokens = ubatch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_pos_bucket->buffer)); - GGML_ASSERT(!batch.equal_seqs); // TODO: use batch.n_seqs instead of failing + GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing int32_t * data = (int32_t *) lctx.inp_pos_bucket->data; @@ -16915,7 +16915,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_kv; ++i) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, batch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding); + data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding); } } } @@ -16923,7 +16923,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_tokens; ++i) { - data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(batch.pos[i], batch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding); + data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding); } } } @@ -16939,10 +16939,10 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { if (!lctx.is_encoding && lctx.inp_KQ_mask_cross) { const int64_t n_output_enc = lctx.embd_enc.size() / hparams.n_embd; - const int64_t n_tokens = batch.n_tokens; + const int64_t n_tokens = ubatch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_cross->buffer)); - GGML_ASSERT(!batch.equal_seqs); // TODO: use batch.n_seqs instead of failing + GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing float * data = (float *) lctx.inp_KQ_mask_cross->data; @@ -16950,8 +16950,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_output_enc; ++i) { float f = -INFINITY; - for (int s = 0; s < batch.n_seq_id[j]; ++s) { - const llama_seq_id seq_id = batch.seq_id[j][s]; + for (int s = 0; s < ubatch.n_seq_id[j]; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[j][s]; if (lctx.seq_ids_enc[i].find(seq_id) != lctx.seq_ids_enc[i].end()) { f = 0.0f; }