ggml : add ALiBi support for ggml_soft_max_ext (#5488)
* ggml : avoid recomputing ALiBi slopes (CPU) * llama : reuse hparams.f_max_alibi_bias in all cases ggml-ci * ggml : support ALiBi bias in ggml_soft_max_ext (CPU + Metal) ggml-ci * ggml : handle all SRCs (do not break on first null) ggml-ci * tests : do not use slope for large soft_max (accumulates too much error) ggml-ci * ggml : alternative ALiBi without extra tensor (we compute the slopes in the kernel) ggml-ci * cuda : add ALiBi support in ggml_soft_max_ext ggml-ci * ggml : deprecate ggml_alibi * ggml : support multi-sequence ALiBi (Metal) ggml-ci * cuda : add multi-seq ALiBi + remove F16 soft_max ggml-ci * ggml : update deprecation message * ggml : fix pos ptr when no ALiBi ggml-ci * cuda : fix performance (pow -> powf) * cuda : precompute ALiBi constants * metal : pre-compute ALiBi slopes ggml-ci * llama : init kq_pos only if needed ggml-ci * test-backend-ops : add null pos test to soft_max; test-backend-ops : replace soft_max tests ggml-ci --------- Co-authored-by: slaren <slarengh@gmail.com>
This commit is contained in:
		
							parent
							
								
									6e4e973b26
								
							
						
					
					
						commit
						8f1be0d42f
					
				
					 9 changed files with 348 additions and 357 deletions
				
			
		
							
								
								
									
										133
									
								
								llama.cpp
									
										
									
									
									
								
							
							
						
						
									
										133
									
								
								llama.cpp
									
										
									
									
									
								
							|  | @ -1557,12 +1557,13 @@ struct llama_hparams { | |||
|     uint32_t n_yarn_orig_ctx; | ||||
|     int32_t  rope_scaling_type_train; | ||||
| 
 | ||||
|     float f_clamp_kqv; | ||||
|     float f_max_alibi_bias; | ||||
|     float f_clamp_kqv      = 0.0f; | ||||
|     float f_max_alibi_bias = 0.0f; | ||||
| 
 | ||||
|     bool causal_attn = true; | ||||
|     uint32_t pooling_type = LLAMA_POOLING_NONE; | ||||
|     bool need_kq_pos = false; | ||||
| 
 | ||||
|     uint32_t pooling_type = LLAMA_POOLING_NONE; | ||||
| 
 | ||||
|     bool operator!=(const llama_hparams & other) const { | ||||
|         if (this->vocab_only    != other.vocab_only)    return true; | ||||
|  | @ -1923,6 +1924,7 @@ struct llama_context { | |||
|     struct ggml_tensor * inp_embd;      // F32 [n_embd, n_batch]
 | ||||
|     struct ggml_tensor * inp_pos;       // I32 [n_batch]
 | ||||
|     struct ggml_tensor * inp_KQ_mask;   // F32 [n_ctx, n_batch]
 | ||||
|     struct ggml_tensor * inp_KQ_pos;    // F32 [n_ctx]
 | ||||
|     struct ggml_tensor * inp_K_shift;   // I32 [n_ctx]
 | ||||
|     struct ggml_tensor * inp_mean;      // F32 [n_batch, n_batch]
 | ||||
|     struct ggml_tensor * inp_cls;       // I32 [n_batch]
 | ||||
|  | @ -3054,6 +3056,11 @@ static void llm_load_hparams( | |||
|                     case 40: model.type = e_model::MODEL_13B; break; | ||||
|                     default: model.type = e_model::MODEL_UNKNOWN; | ||||
|                 } | ||||
| 
 | ||||
|                 if (model.type == e_model::MODEL_13B) { | ||||
|                     // TODO: become GGUF KV parameter
 | ||||
|                     hparams.f_max_alibi_bias = 8.0f; | ||||
|                 } | ||||
|             } break; | ||||
|         case LLM_ARCH_STARCODER: | ||||
|             { | ||||
|  | @ -3081,6 +3088,9 @@ static void llm_load_hparams( | |||
|                     case 32: model.type = e_model::MODEL_1B; break; | ||||
|                     default: model.type = e_model::MODEL_UNKNOWN; | ||||
|                 } | ||||
| 
 | ||||
|                 // TODO: become GGUF KV parameter
 | ||||
|                 hparams.f_max_alibi_bias = 8.0f; | ||||
|             } break; | ||||
|         case LLM_ARCH_BERT: | ||||
|             { | ||||
|  | @ -3126,11 +3136,12 @@ static void llm_load_hparams( | |||
|                             case 4096: model.type = e_model::MODEL_7B; break; | ||||
|                         } break; | ||||
|                 } | ||||
| 
 | ||||
|                 // TODO: become GGUF KV parameter
 | ||||
|                 hparams.f_max_alibi_bias = 8.0f; | ||||
|             } break; | ||||
|         case LLM_ARCH_MPT: | ||||
|             { | ||||
|                 hparams.f_clamp_kqv = 0.0f; | ||||
| 
 | ||||
|                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps); | ||||
|                 ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false); | ||||
|                 ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias); | ||||
|  | @ -3232,6 +3243,10 @@ static void llm_load_hparams( | |||
|     } | ||||
| 
 | ||||
|     model.ftype = ml.ftype; | ||||
| 
 | ||||
|     if (hparams.f_max_alibi_bias > 0.0f) { | ||||
|         hparams.need_kq_pos = true; | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| // TODO: This should probably be in llama.h
 | ||||
|  | @ -4774,10 +4789,10 @@ static struct ggml_tensor * llm_build_kqv( | |||
|          struct ggml_tensor * wo_b, | ||||
|          struct ggml_tensor * q_cur, | ||||
|          struct ggml_tensor * kq_mask, | ||||
|          struct ggml_tensor * kq_pos, | ||||
|                     int64_t   n_ctx, | ||||
|                     int32_t   n_tokens, | ||||
|                     int32_t   n_kv, | ||||
|                     float     max_alibi_bias, | ||||
|                     float     kq_scale, | ||||
|          const llm_build_cb & cb, | ||||
|                     int       il) { | ||||
|  | @ -4807,26 +4822,26 @@ static struct ggml_tensor * llm_build_kqv( | |||
|         ggml_mul_mat_set_prec(kq, GGML_PREC_F32); | ||||
|     } | ||||
| 
 | ||||
|     if (max_alibi_bias > 0.0f) { | ||||
|         // temporary branch until we figure out how to handle ggml_alibi through ggml_add
 | ||||
| #if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL) | ||||
| #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL") | ||||
| #pragma message("      Falling back to ggml_alibi(). Will become an error in Mar 2024") | ||||
| #pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/5488")
 | ||||
|     if (hparams.f_max_alibi_bias > 0.0f) { | ||||
|         kq = ggml_scale(ctx, kq, kq_scale); | ||||
|         cb(kq, "kq_scaled", il); | ||||
| 
 | ||||
|         if (max_alibi_bias > 0.0f) { | ||||
|             // TODO: n_head or n_head_kv
 | ||||
|             // TODO: K-shift is likely not working
 | ||||
|             // TODO: change to ggml_add
 | ||||
|             kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias); | ||||
|             cb(kq, "kq_scaled_alibi", il); | ||||
|         } | ||||
|         kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias); | ||||
|         cb(kq, "kq_scaled_alibi", il); | ||||
| 
 | ||||
|         kq = ggml_add(ctx, kq, kq_mask); | ||||
|         cb(kq, "kq_masked", il); | ||||
| 
 | ||||
|         kq = ggml_soft_max(ctx, kq); | ||||
|         cb(kq, "kq_soft_max", il); | ||||
|     } else { | ||||
|         kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale); | ||||
|     } else | ||||
| #endif | ||||
|     { | ||||
|         kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias); | ||||
|         cb(kq, "kq_soft_max_ext", il); | ||||
|     } | ||||
| 
 | ||||
|  | @ -4874,11 +4889,11 @@ static struct ggml_tensor * llm_build_kv( | |||
|          struct ggml_tensor * v_cur, | ||||
|          struct ggml_tensor * q_cur, | ||||
|          struct ggml_tensor * kq_mask, | ||||
|          struct ggml_tensor * kq_pos, | ||||
|                     int64_t   n_ctx, | ||||
|                     int32_t   n_tokens, | ||||
|                     int32_t   kv_head, | ||||
|                     int32_t   n_kv, | ||||
|                     float     max_alibi_bias, | ||||
|                     float     kq_scale, | ||||
|          const llm_build_cb & cb, | ||||
|                     int       il) { | ||||
|  | @ -4892,9 +4907,8 @@ static struct ggml_tensor * llm_build_kv( | |||
|     llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il); | ||||
| 
 | ||||
|     struct ggml_tensor * cur; | ||||
|     cur  = llm_build_kqv(ctx, model, hparams, kv, graph, | ||||
|             wo, wo_b, | ||||
|             q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il); | ||||
|     cur  = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b, | ||||
|             q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il); | ||||
|     cb(cur, "kqv_out", il); | ||||
| 
 | ||||
|     return cur; | ||||
|  | @ -5062,7 +5076,7 @@ struct llm_build_context { | |||
|                 } | ||||
| 
 | ||||
|                 Qcur = ggml_rope_custom( | ||||
|                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos, | ||||
|                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, | ||||
|                     hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, | ||||
|                     ext_factor, attn_factor, beta_fast, beta_slow | ||||
|                 ); | ||||
|  | @ -5077,7 +5091,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -5207,6 +5221,10 @@ struct llm_build_context { | |||
|         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); | ||||
|         cb(KQ_mask, "KQ_mask", -1); | ||||
| 
 | ||||
|         // positions of the tokens in the KV cache
 | ||||
|         struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); | ||||
|         cb(KQ_pos, "KQ_pos", -1); | ||||
| 
 | ||||
|         // shift the entire K-cache if needed
 | ||||
|         if (do_rope_shift) { | ||||
|             llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); | ||||
|  | @ -5255,12 +5273,9 @@ struct llm_build_context { | |||
|                 cb(Kcur, "Kcur", il); | ||||
| 
 | ||||
| 
 | ||||
|                 // apply ALiBi for 13B model
 | ||||
|                 const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f; | ||||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, NULL, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -5384,7 +5399,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, NULL, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -5483,7 +5498,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -5688,7 +5703,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -5750,6 +5765,10 @@ struct llm_build_context { | |||
|         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); | ||||
|         cb(KQ_mask, "KQ_mask", -1); | ||||
| 
 | ||||
|         // positions of the tokens in the KV cache
 | ||||
|         struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); | ||||
|         cb(KQ_pos, "KQ_pos", -1); | ||||
| 
 | ||||
|         for (int il = 0; il < n_layer; ++il) { | ||||
|             struct ggml_tensor * inpSA = inpL; | ||||
| 
 | ||||
|  | @ -5777,7 +5796,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, NULL, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -5878,7 +5897,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } else { | ||||
|                 // compute Q and K and RoPE them
 | ||||
|  | @ -5909,7 +5928,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -5985,6 +6004,10 @@ struct llm_build_context { | |||
|         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); | ||||
|         cb(KQ_mask, "KQ_mask", -1); | ||||
| 
 | ||||
|         // positions of the tokens in the KV cache
 | ||||
|         struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); | ||||
|         cb(KQ_pos, "KQ_pos", -1); | ||||
| 
 | ||||
|         inpL = llm_build_norm(ctx0, inpL, hparams, | ||||
|                 model.tok_norm, | ||||
|                 model.tok_norm_b, | ||||
|  | @ -6018,7 +6041,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -6078,6 +6101,10 @@ struct llm_build_context { | |||
|         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); | ||||
|         cb(KQ_mask, "KQ_mask", -1); | ||||
| 
 | ||||
|         // positions of the tokens in the KV cache
 | ||||
|         struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); | ||||
|         cb(KQ_pos, "KQ_pos", -1); | ||||
| 
 | ||||
|         for (int il = 0; il < n_layer; ++il) { | ||||
|             struct ggml_tensor * attn_norm; | ||||
| 
 | ||||
|  | @ -6111,7 +6138,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, NULL, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -6233,7 +6260,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, NULL, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -6348,7 +6375,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, NULL, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -6469,7 +6496,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -6596,7 +6623,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -6699,7 +6726,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, NULL, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
|             struct ggml_tensor * sa_out = cur; | ||||
|  | @ -6798,7 +6825,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -6907,7 +6934,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -7025,7 +7052,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, NULL, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -7144,7 +7171,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -7276,7 +7303,7 @@ struct llm_build_context { | |||
| 
 | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
| 
 | ||||
|  | @ -7507,6 +7534,18 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { | |||
|         } | ||||
|     } | ||||
| 
 | ||||
|     if (hparams.need_kq_pos) { | ||||
|         const int64_t n_kv = kv_self.n; | ||||
| 
 | ||||
|         assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer)); | ||||
| 
 | ||||
|         float * data = (float *) lctx.inp_KQ_pos->data; | ||||
| 
 | ||||
|         for (int i = 0; i < n_kv; ++i) { | ||||
|             data[i] = float(lctx.kv_self.cells[i].pos); | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     if (kv_self.has_shift) { | ||||
|         const int64_t n_ctx = cparams.n_ctx; | ||||
| 
 | ||||
|  | @ -11434,7 +11473,7 @@ struct llama_context * llama_new_context_with_model( | |||
|         // graph inputs
 | ||||
|         { | ||||
|             ggml_init_params init_params = { | ||||
|                 /* .mem_size   */ ggml_tensor_overhead()*7, | ||||
|                 /* .mem_size   */ ggml_tensor_overhead()*8, | ||||
|                 /* .mem_buffer */ nullptr, | ||||
|                 /* .no_alloc   */ true, | ||||
|             }; | ||||
|  | @ -11444,6 +11483,7 @@ struct llama_context * llama_new_context_with_model( | |||
|             ctx->inp_embd    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch); | ||||
|             ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); | ||||
|             ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch); | ||||
|             ctx->inp_KQ_pos  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx); | ||||
|             ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx); | ||||
|             ctx->inp_mean    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch); | ||||
|             ctx->inp_cls     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); | ||||
|  | @ -11452,6 +11492,7 @@ struct llama_context * llama_new_context_with_model( | |||
|             ggml_set_name(ctx->inp_embd,    "inp_embd"); | ||||
|             ggml_set_name(ctx->inp_pos,     "inp_pos"); | ||||
|             ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask"); | ||||
|             ggml_set_name(ctx->inp_KQ_pos,  "inp_KQ_pos"); | ||||
|             ggml_set_name(ctx->inp_K_shift, "inp_K_shift"); | ||||
|             ggml_set_name(ctx->inp_mean,    "inp_mean"); | ||||
|             ggml_set_name(ctx->inp_cls,     "inp_cls"); | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue