llama : fix Gemma-2 Query scaling factors (#8473)
* 9B - query_pre_attn_scalar = 256 not 224
See 03e657582d
Gemma 9b should use 256 and not 224 (self.config.hidden_size // self.config.num_attention_heads)
* llama : fix Gemma-2 Query scaling factor
ggml-ci
---------
Co-authored-by: Daniel Han <danielhanchen@gmail.com>
			
			
This commit is contained in:
		
							parent
							
								
									e236528e76
								
							
						
					
					
						commit
						73cf442e7b
					
				
					 2 changed files with 6 additions and 6 deletions
				
			
		|  | @ -11680,7 +11680,12 @@ struct llm_build_context { | |||
|                         ext_factor, attn_factor, beta_fast, beta_slow); | ||||
|                 cb(Qcur, "Qcur", il); | ||||
| 
 | ||||
|                 Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); | ||||
|                 // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
 | ||||
|                 switch (model.type) { | ||||
|                     case e_model::MODEL_9B:  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));   break; | ||||
|                     case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break; | ||||
|                     default: GGML_ASSERT(false); | ||||
|                 }; | ||||
|                 cb(Qcur, "Qcur_scaled", il); | ||||
| 
 | ||||
|                 Kcur = ggml_rope_ext( | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue