llama : allow gguf RoPE keys to be overridden with defaults (#3240)
This commit is contained in:
		
							parent
							
								
									65c2c1c5ab
								
							
						
					
					
						commit
						a5661d7e71
					
				
					 3 changed files with 27 additions and 39 deletions
				
			
		|  | @ -647,9 +647,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { | ||||||
|     printf("  --cfg-negative-prompt-file FNAME\n"); |     printf("  --cfg-negative-prompt-file FNAME\n"); | ||||||
|     printf("                        negative prompt file to use for guidance. (default: empty)\n"); |     printf("                        negative prompt file to use for guidance. (default: empty)\n"); | ||||||
|     printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); |     printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); | ||||||
|     printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale); |     printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale\n"); | ||||||
|     printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base); |     printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); | ||||||
|     printf("  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale); |     printf("  --rope-freq-scale N   RoPE frequency linear scaling factor (default: loaded from model)\n"); | ||||||
|     printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); |     printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); | ||||||
|     printf("  --no-penalize-nl      do not penalize newline token\n"); |     printf("  --no-penalize-nl      do not penalize newline token\n"); | ||||||
|     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n"); |     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n"); | ||||||
|  |  | ||||||
|  | @ -701,8 +701,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params, | ||||||
|     printf("  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); |     printf("  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); | ||||||
|     printf("  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads); |     printf("  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads); | ||||||
|     printf("  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx); |     printf("  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx); | ||||||
|     printf("  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base); |     printf("  --rope-freq-base N    RoPE base frequency (default: loaded from model)\n"); | ||||||
|     printf("  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); |     printf("  --rope-freq-scale N   RoPE frequency scaling factor (default: loaded from model)\n"); | ||||||
|     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch); |     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch); | ||||||
|     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n"); |     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n"); | ||||||
|     printf("                        not recommended: doubles context memory required and no measurable increase in quality\n"); |     printf("                        not recommended: doubles context memory required and no measurable increase in quality\n"); | ||||||
|  |  | ||||||
							
								
								
									
										54
									
								
								llama.cpp
									
										
									
									
									
								
							
							
						
						
									
										54
									
								
								llama.cpp
									
										
									
									
									
								
							|  | @ -929,23 +929,22 @@ static const size_t kB = 1024; | ||||||
| static const size_t MB = kB*kB; | static const size_t MB = kB*kB; | ||||||
| static const size_t GB = kB*kB*kB; | static const size_t GB = kB*kB*kB; | ||||||
| 
 | 
 | ||||||
| // default hparams (LLaMA 7B)
 |  | ||||||
| struct llama_hparams { | struct llama_hparams { | ||||||
|     uint32_t n_vocab     = 32000; |     uint32_t n_vocab; | ||||||
|     uint32_t n_ctx_train = 2048;  // the context size used during training
 |     uint32_t n_ctx_train; // context size the model was trained on
 | ||||||
|     uint32_t n_ctx       = 512;   // the context size used during inference
 |     uint32_t n_ctx;       // context size used during inference
 | ||||||
|     uint32_t n_embd      = 4096; |     uint32_t n_embd; | ||||||
|     uint32_t n_head      = 32; |     uint32_t n_head; | ||||||
|     uint32_t n_head_kv   = 32; |     uint32_t n_head_kv; | ||||||
|     uint32_t n_layer     = 32; |     uint32_t n_layer; | ||||||
|     uint32_t n_rot       = 64; |     uint32_t n_rot; | ||||||
|     uint32_t n_ff        = 11008; |     uint32_t n_ff; | ||||||
| 
 | 
 | ||||||
|     float f_norm_eps     = 1e-5; |     float f_norm_eps; | ||||||
|     float f_norm_rms_eps = 1e-5; |     float f_norm_rms_eps; | ||||||
| 
 | 
 | ||||||
|     float rope_freq_base  = 10000.0f; |     float rope_freq_base; | ||||||
|     float rope_freq_scale = 1.0f; |     float rope_freq_scale; | ||||||
| 
 | 
 | ||||||
|     bool operator!=(const llama_hparams & other) const { |     bool operator!=(const llama_hparams & other) const { | ||||||
|         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
 |         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
 | ||||||
|  | @ -1076,7 +1075,7 @@ struct llama_model { | ||||||
| 
 | 
 | ||||||
|     std::string name = "n/a"; |     std::string name = "n/a"; | ||||||
| 
 | 
 | ||||||
|     llama_hparams hparams; |     llama_hparams hparams = {}; | ||||||
|     llama_vocab   vocab; |     llama_vocab   vocab; | ||||||
| 
 | 
 | ||||||
|     struct ggml_tensor * tok_embeddings; |     struct ggml_tensor * tok_embeddings; | ||||||
|  | @ -1674,29 +1673,18 @@ static void llm_load_hparams( | ||||||
|     hparams.n_head_kv = hparams.n_head; |     hparams.n_head_kv = hparams.n_head; | ||||||
|     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); |     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); | ||||||
| 
 | 
 | ||||||
|     // TODO: manually setting rope freq base and scale should override this
 |     // rope_freq_base (optional)
 | ||||||
|     // FIXME: partial fix when the param specified is not the default value, but
 |     if (rope_freq_base == 0.0f) { | ||||||
|     //        will not work for overriding the model value to the params default
 |         rope_freq_base = 10000.0f; | ||||||
| 
 |         GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); | ||||||
|     llama_context_params defaults = llama_context_default_params(); |  | ||||||
| 
 |  | ||||||
|     // rope_freq_base
 |  | ||||||
|     { |  | ||||||
|         float ropebase = 10000.0f; |  | ||||||
|         GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); |  | ||||||
|         if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) { |  | ||||||
|             rope_freq_base = ropebase; |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     // rope_freq_scale (inverse of the kv) is optional
 |     // rope_freq_scale (inverse of the kv) is optional
 | ||||||
|     { |     if (rope_freq_scale == 0.0f) { | ||||||
|         float ropescale = 1.0f; |         float ropescale = 1.0f; | ||||||
|         GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); |         GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); | ||||||
|         if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) { |  | ||||||
|         rope_freq_scale = 1.0f/ropescale; |         rope_freq_scale = 1.0f/ropescale; | ||||||
|     } |     } | ||||||
|     } |  | ||||||
| 
 | 
 | ||||||
|     // sanity check for n_rot (optional)
 |     // sanity check for n_rot (optional)
 | ||||||
|     { |     { | ||||||
|  | @ -6188,8 +6176,8 @@ struct llama_context_params llama_context_default_params() { | ||||||
|         /*.n_gpu_layers                =*/ 0, |         /*.n_gpu_layers                =*/ 0, | ||||||
|         /*.main_gpu                    =*/ 0, |         /*.main_gpu                    =*/ 0, | ||||||
|         /*.tensor_split                =*/ nullptr, |         /*.tensor_split                =*/ nullptr, | ||||||
|         /*.rope_freq_base              =*/ 10000.0f, |         /*.rope_freq_base              =*/ 0.0f, | ||||||
|         /*.rope_freq_scale             =*/ 1.0f, |         /*.rope_freq_scale             =*/ 0.0f, | ||||||
|         /*.progress_callback           =*/ nullptr, |         /*.progress_callback           =*/ nullptr, | ||||||
|         /*.progress_callback_user_data =*/ nullptr, |         /*.progress_callback_user_data =*/ nullptr, | ||||||
|         /*.low_vram                    =*/ false, |         /*.low_vram                    =*/ false, | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue