sampling : refactor init to use llama_sampling_params (#3696)
* sampling : refactor init to use llama_sampling_params
* llama : combine repetition, frequency and presence penalties in 1 call
* examples : remove embd-input and gptneox-wip
* sampling : rename penalty params + reduce size of "prev" vector
* sampling : add llama_sampling_print helper
* sampling : hide prev behind API and apply #3661

ggml-ci
This commit is contained in:
		
							parent
							
								
									8cf19d60dc
								
							
						
					
					
						commit
						d1031cf49c
					
				
					 30 changed files with 365 additions and 4502 deletions
				
			
		
							
								
								
									
										94
									
								
								llama.cpp
									
										
									
									
									
								
							
							
						
						
									
										94
									
								
								llama.cpp
									
										
									
									
									
								
							|  | @ -1018,8 +1018,8 @@ enum e_model { | |||
| }; | ||||
| 
 | ||||
| static const size_t kB = 1024; | ||||
| static const size_t MB = kB*kB; | ||||
| static const size_t GB = kB*kB*kB; | ||||
| static const size_t MB = 1024*kB; | ||||
| static const size_t GB = 1024*MB; | ||||
| 
 | ||||
| struct llama_hparams { | ||||
|     bool     vocab_only; | ||||
|  | @ -1042,21 +1042,21 @@ struct llama_hparams { | |||
|     float f_max_alibi_bias; | ||||
| 
 | ||||
|     bool operator!=(const llama_hparams & other) const { | ||||
|         if (this->vocab_only != other.vocab_only) return true; | ||||
|         if (this->n_vocab != other.n_vocab) return true; | ||||
|         if (this->vocab_only  != other.vocab_only)  return true; | ||||
|         if (this->n_vocab     != other.n_vocab)     return true; | ||||
|         if (this->n_ctx_train != other.n_ctx_train) return true; | ||||
|         if (this->n_embd != other.n_embd) return true; | ||||
|         if (this->n_head != other.n_head) return true; | ||||
|         if (this->n_head_kv != other.n_head_kv) return true; | ||||
|         if (this->n_layer != other.n_layer) return true; | ||||
|         if (this->n_rot != other.n_rot) return true; | ||||
|         if (this->n_ff != other.n_ff) return true; | ||||
|         if (this->n_embd      != other.n_embd)      return true; | ||||
|         if (this->n_head      != other.n_head)      return true; | ||||
|         if (this->n_head_kv   != other.n_head_kv)   return true; | ||||
|         if (this->n_layer     != other.n_layer)     return true; | ||||
|         if (this->n_rot       != other.n_rot)       return true; | ||||
|         if (this->n_ff        != other.n_ff)        return true; | ||||
| 
 | ||||
|         const float EPSILON = 1e-9; | ||||
| 
 | ||||
|         if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; | ||||
|         if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; | ||||
|         if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; | ||||
|         if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true; | ||||
|         if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true; | ||||
|         if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true; | ||||
|         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; | ||||
| 
 | ||||
|         return false; | ||||
|  | @ -1195,11 +1195,11 @@ struct llama_vocab { | |||
|     id special_sep_id = -1; | ||||
|     id special_pad_id = -1; | ||||
| 
 | ||||
|     id linefeed_id = 13; | ||||
|     id linefeed_id       = 13; | ||||
|     id special_prefix_id = 32007; | ||||
|     id special_middle_id = 32009; | ||||
|     id special_suffix_id = 32008; | ||||
|     id special_eot_id = 32010; | ||||
|     id special_eot_id    = 32010; | ||||
| 
 | ||||
|     int find_bpe_rank(std::string token_left, std::string token_right) const { | ||||
|         replace_all(token_left,  " ",  "\u0120"); | ||||
|  | @ -1359,10 +1359,7 @@ static bool llama_kv_cache_init( | |||
|     cache.cells.clear(); | ||||
|     cache.cells.resize(n_ctx); | ||||
| 
 | ||||
|     // TODO: this should be:
 | ||||
|     //       cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
 | ||||
|     //       change it and test that it works
 | ||||
|     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); | ||||
|     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead()); | ||||
|     memset(cache.buf.data, 0, cache.buf.size); | ||||
| 
 | ||||
|     struct ggml_init_params params; | ||||
|  | @ -7417,37 +7414,15 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array | |||
|     llama_sample_temp(ctx, candidates_p, temp); | ||||
| } | ||||
| 
 | ||||
| void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) { | ||||
|     if (last_tokens_size == 0 || penalty == 1.0f) { | ||||
|         return; | ||||
|     } | ||||
| 
 | ||||
|     const int64_t t_start_sample_us = ggml_time_us(); | ||||
| 
 | ||||
|     for (size_t i = 0; i < candidates->size; ++i) { | ||||
|         const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id); | ||||
|         if (token_iter == last_tokens + last_tokens_size) { | ||||
|             continue; | ||||
|         } | ||||
| 
 | ||||
|         // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
 | ||||
|         // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
 | ||||
|         if (candidates->data[i].logit <= 0) { | ||||
|             candidates->data[i].logit *= penalty; | ||||
|         } else { | ||||
|             candidates->data[i].logit /= penalty; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     candidates->sorted = false; | ||||
| 
 | ||||
|     if (ctx) { | ||||
|         ctx->t_sample_us += ggml_time_us() - t_start_sample_us; | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) { | ||||
|     if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) { | ||||
| void llama_sample_repetition_penalties( | ||||
|             struct llama_context * ctx, | ||||
|           llama_token_data_array * candidates, | ||||
|                const llama_token * last_tokens, | ||||
|                           size_t   penalty_last_n, | ||||
|                            float   penalty_repeat, | ||||
|                            float   penalty_freq, | ||||
|                            float   penalty_present) { | ||||
|     if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) { | ||||
|         return; | ||||
|     } | ||||
| 
 | ||||
|  | @ -7455,19 +7430,28 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l | |||
| 
 | ||||
|     // Create a frequency map to count occurrences of each token in last_tokens
 | ||||
|     std::unordered_map<llama_token, int> token_count; | ||||
|     for (size_t i = 0; i < last_tokens_size; ++i) { | ||||
|         token_count[last_tokens_p[i]]++; | ||||
|     for (size_t i = 0; i < penalty_last_n; ++i) { | ||||
|         token_count[last_tokens[i]]++; | ||||
|     } | ||||
| 
 | ||||
|     // Apply frequency and presence penalties to the candidates
 | ||||
|     for (size_t i = 0; i < candidates->size; ++i) { | ||||
|         auto token_iter = token_count.find(candidates->data[i].id); | ||||
|         const auto token_iter = token_count.find(candidates->data[i].id); | ||||
|         if (token_iter == token_count.end()) { | ||||
|             continue; | ||||
|         } | ||||
| 
 | ||||
|         int count = token_iter->second; | ||||
|         candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence; | ||||
|         const int count = token_iter->second; | ||||
| 
 | ||||
|         // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
 | ||||
|         // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
 | ||||
|         if (candidates->data[i].logit <= 0) { | ||||
|             candidates->data[i].logit *= penalty_repeat; | ||||
|         } else { | ||||
|             candidates->data[i].logit /= penalty_repeat; | ||||
|         } | ||||
| 
 | ||||
|         candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present; | ||||
|     } | ||||
| 
 | ||||
|     candidates->sorted = false; | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue