sampling : refactor init to use llama_sampling_params
This commit is contained in:
parent
8cf19d60dc
commit
cd1e937821
12 changed files with 110 additions and 142 deletions
|
@ -128,7 +128,7 @@ bool eval_string(struct MyModel * mymodel,const char* str){
|
|||
llama_token sampling_id(struct MyModel* mymodel) {
|
||||
llama_context* ctx = mymodel->ctx;
|
||||
gpt_params params = mymodel->params;
|
||||
llama_sampling_params & sparams = params.sampling_params;
|
||||
llama_sampling_params & sparams = params.sparams;
|
||||
// int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
// out of user input, sample next token
|
||||
|
|
|
@ -39,8 +39,8 @@ static gpt_params * g_params;
|
|||
static std::vector<llama_token> * g_input_tokens;
|
||||
static std::ostringstream * g_output_ss;
|
||||
static std::vector<llama_token> * g_output_tokens;
|
||||
static bool is_interacting = false;
|
||||
|
||||
static bool is_interacting = false;
|
||||
|
||||
static void write_logfile(
|
||||
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
||||
|
@ -104,7 +104,7 @@ static void sigint_handler(int signo) {
|
|||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
llama_sampling_params & sparams = params.sampling_params;
|
||||
llama_sampling_params & sparams = params.sparams;
|
||||
g_params = &params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
|
@ -363,31 +363,6 @@ int main(int argc, char ** argv) {
|
|||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
LOG_TEE("\n\n");
|
||||
|
||||
struct llama_grammar * grammar = NULL;
|
||||
grammar_parser::parse_state parsed_grammar;
|
||||
|
||||
if (!params.grammar.empty()) {
|
||||
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
||||
// will be empty (default) if there are parse errors
|
||||
if (parsed_grammar.rules.empty()) {
|
||||
return 1;
|
||||
}
|
||||
LOG_TEE("%s: grammar:\n", __func__);
|
||||
grammar_parser::print_grammar(stderr, parsed_grammar);
|
||||
LOG_TEE("\n");
|
||||
|
||||
{
|
||||
auto it = sparams.logit_bias.find(llama_token_eos(ctx));
|
||||
if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
|
||||
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
||||
grammar = llama_grammar_init(
|
||||
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||
}
|
||||
|
||||
LOG_TEE("\n##### Infill mode #####\n\n");
|
||||
if (params.infill) {
|
||||
printf("\n************\n");
|
||||
|
@ -430,7 +405,7 @@ int main(int argc, char ** argv) {
|
|||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> embd_guidance;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
|
||||
|
||||
while (n_remain != 0 || params.interactive) {
|
||||
// predict
|
||||
|
@ -740,15 +715,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
if (n_past > 0) {
|
||||
if (is_interacting) {
|
||||
// reset grammar state if we're restarting generation
|
||||
if (grammar != NULL) {
|
||||
llama_grammar_free(grammar);
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
||||
grammar = llama_grammar_init(
|
||||
grammar_rules.data(), grammar_rules.size(),
|
||||
parsed_grammar.symbol_ids.at("root"));
|
||||
}
|
||||
llama_sampling_reset(ctx_sampling);
|
||||
}
|
||||
is_interacting = false;
|
||||
}
|
||||
|
@ -778,9 +745,7 @@ int main(int argc, char ** argv) {
|
|||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
if (grammar != NULL) {
|
||||
llama_grammar_free(grammar);
|
||||
}
|
||||
llama_sampling_free(ctx_sampling);
|
||||
llama_backend_free();
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
|
|
|
@ -58,28 +58,30 @@ inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n
|
|||
|
||||
// TODO: use common/sampling.h
|
||||
inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
|
||||
// out of user input, sample next token
|
||||
const float temp = params.sampling_params.temp;
|
||||
const int32_t top_k = params.sampling_params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : params.sampling_params.top_k;
|
||||
const float top_p = params.sampling_params.top_p;
|
||||
const float tfs_z = params.sampling_params.tfs_z;
|
||||
const float typical_p = params.sampling_params.typical_p;
|
||||
// const int32_t repeat_last_n = params.sampling_params.repeat_last_n < 0 ? n_ctx : params.sampling_params.repeat_last_n;
|
||||
// const float repeat_penalty = params.sampling_params.repeat_penalty;
|
||||
// const float alpha_presence = params.sampling_params.presence_penalty;
|
||||
// const float alpha_frequency = params.sampling_params.frequency_penalty;
|
||||
const int mirostat = params.sampling_params.mirostat;
|
||||
const float mirostat_tau = params.sampling_params.mirostat_tau;
|
||||
const float mirostat_eta = params.sampling_params.mirostat_eta;
|
||||
// const bool penalize_nl = params.sampling_params.penalize_nl;
|
||||
auto & sparams = params.sparams;
|
||||
|
||||
// out of user input, sample next token
|
||||
const float temp = sparams.temp;
|
||||
const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
|
||||
const float top_p = sparams.top_p;
|
||||
const float tfs_z = sparams.tfs_z;
|
||||
const float typical_p = sparams.typical_p;
|
||||
// const int32_t repeat_last_n = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
|
||||
// const float repeat_penalty = sparams.repeat_penalty;
|
||||
// const float alpha_presence = sparams.presence_penalty;
|
||||
// const float alpha_frequency = sparams.frequency_penalty;
|
||||
const int mirostat = sparams.mirostat;
|
||||
const float mirostat_tau = sparams.mirostat_tau;
|
||||
const float mirostat_eta = sparams.mirostat_eta;
|
||||
// const bool penalize_nl = sparams.penalize_nl;
|
||||
|
||||
llama_token id = 0;
|
||||
{
|
||||
auto logits = llama_get_logits(ctx_llama);
|
||||
auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = params.sampling_params.logit_bias.begin(); it != params.sampling_params.logit_bias.end(); it++) {
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
|
@ -91,18 +93,18 @@ inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
|
|||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
// TODO: Apply penalties
|
||||
// float nl_logit = logits[llama_token_nl(ctx)];
|
||||
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
|
||||
// llama_sample_repetition_penalty(ctx, &candidates_p,
|
||||
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
// last_n_repeat, repeat_penalty);
|
||||
// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
|
||||
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
// last_n_repeat, alpha_frequency, alpha_presence);
|
||||
// if (!penalize_nl) {
|
||||
// logits[llama_token_nl(ctx)] = nl_logit;
|
||||
// }
|
||||
// TODO: Apply penalties
|
||||
// float nl_logit = logits[llama_token_nl(ctx)];
|
||||
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
|
||||
// llama_sample_repetition_penalty(ctx, &candidates_p,
|
||||
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
// last_n_repeat, repeat_penalty);
|
||||
// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
|
||||
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
// last_n_repeat, alpha_frequency, alpha_presence);
|
||||
// if (!penalize_nl) {
|
||||
// logits[llama_token_nl(ctx)] = nl_logit;
|
||||
// }
|
||||
|
||||
if (temp <= 0) {
|
||||
// Greedy sampling
|
||||
|
|
|
@ -108,7 +108,7 @@ int main(int argc, char ** argv) {
|
|||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
return 1;
|
||||
}
|
||||
llama_sampling_params & sparams = params.sampling_params;
|
||||
llama_sampling_params & sparams = params.sparams;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("main", "log"));
|
||||
|
@ -459,7 +459,7 @@ int main(int argc, char ** argv) {
|
|||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> embd_guidance;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
|
||||
|
||||
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
|
||||
// predict
|
||||
|
|
|
@ -157,7 +157,7 @@ int main(int argc, char ** argv) {
|
|||
for (size_t i = 0; i < clients.size(); ++i) {
|
||||
auto & client = clients[i];
|
||||
client.id = i;
|
||||
client.ctx_sampling = llama_sampling_init(params);
|
||||
client.ctx_sampling = llama_sampling_init(params.sparams);
|
||||
}
|
||||
|
||||
std::vector<llama_token> tokens_system;
|
||||
|
|
|
@ -232,7 +232,7 @@ struct llama_server_context
|
|||
void rewind()
|
||||
{
|
||||
params.antiprompt.clear();
|
||||
params.grammar.clear();
|
||||
params.sparams.grammar.clear();
|
||||
num_prompt_tokens = 0;
|
||||
num_tokens_predicted = 0;
|
||||
generated_text = "";
|
||||
|
@ -250,7 +250,7 @@ struct llama_server_context
|
|||
if (ctx_sampling != nullptr) {
|
||||
llama_sampling_free(ctx_sampling);
|
||||
}
|
||||
ctx_sampling = llama_sampling_init(params);
|
||||
ctx_sampling = llama_sampling_init(params.sparams);
|
||||
}
|
||||
|
||||
bool loadModel(const gpt_params &params_)
|
||||
|
@ -313,7 +313,7 @@ struct llama_server_context
|
|||
|
||||
bool loadGrammar()
|
||||
{
|
||||
ctx_sampling = llama_sampling_init(params);
|
||||
ctx_sampling = llama_sampling_init(params.sparams);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -530,8 +530,8 @@ struct llama_server_context
|
|||
|
||||
llama_token_data_array cur_p = { ctx_sampling->cur.data(), ctx_sampling->cur.size(), false };
|
||||
|
||||
const int32_t n_probs = params.sampling_params.n_probs;
|
||||
if (params.sampling_params.temp <= 0 && n_probs > 0)
|
||||
const int32_t n_probs = params.sparams.n_probs;
|
||||
if (params.sparams.temp <= 0 && n_probs > 0)
|
||||
{
|
||||
// For llama_sample_token_greedy we need to sort candidates
|
||||
llama_sample_softmax(ctx, &cur_p);
|
||||
|
@ -606,7 +606,7 @@ struct llama_server_context
|
|||
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
|
||||
generated_text += token_text;
|
||||
|
||||
if (params.sampling_params.n_probs > 0)
|
||||
if (params.sparams.n_probs > 0)
|
||||
{
|
||||
generated_token_probs.push_back(token_with_probs);
|
||||
}
|
||||
|
@ -1004,7 +1004,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
|
||||
static json format_generation_settings(llama_server_context &llama)
|
||||
{
|
||||
const auto & sparams = llama.params.sampling_params;
|
||||
const auto & sparams = llama.params.sparams;
|
||||
const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx));
|
||||
const bool ignore_eos = eos_bias != sparams.logit_bias.end() &&
|
||||
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
|
||||
|
@ -1033,7 +1033,7 @@ static json format_generation_settings(llama_server_context &llama)
|
|||
{"stream", llama.stream},
|
||||
{"logit_bias", sparams.logit_bias},
|
||||
{"n_probs", sparams.n_probs},
|
||||
{"grammar", llama.params.grammar},
|
||||
{"grammar", llama.params.sparams.grammar},
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -1081,7 +1081,7 @@ static json format_final_response(llama_server_context &llama, const std::string
|
|||
{"timings", format_timings(llama)},
|
||||
};
|
||||
|
||||
if (llama.params.sampling_params.n_probs > 0)
|
||||
if (llama.params.sparams.n_probs > 0)
|
||||
{
|
||||
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
|
||||
}
|
||||
|
@ -1097,7 +1097,7 @@ static json format_partial_response(
|
|||
{"stop", false},
|
||||
};
|
||||
|
||||
if (llama.params.sampling_params.n_probs > 0)
|
||||
if (llama.params.sparams.n_probs > 0)
|
||||
{
|
||||
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
|
||||
}
|
||||
|
@ -1129,11 +1129,13 @@ static T json_value(const json &body, const std::string &key, const T &default_v
|
|||
static void parse_options_completion(const json &body, llama_server_context &llama)
|
||||
{
|
||||
gpt_params default_params;
|
||||
const auto & default_sparams = default_params.sampling_params;
|
||||
auto & sparams = llama.params.sampling_params;
|
||||
const auto & default_sparams = default_params.sparams;
|
||||
|
||||
auto & params = llama.params;
|
||||
auto & sparams = llama.params.sparams;
|
||||
|
||||
llama.stream = json_value(body, "stream", false);
|
||||
llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
|
||||
params.n_predict = json_value(body, "n_predict", default_params.n_predict);
|
||||
sparams.top_k = json_value(body, "top_k", default_sparams.top_k);
|
||||
sparams.top_p = json_value(body, "top_p", default_sparams.top_p);
|
||||
sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z);
|
||||
|
@ -1147,9 +1149,9 @@ static void parse_options_completion(const json &body, llama_server_context &lla
|
|||
sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
|
||||
sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
|
||||
sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl);
|
||||
llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
|
||||
llama.params.seed = json_value(body, "seed", default_params.seed);
|
||||
llama.params.grammar = json_value(body, "grammar", default_params.grammar);
|
||||
params.n_keep = json_value(body, "n_keep", default_params.n_keep);
|
||||
params.seed = json_value(body, "seed", default_params.seed);
|
||||
sparams.grammar = json_value(body, "grammar", default_sparams.grammar);
|
||||
sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs);
|
||||
|
||||
if (body.count("prompt") != 0)
|
||||
|
@ -1204,7 +1206,7 @@ static void parse_options_completion(const json &body, llama_server_context &lla
|
|||
}
|
||||
}
|
||||
|
||||
llama.ctx_sampling = llama_sampling_init(llama.params);
|
||||
llama.ctx_sampling = llama_sampling_init(llama.params.sparams);
|
||||
|
||||
LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
|
||||
}
|
||||
|
@ -1414,7 +1416,7 @@ int main(int argc, char **argv)
|
|||
}
|
||||
|
||||
auto probs = llama.generated_token_probs;
|
||||
if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) {
|
||||
if (llama.params.sparams.n_probs > 0 && llama.stopped_word) {
|
||||
const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
|
||||
probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
|
||||
}
|
||||
|
@ -1466,7 +1468,7 @@ int main(int argc, char **argv)
|
|||
|
||||
std::vector<completion_token_output> probs_output = {};
|
||||
|
||||
if (llama.params.sampling_params.n_probs > 0) {
|
||||
if (llama.params.sparams.n_probs > 0) {
|
||||
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
||||
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
||||
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
||||
|
@ -1587,7 +1589,7 @@ int main(int argc, char **argv)
|
|||
|
||||
std::vector<completion_token_output> probs_output = {};
|
||||
|
||||
if (llama.params.sampling_params.n_probs > 0) {
|
||||
if (llama.params.sparams.n_probs > 0) {
|
||||
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
||||
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
||||
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
||||
|
|
|
@ -112,16 +112,16 @@ int main(int argc, char ** argv) {
|
|||
bool has_eos = false;
|
||||
|
||||
// target model sampling context
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
|
||||
|
||||
// draft sequence data
|
||||
std::vector<seq_draft> drafts(n_seq_dft);
|
||||
|
||||
params.grammar.clear(); // the draft samplers will copy the target sampler's grammar
|
||||
params.sampling_params.temp = std::max(0.01f, params.sampling_params.temp);
|
||||
params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
|
||||
params.sparams.temp = std::max(0.01f, params.sparams.temp);
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
drafts[s].ctx_sampling = llama_sampling_init(params);
|
||||
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
|
||||
}
|
||||
|
||||
llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue