From ebeb65194b3754319830d515cbbfefb28737ff19 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 5 Sep 2024 10:25:33 +0300 Subject: [PATCH] sampling : change _cp/copy to clone --- common/sampling.cpp | 10 +++---- common/sampling.h | 2 +- examples/speculative/speculative.cpp | 4 +-- include/llama.h | 17 +++++++++--- src/llama-grammar.cpp | 2 +- src/llama-grammar.h | 2 +- src/llama-sampling.cpp | 40 ++++++++++++++-------------- src/llama-sampling.h | 16 ++--------- src/llama.cpp | 8 +++--- 9 files changed, 50 insertions(+), 51 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 2887207f1..914b579a0 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -114,13 +114,13 @@ void gpt_sampler_free(struct gpt_sampler * gsmpl) { } } -struct gpt_sampler * gpt_sampler_cp(gpt_sampler * gsmpl) { +struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) { return new gpt_sampler { /* .params = */ gsmpl->params, - /* .bias = */ llama_constraint_cp(gsmpl->bias), - /* .pnlt = */ llama_constraint_cp(gsmpl->pnlt), - /* .grmr = */ llama_constraint_cp(gsmpl->grmr), - /* .smpl = */ llama_sampler_cp (gsmpl->smpl) + /* .bias = */ llama_constraint_clone(gsmpl->bias), + /* .pnlt = */ llama_constraint_clone(gsmpl->pnlt), + /* .grmr = */ llama_constraint_clone(gsmpl->grmr), + /* .smpl = */ llama_sampler_clone (gsmpl->smpl) }; } diff --git a/common/sampling.h b/common/sampling.h index 87673efa3..c260ef055 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -68,7 +68,7 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st void gpt_sampler_free(struct gpt_sampler * gsmpl); -struct gpt_sampler * gpt_sampler_cp(gpt_sampler * gsmpl); +struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl); void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool apply_grammar); void gpt_sampler_reset (struct gpt_sampler * gsmpl); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index d51c76849..9f596ec91 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -451,7 +451,7 @@ int main(int argc, char ** argv) { if (drafts[0].smpl) { gpt_sampler_free(drafts[0].smpl); } - drafts[0].smpl = gpt_sampler_cp(smpl); + drafts[0].smpl = gpt_sampler_clone(smpl); int n_seq_cur = 1; int n_past_cur = n_past_dft; @@ -523,7 +523,7 @@ int main(int argc, char ** argv) { if (drafts[n_seq_cur].smpl) { gpt_sampler_free(drafts[n_seq_cur].smpl); } - drafts[n_seq_cur].smpl = gpt_sampler_cp(drafts[s].smpl); + drafts[n_seq_cur].smpl = gpt_sampler_clone(drafts[s].smpl); sa.push_back(n_seq_cur); diff --git a/include/llama.h b/include/llama.h index 02f7a8491..0fc45bef3 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1032,7 +1032,7 @@ extern "C" { void (*accept)( struct llama_constraint * cnstr, llama_token token); // can be NULL void (*apply) ( struct llama_constraint * cnstr, llama_token_data_array * cur_p); // required void (*reset) ( struct llama_constraint * cnstr); // can be NULL - struct llama_constraint * (*copy) (const struct llama_constraint * cnstr); // can be NULL if ctx is NULL + struct llama_constraint * (*clone) (const struct llama_constraint * cnstr); // can be NULL if ctx is NULL void (*free) ( struct llama_constraint * cnstr); // can be NULL if ctx is NULL // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph @@ -1053,11 +1053,22 @@ extern "C" { LLAMA_API struct llama_constraint * llama_constraint_init_temp (float t); LLAMA_API struct llama_constraint * llama_constraint_init_temp_ext (float t, float delta, float exponent); + /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. + /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. + /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. + /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. + /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. + /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. LLAMA_API struct llama_constraint * llama_constraint_init_mirostat( const struct llama_model * model, float tau, float eta); + /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. + /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. + /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. + /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. + /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. LLAMA_API struct llama_constraint * llama_constraint_init_mirostat_v2( float tau, float eta); @@ -1081,7 +1092,7 @@ extern "C" { int32_t n_logit_bias, const llama_logit_bias * logit_bias); - LLAMA_API struct llama_constraint * llama_constraint_cp(const struct llama_constraint * cnstr); + LLAMA_API struct llama_constraint * llama_constraint_clone(const struct llama_constraint * cnstr); // important: do not call if the constraint has been added to a llama_sampler (via llama_sampler_constraint_add) LLAMA_API void llama_constraint_free(struct llama_constraint * cnstr); @@ -1094,7 +1105,7 @@ extern "C" { LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_model * model, struct llama_sampler_params params); LLAMA_API void llama_sampler_free ( struct llama_sampler * smpl); - LLAMA_API struct llama_sampler * llama_sampler_cp (const struct llama_sampler * smpl); + LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl); LLAMA_API void llama_sampler_reset ( struct llama_sampler * smpl); LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token); LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p); diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp index a9813ebbf..09f756fbe 100644 --- a/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp @@ -1050,7 +1050,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) { delete grammar; } -struct llama_grammar * llama_grammar_cp_impl(const struct llama_grammar & grammar) { +struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) { llama_grammar * result = new llama_grammar { grammar.vocab, grammar.rules, grammar.stacks, grammar.partial_utf8, }; // redirect elements in stacks to point to new rules diff --git a/src/llama-grammar.h b/src/llama-grammar.h index 6b9a2af8d..419a616d6 100644 --- a/src/llama-grammar.h +++ b/src/llama-grammar.h @@ -131,7 +131,7 @@ struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, void llama_grammar_free_impl(struct llama_grammar * grammar); -struct llama_grammar * llama_grammar_cp_impl(const struct llama_grammar & grammar); +struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar); // TODO: move the API below as member functions of llama_grammar void llama_grammar_apply_impl( diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index c07c509bc..bf71f98f1 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -433,7 +433,7 @@ static struct llama_constraint_i llama_constraint_softmax_i = { llama_constraint_softmax_impl(cur_p); }, /* .reset = */ nullptr, - /* .copy = */ nullptr, + /* .clone = */ nullptr, /* .free = */ nullptr, }; @@ -458,7 +458,7 @@ static struct llama_constraint_i llama_constraint_top_k_i = { llama_constraint_top_k_impl(cur_p, ctx->k); }, /* .reset = */ nullptr, - /* .copy = */ [](const struct llama_constraint * cnstr) { + /* .clone = */ [](const struct llama_constraint * cnstr) { const auto * ctx = (const llama_constraint_context_top_k *) cnstr->ctx; return llama_constraint_init_top_k_impl(ctx->k); }, @@ -491,7 +491,7 @@ static struct llama_constraint_i llama_constraint_top_p_i = { llama_constraint_top_p_impl(cur_p, ctx->p, ctx->min_keep); }, /* .reset = */ nullptr, - /* .copy = */ [](const struct llama_constraint * cnstr) { + /* .clone = */ [](const struct llama_constraint * cnstr) { const auto * ctx = (const llama_constraint_context_top_p *) cnstr->ctx; return llama_constraint_init_top_p_impl(ctx->p, ctx->min_keep); }, @@ -525,7 +525,7 @@ static struct llama_constraint_i llama_constraint_min_p_i = { llama_constraint_min_p_impl(cur_p, ctx->p, ctx->min_keep); }, /* .reset = */ nullptr, - /* .copy = */ [](const struct llama_constraint * cnstr) { + /* .clone = */ [](const struct llama_constraint * cnstr) { const auto * ctx = (const llama_constraint_context_min_p *) cnstr->ctx; return llama_constraint_init_min_p_impl(ctx->p, ctx->min_keep); }, @@ -559,7 +559,7 @@ static struct llama_constraint_i llama_constraint_tail_free_i = { llama_constraint_tail_free_impl(cur_p, ctx->z, ctx->min_keep); }, /* .reset = */ nullptr, - /* .copy = */ [](const struct llama_constraint * cnstr) { + /* .clone = */ [](const struct llama_constraint * cnstr) { const auto * ctx = (const llama_constraint_context_tail_free *) cnstr->ctx; return llama_constraint_init_tail_free_impl(ctx->z, ctx->min_keep); }, @@ -593,7 +593,7 @@ static struct llama_constraint_i llama_constraint_typical_i = { llama_constraint_typical_impl(cur_p, ctx->p, ctx->min_keep); }, /* .reset = */ nullptr, - /* .copy = */ [](const struct llama_constraint * cnstr) { + /* .clone = */ [](const struct llama_constraint * cnstr) { const auto * ctx = (const llama_constraint_context_typical *) cnstr->ctx; return llama_constraint_init_typical_impl(ctx->p, ctx->min_keep); }, @@ -626,7 +626,7 @@ static struct llama_constraint_i llama_constraint_temp_i = { llama_constraint_temp_impl(cur_p, ctx->temp); }, /* .reset = */ nullptr, - /* .copy = */ [](const struct llama_constraint * cnstr) { + /* .clone = */ [](const struct llama_constraint * cnstr) { const auto * ctx = (const llama_constraint_context_temp *) cnstr->ctx; return llama_constraint_init_temp_impl(ctx->temp); }, @@ -667,7 +667,7 @@ static struct llama_constraint_i llama_constraint_temp_ext_i = { } }, /* .reset = */ nullptr, - /* .copy = */ [](const struct llama_constraint * cnstr) { + /* .clone = */ [](const struct llama_constraint * cnstr) { const auto * ctx = (const llama_constraint_context_temp_ext *) cnstr->ctx; return llama_constraint_init_temp_ext_impl(ctx->temp, ctx->delta, ctx->exponent); }, @@ -754,7 +754,7 @@ static struct llama_constraint_i llama_constraint_mirostat_i = { auto * ctx = (llama_constraint_context_mirostat *) cnstr->ctx; ctx->mu = 2.0f*ctx->tau; }, - /* .copy = */ [](const struct llama_constraint * cnstr) { + /* .clone = */ [](const struct llama_constraint * cnstr) { const auto * ctx = (const llama_constraint_context_mirostat *) cnstr->ctx; return llama_constraint_init_mirostat_impl(*ctx->vocab, ctx->tau, ctx->eta, ctx->m); }, @@ -834,7 +834,7 @@ static struct llama_constraint_i llama_constraint_mirostat_v2_i = { auto * ctx = (llama_constraint_context_mirostat_v2 *) cnstr->ctx; ctx->mu = 2.0f*ctx->tau; }, - /* .copy = */ [](const struct llama_constraint * cnstr) { + /* .clone = */ [](const struct llama_constraint * cnstr) { const auto * ctx = (const llama_constraint_context_mirostat_v2 *) cnstr->ctx; return llama_constraint_init_mirostat_v2_impl(ctx->tau, ctx->eta); }, @@ -891,7 +891,7 @@ static struct llama_constraint_i llama_constraint_grammar_i = { llama_grammar_free_impl(ctx->grammar); ctx->grammar = grammar_new; }, - /* .copy = */ [](const struct llama_constraint * cnstr) { + /* .clone = */ [](const struct llama_constraint * cnstr) { const auto * ctx_src = (const llama_constraint_context_grammar *) cnstr->ctx; auto * result = llama_constraint_init_grammar_impl(*ctx_src->vocab, nullptr, nullptr); @@ -901,7 +901,7 @@ static struct llama_constraint_i llama_constraint_grammar_i = { ctx_dst->grammar_str = ctx_src->grammar_str; ctx_dst->grammar_root = ctx_src->grammar_root; - ctx_dst->grammar = llama_grammar_cp_impl(*ctx_src->grammar); + ctx_dst->grammar = llama_grammar_clone_impl(*ctx_src->grammar); } return result; @@ -998,7 +998,7 @@ static struct llama_constraint_i llama_constraint_penalties_i = { auto * ctx = (llama_constraint_context_penalties *) cnstr->ctx; ctx->prev.clear(); }, - /* .copy = */ [](const struct llama_constraint * cnstr) { + /* .clone = */ [](const struct llama_constraint * cnstr) { const auto * ctx_src = (const llama_constraint_context_penalties *) cnstr->ctx; auto * result = llama_constraint_init_penalties_impl( *ctx_src->vocab, @@ -1059,7 +1059,7 @@ static struct llama_constraint_i llama_constraint_logit_bias_i = { } }, /* .reset = */ nullptr, - /* .copy = */ [](const struct llama_constraint * cnstr) { + /* .clone = */ [](const struct llama_constraint * cnstr) { const auto * ctx_src = (const llama_constraint_context_logit_bias *) cnstr->ctx; return llama_constraint_init_logit_bias_impl(*ctx_src->vocab, ctx_src->logit_bias.size(), ctx_src->logit_bias.data()); }, @@ -1083,8 +1083,8 @@ struct llama_constraint * llama_constraint_init_logit_bias_impl( //////////////////////////////////////// -struct llama_constraint * llama_constraint_cp_impl(const struct llama_constraint & cnstr) { - return cnstr.iface->copy ? cnstr.iface->copy(&cnstr) : nullptr; +struct llama_constraint * llama_constraint_clone_impl(const struct llama_constraint & cnstr) { + return cnstr.iface->clone ? cnstr.iface->clone(&cnstr) : nullptr; } void llama_constraint_free_impl(struct llama_constraint * cnstr) { @@ -1148,7 +1148,7 @@ void llama_sampler_free_impl(struct llama_sampler * smpl) { delete smpl; } -struct llama_sampler * llama_sampler_cp_impl(const struct llama_sampler & smpl) { +struct llama_sampler * llama_sampler_clone_impl(const struct llama_sampler & smpl) { auto * result = new llama_sampler { /* .params = */ smpl.params, /* .vocab = */ smpl.vocab, @@ -1163,7 +1163,7 @@ struct llama_sampler * llama_sampler_cp_impl(const struct llama_sampler & smpl) /* .n_sample = */ 0, }; - // copy the constraints objects + // clone the constraints objects result->constraints.clear(); for (const auto & cnstr : smpl.constraints) { if (cnstr->ctx == nullptr) { @@ -1172,8 +1172,8 @@ struct llama_sampler * llama_sampler_cp_impl(const struct llama_sampler & smpl) /* .ctx = */ nullptr, }); } else { - GGML_ASSERT(cnstr->iface->copy); - result->constraints.push_back(cnstr->iface->copy(cnstr)); + GGML_ASSERT(cnstr->iface->clone); + result->constraints.push_back(cnstr->iface->clone(cnstr)); } } diff --git a/src/llama-sampling.h b/src/llama-sampling.h index 1295bc823..453650b28 100644 --- a/src/llama-sampling.h +++ b/src/llama-sampling.h @@ -29,24 +29,12 @@ struct llama_constraint * llama_constraint_init_typical_impl (float p, size struct llama_constraint * llama_constraint_init_temp_impl (float t); struct llama_constraint * llama_constraint_init_temp_ext_impl (float t, float delta, float exponent); -/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. -/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. -/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. -/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. -/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. -/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - struct llama_constraint * llama_constraint_init_mirostat_impl( const struct llama_vocab & vocab, float tau, float eta, int32_t m); -/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. -/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. -/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. -/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. -/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. struct llama_constraint * llama_constraint_init_mirostat_v2_impl( float tau, float eta); @@ -70,7 +58,7 @@ struct llama_constraint * llama_constraint_init_penalties_impl( int32_t n_logit_bias, const llama_logit_bias * logit_bias); -struct llama_constraint * llama_constraint_cp_impl(const struct llama_constraint & cnstr); +struct llama_constraint * llama_constraint_clone_impl(const struct llama_constraint & cnstr); void llama_constraint_free_impl(struct llama_constraint * cnstr); @@ -106,7 +94,7 @@ struct llama_sampler { struct llama_sampler * llama_sampler_init_impl (const struct llama_vocab & vocab, struct llama_sampler_params params); void llama_sampler_free_impl ( struct llama_sampler * smpl); -struct llama_sampler * llama_sampler_cp_impl (const struct llama_sampler & smpl); +struct llama_sampler * llama_sampler_clone_impl (const struct llama_sampler & smpl); void llama_sampler_reset_impl ( struct llama_sampler & smpl); void llama_sampler_accept_impl( struct llama_sampler & smpl, llama_token token); void llama_sampler_apply_impl ( struct llama_sampler & smpl, struct llama_token_data_array * cur_p); diff --git a/src/llama.cpp b/src/llama.cpp index 6a30daf39..436c21f9d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -20669,8 +20669,8 @@ LLAMA_API struct llama_constraint * llama_constraint_init_logit_bias( return llama_constraint_init_logit_bias_impl(model->vocab, n_logit_bias, logit_bias); } -struct llama_constraint * llama_constraint_cp(const struct llama_constraint * cnstr) { - return llama_constraint_cp_impl(*cnstr); +struct llama_constraint * llama_constraint_clone(const struct llama_constraint * cnstr) { + return llama_constraint_clone_impl(*cnstr); } void llama_constraint_free(struct llama_constraint * cnstr) { @@ -20705,8 +20705,8 @@ void llama_sampler_free(struct llama_sampler * smpl) { llama_sampler_free_impl(smpl); } -struct llama_sampler * llama_sampler_cp(const struct llama_sampler * smpl) { - return llama_sampler_cp_impl(*smpl); +struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) { + return llama_sampler_clone_impl(*smpl); } void llama_sampler_reset(struct llama_sampler * smpl) {