llama : use struct llama_sampling in the sampling API

ggml-ci
2024-07-23 17:35:28 +03:00 · 2024-07-23 17:35:28 +03:00 · dbf85440c7
commit dbf85440c7
parent f866cb9342
30 changed files with 437 additions and 395 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -2125,7 +2125,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
        llama_kv_cache_clear(lctx);
        llama_synchronize(lctx);
-        llama_reset_timings(lctx);
+        llama_reset_timings(lctx, nullptr, nullptr);
    }

    return std::make_tuple(model, lctx);
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -2,12 +2,11 @@

 #include <random>

-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, struct llama_context * ctx, llama_seq_id seq_id) {
+struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, struct llama_sampling * smpl) {
    struct llama_sampling_context * result = new llama_sampling_context();

    result->params  = params;
-    result->seq_id  = seq_id;
-    result->ctx     = ctx;
+    result->smpl    = smpl;
    result->grammar = nullptr;

    // if there is a grammar, parse it
@ -43,7 +42,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_

    result->n_valid = 0;

-    llama_sampling_set_rng_seed(result, params.seed);
+    llama_sampling_set_rng_seed(result->smpl, params.seed);

    return result;
 }
@ -79,13 +78,6 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
    ctx->n_valid = 0;
 }

-void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
-    if (seed == LLAMA_DEFAULT_SEED) {
-        seed = std::random_device{}();
-    }
-    llama_set_rng_seed_seq(ctx->ctx, seed, ctx->seq_id);
-}
-
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
    if (dst->grammar) {
        llama_grammar_free(dst->grammar);
@ -230,10 +222,13 @@ std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::strin

 // no reasons to expose this function in header
 static void sampler_queue(
-                   struct llama_context * ctx_main,
-            const llama_sampling_params & params,
+          struct llama_sampling_context * ctx_sampling,
                 llama_token_data_array & cur_p,
                                 size_t   min_keep) {
+    llama_sampling * smpl = ctx_sampling->smpl;
+
+    const llama_sampling_params & params = ctx_sampling->params;
+
    const float         temp              = params.temp;
    const float         dynatemp_range    = params.dynatemp_range;
    const float         dynatemp_exponent = params.dynatemp_exponent;
@ -246,18 +241,18 @@ static void sampler_queue(

    for (auto sampler_type : samplers_sequence) {
        switch (sampler_type) {
-            case llama_sampler_type::TOP_K    : llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
-            case llama_sampler_type::TFS_Z    : llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
-            case llama_sampler_type::TYPICAL_P: llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
-            case llama_sampler_type::TOP_P    : llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
-            case llama_sampler_type::MIN_P    : llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
+            case llama_sampler_type::TOP_K    : llama_sampling_top_k    (smpl, &cur_p, top_k,     min_keep); break;
+            case llama_sampler_type::TFS_Z    : llama_sampling_tail_free(smpl, &cur_p, tfs_z,     min_keep); break;
+            case llama_sampler_type::TYPICAL_P: llama_sampling_typical  (smpl, &cur_p, typical_p, min_keep); break;
+            case llama_sampler_type::TOP_P    : llama_sampling_top_p    (smpl, &cur_p, top_p,     min_keep); break;
+            case llama_sampler_type::MIN_P    : llama_sampling_min_p    (smpl, &cur_p, min_p,     min_keep); break;
            case llama_sampler_type::TEMPERATURE:
                if (dynatemp_range > 0) {
                    float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
                    float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
-                    llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
+                    llama_sampling_entropy(smpl, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
                } else {
-                    llama_sample_temp(ctx_main, &cur_p, temp);
+                    llama_sampling_temp(smpl, &cur_p, temp);
                }
                break;
            default : break;
@ -271,6 +266,8 @@ static llama_token llama_sampling_sample_impl(
                  struct llama_context * ctx_cfg,
                  const int idx,
                  bool is_resampling) {
+    llama_sampling * smpl = ctx_sampling->smpl;
+
    const llama_sampling_params & params = ctx_sampling->params;

    const float temp         = params.temp;
@ -287,26 +284,26 @@ static llama_token llama_sampling_sample_impl(

    if (temp < 0.0) {
        // greedy sampling, with probs
-        llama_sample_softmax(ctx_main, &cur_p);
+        llama_sampling_softmax(smpl, &cur_p);
        id = cur_p.data[0].id;
    } else if (temp == 0.0) {
        // greedy sampling, no probs
-        id = llama_sample_token_greedy(ctx_main, &cur_p);
+        id = llama_sampling_sample_greedy(smpl, &cur_p);
    } else {
        if (mirostat == 1) {
            const int mirostat_m = 100;
-            llama_sample_temp(ctx_main, &cur_p, temp);
-            id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
+            llama_sampling_temp(smpl, &cur_p, temp);
+            id = llama_sampling_sample_mirostat(smpl, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
        } else if (mirostat == 2) {
-            llama_sample_temp(ctx_main, &cur_p, temp);
-            id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
+            llama_sampling_temp(smpl, &cur_p, temp);
+            id = llama_sampling_sample_mirostat_v2(smpl, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
        } else {
            // temperature sampling
            size_t min_keep = std::max(1, params.min_keep);

-            sampler_queue(ctx_main, params, cur_p, min_keep);
+            sampler_queue(ctx_sampling, cur_p, min_keep);

-            id = llama_sample_token_seq(ctx_main, &cur_p, ctx_sampling->seq_id);
+            id = llama_sampling_sample(smpl, &cur_p);

            //{
            //    const int n_top = 10;
@ -315,11 +312,11 @@ static llama_token llama_sampling_sample_impl(
            //    for (int i = 0; i < n_top; i++) {
            //        const llama_token id = cur_p.data[i].id;
            //        (void)id; // To avoid a warning that id is unused when logging is disabled.
-            //        LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
+            //        LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(smpl, id).c_str(), cur_p.data[i].p);
            //    }
            //}

-            //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
+            //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(smpl, id).c_str());
        }
    }

@ -360,6 +357,8 @@ static llama_token_data_array llama_sampling_prepare_impl(
                  const int idx,
                  bool apply_grammar,
                  std::vector<float> * original_logits) {
+    llama_sampling * smpl = ctx_sampling->smpl;
+
    const llama_sampling_params & params = ctx_sampling->params;

    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
@ -390,7 +389,7 @@ static llama_token_data_array llama_sampling_prepare_impl(

    if (ctx_cfg) {
        float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
-        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
+        llama_sampling_apply_guidance(smpl, logits, logits_guidance, params.cfg_scale);
    }

    cur.resize(n_vocab);
@ -407,7 +406,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
    if (penalty_tokens_used_size) {
        const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];

-        llama_sample_repetition_penalties(ctx_main, &cur_p,
+        llama_sampling_repetition_penalties(smpl, &cur_p,
                penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
                penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);

@ -445,7 +444,7 @@ llama_token_data_array llama_sampling_prepare(
                  const int idx,
                  bool apply_grammar,
                  std::vector<float> * original_logits) {
-    return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
+    return llama_sampling_prepare_impl(ctx_sampling, ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
 }

 void llama_sampling_accept(
--- a/common/sampling.h
+++ b/common/sampling.h
@ -70,12 +70,10 @@ struct llama_sampling_context {
    // parameters that will be used for sampling
    llama_sampling_params params;

-    llama_seq_id seq_id;
-
    // mirostat sampler state
    float mirostat_mu;

-    llama_context * ctx; // TMP
+    llama_sampling * smpl;
    llama_grammar * grammar;

    // internal
@ -91,7 +89,7 @@ struct llama_sampling_context {
 #include "common.h"

 // Create a new sampling context instance.
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, struct llama_context * ctx, llama_seq_id seq_id);
+struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, struct llama_sampling * smpl);

 void llama_sampling_free(struct llama_sampling_context * ctx);

@ -100,9 +98,6 @@ void llama_sampling_free(struct llama_sampling_context * ctx);
 // - reset grammar
 void llama_sampling_reset(llama_sampling_context * ctx);

-// Set the sampler seed
-void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
-
 // Copy the sampler context
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);

--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@ -200,7 +200,7 @@ int main(int argc, char ** argv) {
        }
    }

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr, nullptr);

    llama_batch_free(batch);

--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@ -64,6 +64,7 @@ int main(int argc, char ** argv) {
    ctx_params.n_batch = std::max(n_predict, n_parallel);

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    llama_sampling * smpl = llama_get_sampling(ctx);

    if (ctx == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
@ -180,13 +181,13 @@ int main(int argc, char ** argv) {
            const float top_p = 0.9f;
            const float temp  = 0.4f;

-            llama_sample_top_k(ctx, &candidates_p, top_k, 1);
-            llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-            llama_sample_temp (ctx, &candidates_p, temp);
+            llama_sampling_top_k(smpl, &candidates_p, top_k, 1);
+            llama_sampling_top_p(smpl, &candidates_p, top_p, 1);
+            llama_sampling_temp (smpl, &candidates_p, temp);

-            const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
+            const llama_token new_token_id = llama_sampling_sample(smpl, &candidates_p);

-            //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+            //const llama_token new_token_id = llama_sampling_sample_greedy(smpl, &candidates_p);

            // is it an end of generation? -> mark the stream as finished
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
@ -244,12 +245,13 @@ int main(int argc, char ** argv) {
    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, smpl, nullptr);

    fprintf(stderr, "\n");

    llama_batch_free(batch);

+    llama_sampling_free(smpl);
    llama_free(ctx);
    llama_free_model(model);

--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -258,7 +258,7 @@ int main(int argc, char ** argv) {
    }

    // clean up
-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr, nullptr);
    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr, nullptr);

    llama_free(ctx);
    llama_free_model(model);
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@ -9,7 +9,7 @@
 static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
    std::vector<std::vector<float>> result;

-    const llama_model * mdl = llama_get_model(ctx);
+    const llama_model * model = llama_get_model(ctx);

    llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);

@ -18,16 +18,16 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve

        const std::string input_string = instruction + sentences[i];

-        std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);
+        std::vector<llama_token> inputs = llama_tokenize(model, input_string, true, false);

        const int32_t n_toks = inputs.size();

        // GritLM seems to have EOS = ""
        // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
-        // inputs.push_back(llama_token_eos(mdl));
+        // inputs.push_back(llama_token_eos(model));

        // we want to ignore instruction tokens for mean pooling
-        const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size();
+        const int32_t n_inst = llama_tokenize(model, instruction, true, false).size();

 #ifdef GRIT_DEBUG
        // debug tokens - should be matching as referenced in the GritLM sample
@ -51,7 +51,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
        llama_decode(ctx, batch);

        // get embedding dimensions
-        uint64_t n_embd = llama_n_embd(mdl);
+        uint64_t n_embd = llama_n_embd(model);

        // allocate embedding output
        std::vector<float> emb_unorm(n_embd, 0.0f);
@ -95,8 +95,9 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
 static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) {
    std::string result;

-    const llama_model * mdl = llama_get_model(ctx);
-    llama_token eos_token = llama_token_eos(mdl);
+    const llama_model * model = llama_get_model(ctx);
+    llama_sampling * smpl = llama_get_sampling(ctx);
+    llama_token eos_token = llama_token_eos(model);

    llama_kv_cache_clear(ctx);
    llama_set_embeddings(ctx, false);
@ -104,7 +105,7 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo

    llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);

-    std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
+    std::vector<llama_token> inputs = llama_tokenize(model, prompt, false, true);
    int32_t i_current_token = 0;

    while (true) {
@ -118,14 +119,14 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
        llama_decode(ctx, bat);
        auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);

-        auto candidates = std::vector<llama_token_data>(llama_n_vocab(mdl));
+        auto candidates = std::vector<llama_token_data>(llama_n_vocab(model));
        auto n_candidates = (int32_t)candidates.size();
        for (int32_t token = 0; token < n_candidates; token++) {
            candidates[token] = llama_token_data{ token, logits[token], 0.0f };
        }
        auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false };

-        llama_token token = llama_sample_token_greedy(ctx, &candidates_p);
+        llama_token token = llama_sampling_sample_greedy(smpl, &candidates_p);
        if (token == eos_token) {
            break;
        }
@ -167,10 +168,10 @@ int main(int argc, char * argv[]) {

    llama_backend_init();

-    llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);

    // create generation context
-    llama_context * ctx = llama_new_context_with_model(mdl, cparams);
+    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ### Embedding/Representation ###
    // samples taken from: https://github.com/ContextualAI/gritlm#basic
@ -191,7 +192,7 @@ int main(int argc, char * argv[]) {
        const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
        const std::vector<std::vector<float>> q_rep = encode(ctx, queries,   gritlm_instruction(instruction));

-        const int n_embd = llama_n_embd(mdl);
+        const int n_embd = llama_n_embd(model);

        const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
        const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
@ -212,7 +213,7 @@ int main(int argc, char * argv[]) {
    }

    llama_free(ctx);
-    llama_free_model(mdl);
+    llama_free_model(model);
    llama_backend_free();

    return 0;
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@ -638,7 +638,7 @@ int main(int argc, char ** argv) {

    g_collector.save_imatrix();

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr, nullptr);

    llama_free(ctx);
    llama_free_model(model);
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@ -34,6 +34,7 @@

 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
+static llama_sampling_context  ** g_ctx_sampling;
 static gpt_params               * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
@ -93,7 +94,7 @@ static void sigint_handler(int signo) {
        } else {
            console::cleanup();
            printf("\n");
-            llama_print_timings(*g_ctx);
+            llama_print_timings(*g_ctx, (*g_ctx_sampling)->smpl, (*g_ctx_sampling)->grammar);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
            _exit(130);
        }
@ -171,11 +172,13 @@ int main(int argc, char ** argv) {
    llama_backend_init();
    llama_numa_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
+    llama_model * model = nullptr;
+    llama_context * ctx = nullptr;
+    llama_sampling_context * ctx_sampling = nullptr;

    g_model = &model;
    g_ctx = &ctx;
+    g_ctx_sampling = &ctx_sampling;

    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
@ -346,7 +349,7 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> embd;

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams, ctx, 0);
+    ctx_sampling = llama_sampling_init(sparams, llama_get_sampling(ctx));

    while (n_remain != 0 || params.interactive) {
        // predict
@ -635,7 +638,7 @@ int main(int argc, char ** argv) {
        fflush(stdout);
    }

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, ctx_sampling->smpl, ctx_sampling->grammar);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

    llama_free(ctx);
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -1434,7 +1434,7 @@ int main(int argc, char ** argv) {
            fflush(p_err->fout);
        }

-        llama_print_timings(ctx);
+        llama_print_timings(ctx, nullptr, nullptr);

        llama_free(ctx);
    }
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_

    LOG_TEE("\n");

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams, ctx_llava->ctx_llama, 0);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams, llama_get_sampling(ctx_llava->ctx_llama));
    if (!ctx_sampling) {
        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
        exit(1);
@ -310,7 +310,7 @@ int main(int argc, char ** argv) {
        // process the prompt
        process_prompt(ctx_llava, image_embed, &params, params.prompt);

-        llama_print_timings(ctx_llava->ctx_llama);
+        llama_print_timings(ctx_llava->ctx_llama, nullptr, nullptr);
        llava_image_embed_free(image_embed);
        ctx_llava->model = NULL;
        llava_free(ctx_llava);
@ -327,7 +327,7 @@ int main(int argc, char ** argv) {
            // process the prompt
            process_prompt(ctx_llava, image_embed, &params, params.prompt);

-            llama_print_timings(ctx_llava->ctx_llama);
+            llama_print_timings(ctx_llava->ctx_llama, nullptr, nullptr);
            llava_image_embed_free(image_embed);
            ctx_llava->model = NULL;
            llava_free(ctx_llava);
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@ -118,7 +118,7 @@ int main(int argc, char ** argv) {
    llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);

    // target model sampling context
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, ctx, 0);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, llama_get_sampling(ctx));

    // verification n-grams
    std::vector<ngram_data> ngrams_cur(G);
@ -468,7 +468,7 @@ int main(int argc, char ** argv) {
    LOG_TEE("n_predict = %d\n", n_predict);
    LOG_TEE("n_accept  = %d\n", n_accept);

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, ctx_sampling->smpl, ctx_sampling->grammar);

    llama_kv_cache_view_free(&kvc_view);
    llama_sampling_free(ctx_sampling);
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@ -106,7 +106,7 @@ int main(int argc, char ** argv){

    bool has_eos = false;

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, ctx, 0);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, llama_get_sampling(ctx));

    std::vector<llama_token> draft;

@ -241,7 +241,7 @@ int main(int argc, char ** argv){
    LOG_TEE("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);

    LOG_TEE("\ntarget:\n");
-    llama_print_timings(ctx);
+    llama_print_timings(ctx, ctx_sampling->smpl, ctx_sampling->grammar);

    llama_sampling_free(ctx_sampling);
    llama_batch_free(batch_tgt);
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -33,6 +33,7 @@

 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
+static llama_sampling_context  ** g_ctx_sampling;
 static gpt_params               * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
@ -105,7 +106,7 @@ static void sigint_handler(int signo) {
        } else {
            console::cleanup();
            printf("\n");
-            llama_print_timings(*g_ctx);
+            llama_print_timings(*g_ctx, (*g_ctx_sampling)->smpl, (*g_ctx_sampling)->grammar);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
            _exit(130);
        }
@ -121,8 +122,7 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v

 static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
    llama_chat_msg new_msg{role, content};
-    auto formatted = llama_chat_format_single(
-        model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+    auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
    chat_msgs.push_back({role, content});
    return formatted;
 }
@ -197,12 +197,16 @@ int main(int argc, char ** argv) {
    llama_backend_init();
    llama_numa_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
-    llama_context * ctx_guidance = NULL;
+    llama_model * model = nullptr;
+    llama_context * ctx = nullptr;
+    llama_context * ctx_guidance = nullptr;
+    llama_sampling_context * ctx_sampling = nullptr;
+
    std::vector<llama_chat_msg> chat_msgs;
+
    g_model = &model;
    g_ctx = &ctx;
+    g_ctx_sampling = &ctx_sampling;

    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
@ -527,7 +531,7 @@ int main(int argc, char ** argv) {
        antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
    }

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams, ctx, 0);
+    ctx_sampling = llama_sampling_init(sparams, llama_get_sampling(ctx));
    if (!ctx_sampling) {
        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
        exit(1);
@ -975,7 +979,7 @@ int main(int argc, char ** argv) {
        llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
    }

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, ctx_sampling->smpl, ctx_sampling->grammar);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

    if (ctx_guidance) { llama_free(ctx_guidance); }
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@ -51,6 +51,7 @@ static std::vector<std::string> k_prompts = {
 struct client {
    ~client() {
        if (ctx_sampling) {
+            llama_sampling_free(ctx_sampling->smpl);
            llama_sampling_free(ctx_sampling);
        }
    }
@ -161,7 +162,7 @@ int main(int argc, char ** argv) {
    for (size_t i = 0; i < clients.size(); ++i) {
        auto & client = clients[i];
        client.id = i;
-        client.ctx_sampling = llama_sampling_init(params.sparams, ctx, i);
+        client.ctx_sampling = llama_sampling_init(params.sparams, llama_sampling_init(llama_n_vocab(model)));
    }

    std::vector<llama_token> tokens_system;
@ -371,7 +372,7 @@ int main(int argc, char ** argv) {
                    }

                    // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
-                    llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
+                    llama_kv_cache_seq_rm(ctx,    client.id + 1, -1, -1);
                    llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);

                    const auto t_main_end = ggml_time_us();
@ -413,7 +414,8 @@ int main(int argc, char ** argv) {

    LOG_TEE("\n");

-    llama_print_timings(ctx);
+    // TODO: print sampling/grammar timings for all clients
+    llama_print_timings(ctx, nullptr, nullptr);

    llama_batch_free(batch);

--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@ -80,12 +80,13 @@ int main(int argc, char ** argv) {
    GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
-
    if (ctx == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }

+    llama_sampling * smpl = llama_get_sampling(ctx);
+
    // tokenize the prompt
    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
@ -230,7 +231,7 @@ int main(int argc, char ** argv) {
            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

            // sample the most likely token
-            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+            const llama_token new_token_id = llama_sampling_sample_greedy(smpl, &candidates_p);

            // is it an end of generation?
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
@ -267,7 +268,7 @@ int main(int argc, char ** argv) {
    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr, nullptr);

    fprintf(stderr, "\n");

--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -2054,7 +2054,7 @@ int main(int argc, char ** argv) {
        results = perplexity(ctx, params, n_ctx);
    }

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr, nullptr);
    write_logfile(ctx, params, model, results);

    llama_free(ctx);
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@ -292,7 +292,7 @@ int main(int argc, char ** argv) {
    }

    // clean up
-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr, nullptr);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@ -37,6 +37,8 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    llama_sampling * smpl = llama_get_sampling(ctx);
+
    // tokenize prompt
    auto tokens = llama_tokenize(ctx, params.prompt, true);

@ -72,7 +74,7 @@ int main(int argc, char ** argv) {
            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
        }
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-        auto next_token = llama_sample_token(ctx, &candidates_p);
+        auto next_token = llama_sampling_sample(smpl, &candidates_p);
        auto next_token_str = llama_token_to_piece(ctx, next_token);

        printf("%s", next_token_str.c_str());
@ -95,6 +97,8 @@ int main(int argc, char ** argv) {
    // make new context
    auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));

+    llama_sampling * smpl2 = llama_get_sampling(ctx2);
+
    printf("\nsecond run: %s", params.prompt.c_str());

    // load state (rng, logits, embedding and kv_cache) from file
@ -128,7 +132,7 @@ int main(int argc, char ** argv) {
            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
        }
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-        auto next_token = llama_sample_token(ctx2, &candidates_p);
+        auto next_token = llama_sampling_sample(smpl2, &candidates_p);
        auto next_token_str = llama_token_to_piece(ctx2, next_token);

        printf("%s", next_token_str.c_str());
@ -153,7 +157,9 @@ int main(int argc, char ** argv) {
    }

    // make new context
-    auto* ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+    auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+
+    llama_sampling * smpl3 = llama_get_sampling(ctx3);

    printf("\nsingle seq run: %s", params.prompt.c_str());

@ -216,7 +222,7 @@ int main(int argc, char ** argv) {
            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
        }
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-        auto next_token = llama_sample_token(ctx3, &candidates_p);
+        auto next_token = llama_sampling_sample(smpl3, &candidates_p);
        auto next_token_str = llama_token_to_piece(ctx3, next_token);

        printf("%s", next_token_str.c_str());
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -664,6 +664,7 @@ struct server_context {
        // Clear any sampling context
        for (server_slot & slot : slots) {
            if (slot.ctx_sampling != nullptr) {
+                llama_sampling_free(slot.ctx_sampling->smpl);
                llama_sampling_free(slot.ctx_sampling);
            }
        }
@ -1088,9 +1089,11 @@ struct server_context {

        {
            if (slot.ctx_sampling != nullptr) {
+                llama_sampling_free(slot.ctx_sampling->smpl);
                llama_sampling_free(slot.ctx_sampling);
            }
-            slot.ctx_sampling = llama_sampling_init(slot.sparams, ctx, slot.id);
+
+            slot.ctx_sampling = llama_sampling_init(slot.sparams, llama_sampling_init(llama_n_vocab(model)));
            if (slot.ctx_sampling == nullptr) {
                // for now, the only error that may happen here is invalid grammar
                send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
@ -2402,7 +2405,7 @@ struct server_context {

                    // Make sure at least n_probs top tokens are at the front of the vector:
                    if (slot.sparams.temp == 0.0f && n_probs > n_valid) {
-                        llama_sample_top_k(ctx, &cur_p, n_probs, 0);
+                        llama_sampling_top_k(slot.ctx_sampling->smpl, &cur_p, n_probs, 0);
                    }

                    if (slot.sparams.temp == 0.0f) {
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@ -55,6 +55,8 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    llama_sampling * smpl = llama_get_sampling(ctx);
+
    // tokenize the prompt

    std::vector<llama_token> tokens_list;
@ -123,7 +125,7 @@ int main(int argc, char ** argv) {
            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

            // sample the most likely token
-            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+            const llama_token new_token_id = llama_sampling_sample_greedy(smpl, &candidates_p);

            // is it an end of generation?
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
@ -160,7 +162,7 @@ int main(int argc, char ** argv) {
    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    llama_print_timings(ctx);
+    llama_print_timings(ctx, nullptr, nullptr);

    fprintf(stderr, "\n");

--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@ -174,8 +174,8 @@ int main(int argc, char ** argv) {
    // used to determine end of generation
    bool has_eos = false;

-    // target model sampling context
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, ctx_tgt, 0);
+    // target model sampling context (reuse the llama_context's sampling instance)
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, llama_get_sampling(ctx_tgt));

    // draft sequence data
    std::vector<seq_draft> drafts(n_seq_dft);
@ -186,7 +186,8 @@ int main(int argc, char ** argv) {
    }

    for (int s = 0; s < n_seq_dft; ++s) {
-        drafts[s].ctx_sampling = llama_sampling_init(params.sparams, ctx_dft, s);
+        // allocate llama_sampling for each draft sequence
+        drafts[s].ctx_sampling = llama_sampling_init(params.sparams, llama_sampling_init(llama_n_vocab(model_dft)));
    }

    llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
@ -230,8 +231,10 @@ int main(int argc, char ** argv) {
                    // stochastic verification

                    llama_token_data_array dist_tgt = llama_sampling_prepare(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft], true, NULL);
-                    llama_sample_softmax(ctx_tgt, &dist_tgt);
-                    float p_tgt = 0, p_dft = 0;
+                    llama_sampling_softmax(ctx_sampling->smpl, &dist_tgt);
+
+                    float p_tgt = 0.0f;
+                    float p_dft = 0.0f;

                    // GGML_ASSERT(dist_tgt.size() == dist_dft.size());

@ -327,7 +330,7 @@ int main(int argc, char ** argv) {
                        // all drafted tokens were rejected
                        // sample from the target model
                        LOG("all drafted tokens were rejected, sampling from residual distribution\n");
-                        token_id = llama_sample_token(ctx_tgt, &dist_tgt);
+                        token_id = llama_sampling_sample(ctx_sampling->smpl, &dist_tgt);
                        llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
                        token_str = llama_token_to_piece(ctx_tgt, token_id);
                    }
@ -589,13 +592,15 @@ int main(int argc, char ** argv) {
    LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);

    LOG_TEE("\ndraft:\n");
-    llama_print_timings(ctx_dft);
+    // TODO: print sampling/grammar timings for all drafts
+    llama_print_timings(ctx_dft, nullptr, nullptr);

    LOG_TEE("\ntarget:\n");
-    llama_print_timings(ctx_tgt);
+    llama_print_timings(ctx_tgt, ctx_sampling->smpl, ctx_sampling->grammar);

    llama_sampling_free(ctx_sampling);
    for (int s = 0; s < n_seq_dft; ++s) {
+        llama_sampling_free(drafts[s].ctx_sampling->smpl);
        llama_sampling_free(drafts[s].ctx_sampling);
    }

--- a/include/llama.h
+++ b/include/llama.h
@ -40,7 +40,7 @@
 #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'

 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 8
+#define LLAMA_SESSION_VERSION 7

 #define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
 #define LLAMA_STATE_SEQ_VERSION 1
@ -394,16 +394,22 @@ extern "C" {
        uint32_t           value; // Unicode code point or rule ID
    } llama_grammar_element;

+    // sampling types
+    struct llama_sampling;
+
    // performance timing information
    struct llama_timings {
        double t_start_ms;
        double t_end_ms;
        double t_load_ms;
-        double t_sample_ms;
+        double t_sampling_ms;
+        double t_grammar_ms;
        double t_p_eval_ms;
        double t_eval_ms;

-        int32_t n_sample;
+        int32_t n_sampling;
+        int32_t n_grammar_sample;
+        int32_t n_grammar_accept;
        int32_t n_p_eval;
        int32_t n_eval;
    };
@ -454,7 +460,8 @@ extern "C" {
    LLAMA_API bool llama_supports_mlock      (void);
    LLAMA_API bool llama_supports_gpu_offload(void);

-    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
+    LLAMA_API const struct llama_model    * llama_get_model   (const struct llama_context * ctx);
+    LLAMA_API       struct llama_sampling * llama_get_sampling(      struct llama_context * ctx);

    LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
    LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
@ -1028,85 +1035,87 @@ extern "C" {
    // Sampling functions
    //

-    // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+    // TODO: args become llama_sampling_params
+    LLAMA_API struct llama_sampling * llama_sampling_init(int32_t n_vocab);

-    LLAMA_API DEPRECATED(void llama_set_rng_seed_seq(struct llama_context * ctx, uint32_t seed, llama_seq_id),
-        "temporary API, until llama_sampling_context is implemented, do not use");
+    LLAMA_API void llama_sampling_free(struct llama_sampling * smpl);
+
+    // Sets the current rng seed.
+    LLAMA_API void llama_sampling_set_rng_seed(struct llama_sampling * smpl, uint32_t seed);

    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_repetition_penalties(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-               const llama_token * last_tokens,
-                          size_t   penalty_last_n,
-                           float   penalty_repeat,
-                           float   penalty_freq,
-                           float   penalty_present);
+    LLAMA_API void llama_sampling_repetition_penalties(
+            struct llama_sampling * smpl,
+           llama_token_data_array * candidates,
+                const llama_token * last_tokens,
+                           size_t   penalty_last_n,
+                            float   penalty_repeat,
+                            float   penalty_freq,
+                            float   penalty_present);

    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
    /// @param logits Logits extracted from the original generation context.
    /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    LLAMA_API void llama_sample_apply_guidance(
-              struct llama_context * ctx,
-                             float * logits,
-                             float * logits_guidance,
-                             float   scale);
+    LLAMA_API void llama_sampling_apply_guidance(
+              struct llama_sampling * smpl,
+                              float * logits,
+                              float * logits_guidance,
+                              float   scale);

    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    LLAMA_API void llama_sample_softmax(
-            struct llama_context * ctx,
+    LLAMA_API void llama_sampling_softmax(
+            struct llama_sampling * smpl,
          llama_token_data_array * candidates);

    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                         int32_t   k,
-                          size_t   min_keep);
+    LLAMA_API void llama_sampling_top_k(
+            struct llama_sampling * smpl,
+           llama_token_data_array * candidates,
+                          int32_t   k,
+                           size_t   min_keep);

    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                           float   p,
-                          size_t   min_keep);
+    LLAMA_API void llama_sampling_top_p(
+            struct llama_sampling * smpl,
+           llama_token_data_array * candidates,
+                            float   p,
+                           size_t   min_keep);

    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
-    LLAMA_API void llama_sample_min_p(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                           float   p,
-                          size_t   min_keep);
+    LLAMA_API void llama_sampling_min_p(
+            struct llama_sampling * smpl,
+           llama_token_data_array * candidates,
+                            float   p,
+                           size_t   min_keep);

    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                           float   z,
-                          size_t   min_keep);
+    LLAMA_API void llama_sampling_tail_free(
+            struct llama_sampling * smpl,
+           llama_token_data_array * candidates,
+                            float   z,
+                           size_t   min_keep);

    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                           float   p,
-                          size_t   min_keep);
+    LLAMA_API void llama_sampling_typical(
+            struct llama_sampling * smpl,
+           llama_token_data_array * candidates,
+                            float   p,
+                           size_t   min_keep);

    /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
-    LLAMA_API void llama_sample_entropy(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates_p,
-                           float   min_temp,
-                           float   max_temp,
-                           float   exponent_val);
+    LLAMA_API void llama_sampling_entropy(
+            struct llama_sampling * smpl,
+           llama_token_data_array * candidates_p,
+                            float   min_temp,
+                            float   max_temp,
+                            float   exponent_val);

-    LLAMA_API void llama_sample_temp(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                           float   temp);
+    LLAMA_API void llama_sampling_temp(
+            struct llama_sampling * smpl,
+           llama_token_data_array * candidates,
+                            float   temp);

    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
@ -1114,43 +1123,36 @@ extern "C" {
    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                           float   tau,
-                           float   eta,
-                         int32_t   m,
-                           float * mu);
+    LLAMA_API llama_token llama_sampling_sample_mirostat(
+            struct llama_sampling * smpl,
+           llama_token_data_array * candidates,
+                            float   tau,
+                            float   eta,
+                          int32_t   m,
+                            float * mu);

    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat_v2(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                           float   tau,
-                           float   eta,
-                           float * mu);
+    LLAMA_API llama_token llama_sampling_sample_mirostat_v2(
+            struct llama_sampling * smpl,
+           llama_token_data_array * candidates,
+                            float   tau,
+                            float   eta,
+                            float * mu);

    /// @details Selects the token with the highest probability.
    ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
-    LLAMA_API llama_token llama_sample_token_greedy(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates);
+    LLAMA_API llama_token llama_sampling_sample_greedy(
+            struct llama_sampling * smpl,
+           llama_token_data_array * candidates);

-    /// @details Randomly selects a token from the candidates based on their probabilities using RNG[0] of ctx.
-    LLAMA_API llama_token llama_sample_token(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates);
-
-    /// @details Same as llama_sample_token, but uses a seqeuence-specific RNG[seq_id].
-    LLAMA_API DEPRECATED(llama_token llama_sample_token_seq(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-                    llama_seq_id   seq_id),
-        "temporary API, until llama_sampling_context is implemented, do not use");
+    /// @details Randomly selects a token from the candidates based on their probabilities using RNG[0] of smpl.
+    LLAMA_API llama_token llama_sampling_sample(
+            struct llama_sampling * smpl,
+           llama_token_data_array * candidates);

    //
    // Model split
@ -1169,8 +1171,8 @@ extern "C" {
    // Performance information
    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);

-    LLAMA_API void llama_print_timings(struct llama_context * ctx);
-    LLAMA_API void llama_reset_timings(struct llama_context * ctx);
+    LLAMA_API void llama_print_timings(struct llama_context * ctx, struct llama_sampling * smpl, struct llama_grammar * grammar);
+    LLAMA_API void llama_reset_timings(struct llama_context * ctx, struct llama_sampling * smpl, struct llama_grammar * grammar);

    // Print system information
    LLAMA_API const char * llama_print_system_info(void);
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@ -438,7 +438,7 @@ struct llama_grammar * llama_grammar_init_impl(
    // Important: vec_rules has to be moved here, not copied, because stacks contains
    // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
    // then the pointers would be invalidated when the local vec_rules goes out of scope.
-    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
+    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {}, 0, 0, 0 };
 }

 void llama_grammar_free_impl(struct llama_grammar * grammar) {
@ -446,7 +446,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
 }

 struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar & grammar) {
-    llama_grammar * result = new llama_grammar{ grammar.rules, grammar.stacks, grammar.partial_utf8 };
+    llama_grammar * result = new llama_grammar{ grammar.rules, grammar.stacks, grammar.partial_utf8, 0, 0, 0 };

    // redirect elements in stacks to point to new rules
    for (size_t is = 0; is < result->stacks.size(); is++) {
--- a/src/llama-grammar.h
+++ b/src/llama-grammar.h
@ -11,6 +11,11 @@ struct llama_grammar {

    // buffer for partially generated UTF-8 sequence from accepted tokens
    llama_partial_utf8 partial_utf8;
+
+    mutable int64_t t_total_us;
+
+    mutable int32_t n_sample;
+    mutable int32_t n_accept;
 };

 struct llama_grammar * llama_get_grammar(struct llama_context * ctx);
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@ -21,7 +21,15 @@ static void llama_log_softmax(float * array, size_t size) {
    }
 }

-void llama_set_rng_seed_impl(struct llama_sampling & smpl, uint32_t seed) {
+struct llama_sampling * llama_sampling_init_impl(int32_t n_vocab) {
+    return new llama_sampling(n_vocab);
+}
+
+void llama_sampling_free_impl(struct llama_sampling * sampling) {
+    delete sampling;
+}
+
+void llama_sampling_set_rng_seed_impl(struct llama_sampling & smpl, uint32_t seed) {
    if (seed == LLAMA_DEFAULT_SEED) {
        seed = time(NULL);
    }
@ -29,7 +37,7 @@ void llama_set_rng_seed_impl(struct llama_sampling & smpl, uint32_t seed) {
    smpl.rng.seed(seed);
 }

-void llama_sample_softmax_impl(struct llama_sampling & /*smpl*/, llama_token_data_array * candidates) {
+void llama_sampling_softmax_impl(struct llama_sampling & /*smpl*/, llama_token_data_array * candidates) {
    GGML_ASSERT(candidates->size > 0);

    // Sort the logits in descending order
@ -54,7 +62,7 @@ void llama_sample_softmax_impl(struct llama_sampling & /*smpl*/, llama_token_dat
    }
 }

-void llama_sample_top_k_impl(struct llama_sampling & /*smpl*/, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
+void llama_sampling_top_k_impl(struct llama_sampling & /*smpl*/, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
    // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
    // if (k >= (int32_t)candidates->size) {
    //     return;
@ -129,12 +137,12 @@ void llama_sample_top_k_impl(struct llama_sampling & /*smpl*/, llama_token_data_
    candidates->size = k;
 }

-void llama_sample_top_p_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
+void llama_sampling_top_p_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
    if (p >= 1.0f) {
        return;
    }

-    llama_sample_softmax_impl(smpl, candidates);
+    llama_sampling_softmax_impl(smpl, candidates);

    // Compute the cumulative probabilities
    float cum_sum = 0.0f;
@ -155,7 +163,7 @@ void llama_sample_top_p_impl(struct llama_sampling & smpl, llama_token_data_arra
    candidates->size = last_idx;
 }

-void llama_sample_min_p_impl(struct llama_sampling & /*smpl*/, llama_token_data_array * candidates, float p, size_t min_keep) {
+void llama_sampling_min_p_impl(struct llama_sampling & /*smpl*/, llama_token_data_array * candidates, float p, size_t min_keep) {
    if (p <= 0.0f || !candidates->size) {
        return;
    }
@ -210,12 +218,12 @@ void llama_sample_min_p_impl(struct llama_sampling & /*smpl*/, llama_token_data_
    }
 }

-void llama_sample_tail_free_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, float z, size_t min_keep) {
+void llama_sampling_tail_free_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, float z, size_t min_keep) {
    if (z >= 1.0f || candidates->size <= 2) {
        return;
    }

-    llama_sample_softmax_impl(smpl, candidates);
+    llama_sampling_softmax_impl(smpl, candidates);

    // Compute the first and second derivatives
    std::vector<float> first_derivatives(candidates->size - 1);
@ -264,7 +272,7 @@ void llama_sample_tail_free_impl(struct llama_sampling & smpl, llama_token_data_
    candidates->size = last_idx;
 }

-void llama_sample_typical_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
+void llama_sampling_typical_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
    // Reference implementation:
    // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
    if (p >= 1.0f) {
@ -272,7 +280,7 @@ void llama_sample_typical_impl(struct llama_sampling & smpl, llama_token_data_ar
    }

    // Compute the softmax of logits and calculate entropy
-    llama_sample_softmax_impl(smpl, candidates);
+    llama_sampling_softmax_impl(smpl, candidates);

    float entropy = 0.0f;
    for (size_t i = 0; i < candidates->size; ++i) {
@ -322,7 +330,7 @@ void llama_sample_typical_impl(struct llama_sampling & smpl, llama_token_data_ar
    candidates->sorted = false;
 }

-void llama_sample_entropy_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val) {
+void llama_sampling_entropy_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val) {
    // no need to do anything if there is only one (or zero) candidates
    if(candidates->size <= 1) {
        return;
@ -331,7 +339,7 @@ void llama_sample_entropy_impl(struct llama_sampling & smpl, llama_token_data_ar
    // Calculate maximum possible entropy
    float max_entropy = -logf(1.0f / candidates->size);

-    llama_sample_softmax_impl(smpl, candidates);
+    llama_sampling_softmax_impl(smpl, candidates);

    // Calculate entropy of the softmax probabilities
    float entropy = 0.0f;
@ -383,13 +391,13 @@ void llama_sample_entropy_impl(struct llama_sampling & smpl, llama_token_data_ar
 #endif
 }

-void llama_sample_temp_impl(struct llama_sampling & /*smpl*/, llama_token_data_array * candidates, float temp) {
+void llama_sampling_temp_impl(struct llama_sampling & /*smpl*/, llama_token_data_array * candidates, float temp) {
    for (size_t i = 0; i < candidates->size; ++i) {
        candidates->data[i].logit /= temp;
    }
 }

-void llama_sample_repetition_penalties_impl(
+void llama_sampling_repetition_penalties_impl(
        struct llama_sampling & /*smpl*/,
       llama_token_data_array * candidates,
            const llama_token * last_tokens,
@ -430,7 +438,7 @@ void llama_sample_repetition_penalties_impl(
    candidates->sorted = false;
 }

-void llama_sample_apply_guidance_impl(
+void llama_sampling_apply_guidance_impl(
        struct llama_sampling & smpl,
                        float * logits,
                        float * logits_guidance,
@ -448,10 +456,10 @@ void llama_sample_apply_guidance_impl(
    }
 }

-llama_token llama_sample_token_mirostat_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
+llama_token llama_sampling_sample_mirostat_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
    const int32_t n_vocab = float(smpl.n_vocab);

-    llama_sample_softmax_impl(smpl, candidates);
+    llama_sampling_softmax_impl(smpl, candidates);

    // Estimate s_hat using the most probable m tokens
    float s_hat = 0.0;
@ -470,8 +478,8 @@ llama_token llama_sample_token_mirostat_impl(struct llama_sampling & smpl, llama
    float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(n_vocab, -epsilon_hat)), 1 / s_hat);

    // Sample the next word X using top-k sampling
-    llama_sample_top_k_impl(smpl, candidates, int(k), 1);
-    llama_token X = llama_sample_token_impl(smpl, candidates);
+    llama_sampling_top_k_impl(smpl, candidates, int(k), 1);
+    llama_token X = llama_sampling_sample_impl(smpl, candidates);

    // Compute error as the difference between observed surprise and target surprise value
    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
@ -486,8 +494,8 @@ llama_token llama_sample_token_mirostat_impl(struct llama_sampling & smpl, llama
    return X;
 }

-llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, float tau, float eta, float * mu) {
-    llama_sample_softmax_impl(smpl, candidates);
+llama_token llama_sampling_sample_mirostat_v2_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, float tau, float eta, float * mu) {
+    llama_sampling_softmax_impl(smpl, candidates);

    // Truncate the words with surprise values greater than mu
    candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
@ -499,10 +507,10 @@ llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling & smpl, ll
    }

    // Normalize the probabilities of the remaining words
-    llama_sample_softmax_impl(smpl, candidates);
+    llama_sampling_softmax_impl(smpl, candidates);

    // Sample the next word X from the remaining words
-    llama_token X = llama_sample_token_impl(smpl, candidates);
+    llama_token X = llama_sampling_sample_impl(smpl, candidates);

    // Compute error as the difference between observed surprise and target surprise value
    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
@ -517,7 +525,7 @@ llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling & smpl, ll
    return X;
 }

-llama_token llama_sample_token_greedy_impl(struct llama_sampling & /*smpl*/, llama_token_data_array * candidates) {
+llama_token llama_sampling_sample_greedy_impl(struct llama_sampling & /*smpl*/, llama_token_data_array * candidates) {
    // Find max element
    auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
        return a.logit < b.logit;
@ -528,8 +536,8 @@ llama_token llama_sample_token_greedy_impl(struct llama_sampling & /*smpl*/, lla
    return result;
 }

-llama_token llama_sample_token_with_rng_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, std::mt19937 & rng) {
-    llama_sample_softmax_impl(smpl, candidates);
+llama_token llama_sampling_sample_with_rng_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, std::mt19937 & rng) {
+    llama_sampling_softmax_impl(smpl, candidates);

    std::vector<float> probs;
    probs.reserve(candidates->size);
@ -545,6 +553,6 @@ llama_token llama_sample_token_with_rng_impl(struct llama_sampling & smpl, llama
    return result;
 }

-llama_token llama_sample_token_impl(struct llama_sampling & smpl, llama_token_data_array * candidates) {
-    return llama_sample_token_with_rng_impl(smpl, candidates, smpl.rng);
+llama_token llama_sampling_sample_impl(struct llama_sampling & smpl, llama_token_data_array * candidates) {
+    return llama_sampling_sample_with_rng_impl(smpl, candidates, smpl.rng);
 }
--- a/src/llama-sampling.h
+++ b/src/llama-sampling.h
@ -3,29 +3,37 @@
 #include "llama-impl.h"

 struct llama_sampling {
-    llama_sampling(uint32_t seed, int32_t n_vocab) : rng(seed), n_vocab(n_vocab) {}
+    llama_sampling(int32_t n_vocab) : n_vocab(n_vocab) {}
+
+    const int32_t n_vocab;

    std::mt19937 rng;

-    const int32_t n_vocab;
+    mutable int64_t t_total_us = 0;
+
+    mutable int32_t n_sample = 0;
 };

 //
 // internal API
 //

-void llama_set_rng_seed_impl(struct llama_sampling & smpl, uint32_t seed);
+struct llama_sampling * llama_sampling_init_impl(int32_t n_vocab);

-void llama_sample_softmax_impl  (struct llama_sampling & smpl, llama_token_data_array * candidates);
-void llama_sample_top_k_impl    (struct llama_sampling & smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep);
-void llama_sample_top_p_impl    (struct llama_sampling & smpl, llama_token_data_array * candidates, float p, size_t min_keep);
-void llama_sample_min_p_impl    (struct llama_sampling & smpl, llama_token_data_array * candidates, float p, size_t min_keep);
-void llama_sample_tail_free_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, float z, size_t min_keep);
-void llama_sample_typical_impl  (struct llama_sampling & smpl, llama_token_data_array * candidates, float p, size_t min_keep);
-void llama_sample_entropy_impl  (struct llama_sampling & smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val);
-void llama_sample_temp_impl     (struct llama_sampling & smpl, llama_token_data_array * candidates, float temp);
+void llama_sampling_free_impl(struct llama_sampling * sampling);

-void llama_sample_repetition_penalties_impl(
+void llama_sampling_set_rng_seed_impl(struct llama_sampling & smpl, uint32_t seed);
+
+void llama_sampling_softmax_impl  (struct llama_sampling & smpl, llama_token_data_array * candidates);
+void llama_sampling_top_k_impl    (struct llama_sampling & smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep);
+void llama_sampling_top_p_impl    (struct llama_sampling & smpl, llama_token_data_array * candidates, float p, size_t min_keep);
+void llama_sampling_min_p_impl    (struct llama_sampling & smpl, llama_token_data_array * candidates, float p, size_t min_keep);
+void llama_sampling_tail_free_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, float z, size_t min_keep);
+void llama_sampling_typical_impl  (struct llama_sampling & smpl, llama_token_data_array * candidates, float p, size_t min_keep);
+void llama_sampling_entropy_impl  (struct llama_sampling & smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val);
+void llama_sampling_temp_impl     (struct llama_sampling & smpl, llama_token_data_array * candidates, float temp);
+
+void llama_sampling_repetition_penalties_impl(
        struct llama_sampling & smpl,
       llama_token_data_array * candidates,
            const llama_token * last_tokens,
@ -34,15 +42,15 @@ void llama_sample_repetition_penalties_impl(
                        float   penalty_freq,
                        float   penalty_present);

-void llama_sample_apply_guidance_impl(
+void llama_sampling_apply_guidance_impl(
        struct llama_sampling & smpl,
                        float * logits,
                        float * logits_guidance,
                        float   scale);

-llama_token llama_sample_token_mirostat_impl   (struct llama_sampling & smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu);
-llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, float tau, float eta, float * mu);
-llama_token llama_sample_token_greedy_impl     (struct llama_sampling & smpl, llama_token_data_array * candidates);
-llama_token llama_sample_token_with_rng_impl   (struct llama_sampling & smpl, llama_token_data_array * candidates, std::mt19937 & rng);
-llama_token llama_sample_token_impl            (struct llama_sampling & smpl, llama_token_data_array * candidates);
+llama_token llama_sampling_sample_mirostat_impl   (struct llama_sampling & smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu);
+llama_token llama_sampling_sample_mirostat_v2_impl(struct llama_sampling & smpl, llama_token_data_array * candidates, float tau, float eta, float * mu);
+llama_token llama_sampling_sample_greedy_impl     (struct llama_sampling & smpl, llama_token_data_array * candidates);
+llama_token llama_sampling_sample_with_rng_impl   (struct llama_sampling & smpl, llama_token_data_array * candidates, std::mt19937 & rng);
+llama_token llama_sampling_sample_impl            (struct llama_sampling & smpl, llama_token_data_array * candidates);

--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -2669,6 +2669,7 @@ struct llama_model {
 struct llama_context {
    llama_context(const llama_model & model)
        : model(model)
+        , sampling(llama_n_vocab(&model))
        , grammar()
        , t_start_us(model.t_start_us)
        , t_load_us(model.t_load_us) {}
@ -2686,12 +2687,11 @@ struct llama_context {
    const struct llama_model & model;

    struct llama_cparams        cparams;
+    struct llama_sampling       sampling;
    struct llama_grammar        grammar;
    struct llama_kv_cache       kv_self;
    struct llama_control_vector cvec;

-    std::vector<struct llama_sampling> sampling; // sampling context for each sequence
-
    std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;

    std::vector<ggml_backend_t> backends;
@ -2707,14 +2707,12 @@ struct llama_context {

    mutable int64_t t_start_us;
    mutable int64_t t_load_us;
-    mutable int64_t t_sample_us = 0;
    mutable int64_t t_p_eval_us = 0;
    mutable int64_t t_eval_us   = 0;

    mutable int64_t t_compute_start_us = 0;
    mutable int64_t n_queued_tokens = 0;

-    mutable int32_t n_sample = 0;
    mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
    mutable int32_t n_eval   = 0; // number of eval calls

@ -16542,10 +16540,7 @@ struct llama_context * llama_new_context_with_model(
    ctx->abort_callback      = params.abort_callback;
    ctx->abort_callback_data = params.abort_callback_data;

-    ctx->sampling.reserve(cparams.n_seq_max);
-    for (uint32_t i = 0; i < cparams.n_seq_max; ++i) {
-        ctx->sampling.emplace_back(params.seed, llama_n_vocab(model));
-    }
+    llama_sampling_set_rng_seed_impl(ctx->sampling, params.seed);

    ctx->logits_all = params.logits_all;

@ -16827,6 +16822,10 @@ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
    return &ctx->model;
 }

+struct llama_sampling * llama_get_sampling(struct llama_context * ctx) {
+    return &ctx->sampling;
+}
+
 const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) {
    return &ctx->model.vocab;
 }
@ -17322,7 +17321,6 @@ size_t llama_state_get_size(const struct llama_context * ctx) {

    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
    // for reference, std::mt19937(1337) serializes to 6701 bytes.
-    const size_t s_n_rng           = sizeof(uint32_t);
    const size_t s_rng_size        = sizeof(size_t);
    const size_t s_rng             = LLAMA_MAX_RNG_STATE;
    const size_t s_n_outputs       = sizeof(size_t);
@ -17342,8 +17340,8 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
    const size_t s_kv_cells        = ctx->kv_self.size * s_kv_cell;

    const size_t s_total = (
-        + s_n_rng
-        + cparams.n_seq_max*(s_rng_size + s_rng)
+        + s_rng_size
+        + s_rng
        + s_n_outputs
        + s_output_pos
        + s_logits_size
@ -17360,7 +17358,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
    );

    // on session change it is very likely that the state size has changed - so we need to update this function
-    static_assert(LLAMA_SESSION_VERSION == 8, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
+    static_assert(LLAMA_SESSION_VERSION == 7, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");

    return s_total;
 }
@ -17423,22 +17421,16 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data

    // copy rngs
    {
-        const uint32_t n_rng = ctx->sampling.size();
+        std::ostringstream rng_ss;
+        rng_ss << ctx->sampling.rng;

-        data_ctx->write(&n_rng, sizeof(n_rng));
+        const std::string & rng_str  = rng_ss.str();
+        const size_t        rng_size = rng_str.size();

-        for (const auto & smpl : ctx->sampling) {
-            std::ostringstream rng_ss;
-            rng_ss << smpl.rng;
+        GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);

-            const std::string & rng_str  = rng_ss.str();
-            const size_t        rng_size = rng_str.size();
-
-            GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
-
-            data_ctx->write(&rng_size,      sizeof(rng_size));
-            data_ctx->write(rng_str.data(), rng_size);
-        }
+        data_ctx->write(&rng_size,      sizeof(rng_size));
+        data_ctx->write(rng_str.data(), rng_size);
    }

    // copy outputs
@ -17588,24 +17580,17 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {

    // set rngs
    {
-        uint32_t n_rng;
-        memcpy(&n_rng, inp, sizeof(n_rng)); inp += sizeof(n_rng);
+        size_t rng_size;
+        memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);

-        GGML_ASSERT(n_rng == ctx->cparams.n_seq_max);
+        GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);

-        for (auto & smpl : ctx->sampling) {
-            size_t rng_size;
-            memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
+        std::string rng_str((const char *)inp, rng_size); inp += rng_size;

-            GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
+        std::istringstream rng_ss(rng_str);
+        rng_ss >> ctx->sampling.rng;

-            std::string rng_str((const char *)inp, rng_size); inp += rng_size;
-
-            std::istringstream rng_ss(rng_str);
-            rng_ss >> smpl.rng;
-
-            GGML_ASSERT(!rng_ss.fail());
-        }
+        GGML_ASSERT(!rng_ss.fail());
    }

    // set output ids
@ -18978,11 +18963,14 @@ void llama_grammar_sample(
      const struct llama_grammar * grammar,
      const struct llama_context * ctx,
          llama_token_data_array * candidates) {
-    time_meas tm(ctx->t_sample_us); // TODO: measure grammar time separately from sampling
+    time_meas tm(grammar->t_total_us); // TODO: measure grammar time separately from sampling

    llama_grammar_sample_impl(*grammar, ctx->model.vocab, candidates);
+
+    grammar->n_sample++;
 }

+// deprecated
 void llama_sample_grammar(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
@ -18994,140 +18982,148 @@ void llama_grammar_accept_token(
            struct llama_grammar * grammar,
            struct llama_context * ctx,
                     llama_token   token) {
-    time_meas tm(ctx->t_sample_us); // TODO: measure grammar time separately from sampling
+    time_meas tm(grammar->t_total_us); // TODO: measure grammar time separately from sampling

    llama_grammar_accept_token_impl(*grammar, ctx->model.vocab, token);
+
+    grammar->n_accept++;
 }

 //
 // sampling
 //

-void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
-    llama_set_rng_seed_impl(ctx->sampling[0], seed);
+struct llama_sampling * llama_sampling_init(int32_t n_vocab) {
+    return llama_sampling_init_impl(n_vocab);
 }

-void llama_set_rng_seed_seq(struct llama_context * ctx, uint32_t seed, llama_seq_id seq_id) {
-    llama_set_rng_seed_impl(ctx->sampling[seq_id], seed);
+void llama_sampling_free(struct llama_sampling * smpl) {
+    if (smpl == nullptr) {
+        return;
+    }
+
+    llama_sampling_free_impl(smpl);
 }

-void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
-    time_meas tm(ctx->t_sample_us);
-
-    llama_sample_softmax_impl(ctx->sampling[0], candidates);
+void llama_sampling_set_rng_seed(struct llama_sampling * smpl, uint32_t seed) {
+    llama_sampling_set_rng_seed_impl(*smpl, seed);
 }

-void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
-    time_meas tm(ctx->t_sample_us);
+void llama_sampling_softmax(struct llama_sampling * smpl, llama_token_data_array * candidates) {
+    time_meas tm(smpl->t_total_us);

-    llama_sample_top_k_impl(ctx->sampling[0], candidates, k, min_keep);
+    llama_sampling_softmax_impl(*smpl, candidates);
 }

-void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    time_meas tm(ctx->t_sample_us);
+void llama_sampling_top_k(struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
+    time_meas tm(smpl->t_total_us);

-    llama_sample_top_p_impl(ctx->sampling[0], candidates, p, min_keep);
+    llama_sampling_top_k_impl(*smpl, candidates, k, min_keep);
 }

-void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    time_meas tm(ctx->t_sample_us);
+void llama_sampling_top_p(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
+    time_meas tm(smpl->t_total_us);

-    llama_sample_min_p_impl(ctx->sampling[0], candidates, p, min_keep);
+    llama_sampling_top_p_impl(*smpl, candidates, p, min_keep);
 }

-void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
-    time_meas tm(ctx->t_sample_us);
+void llama_sampling_min_p(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
+    time_meas tm(smpl->t_total_us);

-    llama_sample_tail_free_impl(ctx->sampling[0], candidates, z, min_keep);
+    llama_sampling_min_p_impl(*smpl, candidates, p, min_keep);
 }

-void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    time_meas tm(ctx->t_sample_us);
+void llama_sampling_tail_free(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep) {
+    time_meas tm(smpl->t_total_us);

-    llama_sample_typical_impl(ctx->sampling[0], candidates, p, min_keep);
+    llama_sampling_tail_free_impl(*smpl, candidates, z, min_keep);
 }

-void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
-    time_meas tm(ctx->t_sample_us);
+void llama_sampling_typical(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
+    time_meas tm(smpl->t_total_us);

-    llama_sample_entropy_impl(ctx->sampling[0], candidates_p, min_temp, max_temp, exponent_val);
+    llama_sampling_typical_impl(*smpl, candidates, p, min_keep);
 }

-void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
-    time_meas tm(ctx->t_sample_us);
+void llama_sampling_entropy(struct llama_sampling * smpl, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
+    time_meas tm(smpl->t_total_us);

-    llama_sample_temp_impl(ctx->sampling[0], candidates_p, temp);
+    llama_sampling_entropy_impl(*smpl, candidates_p, min_temp, max_temp, exponent_val);
 }

-void llama_sample_repetition_penalties(
-            struct llama_context * ctx,
+void llama_sampling_temp(struct llama_sampling * smpl, llama_token_data_array * candidates_p, float temp) {
+    time_meas tm(smpl->t_total_us);
+
+    llama_sampling_temp_impl(*smpl, candidates_p, temp);
+}
+
+void llama_sampling_repetition_penalties(
+            struct llama_sampling * smpl,
          llama_token_data_array * candidates,
               const llama_token * last_tokens,
                          size_t   penalty_last_n,
                           float   penalty_repeat,
                           float   penalty_freq,
                           float   penalty_present) {
-    time_meas tm(ctx->t_sample_us);
+    time_meas tm(smpl->t_total_us);

-    llama_sample_repetition_penalties_impl(ctx->sampling[0], candidates, last_tokens, penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
+    llama_sampling_repetition_penalties_impl(*smpl, candidates, last_tokens, penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
 }

-void llama_sample_apply_guidance(
-          struct llama_context * ctx,
+void llama_sampling_apply_guidance(
+          struct llama_sampling * smpl,
                         float * logits,
                         float * logits_guidance,
                         float   scale) {
-    time_meas tm(ctx->t_sample_us);
+    time_meas tm(smpl->t_total_us);

-    llama_sample_apply_guidance_impl(ctx->sampling[0], logits, logits_guidance, scale);
+    llama_sampling_apply_guidance_impl(*smpl, logits, logits_guidance, scale);
 }

-llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
-    time_meas tm(ctx->t_sample_us);
+llama_token llama_sampling_sample_mirostat(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
+    time_meas tm(smpl->t_total_us);

-    auto res = llama_sample_token_mirostat_impl(ctx->sampling[0], candidates, tau, eta, m, mu);
+    auto res = llama_sampling_sample_mirostat_impl(*smpl, candidates, tau, eta, m, mu);

-    ctx->n_sample++;
+    smpl->n_sample++;

    return res;
 }

-llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
-    time_meas tm(ctx->t_sample_us);
+llama_token llama_sampling_sample_mirostat_v2(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu) {
+    time_meas tm(smpl->t_total_us);

-    auto res = llama_sample_token_mirostat_v2_impl(ctx->sampling[0], candidates, tau, eta, mu);
+    auto res = llama_sampling_sample_mirostat_v2_impl(*smpl, candidates, tau, eta, mu);

-    ctx->n_sample++;
+    smpl->n_sample++;

    return res;
 }

-llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
-    time_meas tm(ctx->t_sample_us);
+llama_token llama_sampling_sample_greedy(struct llama_sampling * smpl, llama_token_data_array * candidates) {
+    time_meas tm(smpl->t_total_us);

-    auto res = llama_sample_token_greedy_impl(ctx->sampling[0], candidates);
+    auto res = llama_sampling_sample_greedy_impl(*smpl, candidates);

-    ctx->n_sample++;
+    smpl->n_sample++;

    return res;
 }

-llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
-    return llama_sample_token_seq(ctx, candidates, 0);
-}
+llama_token llama_sampling_sample(struct llama_sampling * smpl, llama_token_data_array * candidates) {
+    time_meas tm(smpl->t_total_us);

-llama_token llama_sample_token_seq(struct llama_context * ctx, llama_token_data_array * candidates, llama_seq_id seq_id) {
-    GGML_ASSERT(seq_id >= 0 && seq_id < (int32_t) ctx->cparams.n_seq_max);
+    auto res = llama_sampling_sample_impl(*smpl, candidates);

-    time_meas tm(ctx->t_sample_us);
-
-    auto res = llama_sample_token_impl(ctx->sampling[seq_id], candidates);
-
-    ctx->n_sample++;
+    smpl->n_sample++;

    return res;
 }

+//
+// model split
+//
+
 int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
    if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
@ -19152,30 +19148,29 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
    return 0;
 }

-struct llama_timings llama_get_timings(struct llama_context * ctx) {
-    struct llama_timings result = {
-        /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
-        /*.t_end_ms    =*/ 1.00 * ggml_time_ms(),
-        /*.t_load_ms   =*/ 1e-3 * ctx->t_load_us,
-        /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
-        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
-        /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,
+void llama_print_timings(struct llama_context * ctx, struct llama_sampling * smpl, struct llama_grammar * grammar) {
+    const llama_timings timings = {
+        /*.t_start_ms    =*/ 1e-3 * ctx->t_start_us,
+        /*.t_end_ms      =*/ 1.00 * ggml_time_ms(),
+        /*.t_load_ms     =*/ 1e-3 * ctx->t_load_us,
+        /*.t_sampling_ms =*/ 1e-3 * (smpl ? smpl->t_total_us : ctx->sampling.t_total_us),
+        /*.t_grammar_ms  =*/ grammar ? (1e-3 * grammar->t_total_us) : 0.0,
+        /*.t_p_eval_ms   =*/ 1e-3 * ctx->t_p_eval_us,
+        /*.t_eval_ms     =*/ 1e-3 * ctx->t_eval_us,

-        /*.n_sample =*/ std::max(1, ctx->n_sample),
-        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
-        /*.n_eval   =*/ std::max(1, ctx->n_eval),
+        /*.n_sampling       =*/ std::max(0, smpl ? smpl->n_sample : ctx->sampling.n_sample),
+        /*.n_grammar_sample =*/ std::max(0, grammar ? grammar->n_sample : 0),
+        /*.n_grammar_accept =*/ std::max(0, grammar ? grammar->n_accept : 0),
+        /*.n_p_eval         =*/ std::max(0, ctx->n_p_eval),
+        /*.n_eval           =*/ std::max(1, ctx->n_eval),
    };

-    return result;
-}
-
-void llama_print_timings(struct llama_context * ctx) {
-    const llama_timings timings = llama_get_timings(ctx);
-
    LLAMA_LOG_INFO("\n");
    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
-    LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
+    LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, timings.t_sampling_ms, timings.n_sampling, timings.t_sampling_ms / timings.n_sampling, 1e3 / timings.t_sampling_ms * timings.n_sampling);
+    LLAMA_LOG_INFO("%s:     grammar time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, timings.t_grammar_ms, timings.n_grammar_sample, timings.t_grammar_ms / timings.n_grammar_sample, 1e3 / timings.t_grammar_ms * timings.n_grammar_sample);
    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
@ -19183,11 +19178,18 @@ void llama_print_timings(struct llama_context * ctx) {
    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
 }

-void llama_reset_timings(struct llama_context * ctx) {
+void llama_reset_timings(struct llama_context * ctx, struct llama_sampling * smpl, struct llama_grammar * grammar) {
    ctx->t_start_us  = ggml_time_us();
    ctx->t_eval_us   = ctx->n_eval   = 0;
-    ctx->t_sample_us = ctx->n_sample = 0;
    ctx->t_p_eval_us = ctx->n_p_eval = 0;
+
+    if (smpl) {
+        smpl->t_total_us = smpl->n_sample = 0;
+    }
+
+    if (grammar) {
+        grammar->t_total_us = grammar->n_sample = grammar->n_accept = 0;
+    }
 }

 const char * llama_print_system_info(void) {
@ -19233,21 +19235,15 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
            1.0e-3 * ctx->t_eval_us / ctx->n_eval);
    fprintf(stream, "mst_p_eval: %.2f  # ms / token during prompt processing\n",
            1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
-    fprintf(stream, "mst_sample: %.2f  # ms / token during sampling\n",
-            1.0e-3 * ctx->t_sample_us / ctx->n_sample);
    fprintf(stream, "n_eval: %d  # number of tokens generated (excluding the first one)\n", ctx->n_eval);
    fprintf(stream, "n_p_eval: %d  # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
-    fprintf(stream, "n_sample: %d  # number of sampled tokens\n", ctx->n_sample);
    fprintf(stream, "t_eval_us: %" PRId64 "  # total microseconds spent generating tokens\n", ctx->t_eval_us);
    fprintf(stream, "t_load_us: %" PRId64 "  # total microseconds spent loading the model\n", ctx->t_load_us);
    fprintf(stream, "t_p_eval_us: %" PRId64 "  # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
-    fprintf(stream, "t_sample_us: %" PRId64 "  # total microseconds spent sampling\n", ctx->t_sample_us);
    fprintf(stream, "ts_eval: %.2f  # tokens / second during generation\n",
            1.0e6 * ctx->n_eval / ctx->t_eval_us);
    fprintf(stream, "ts_p_eval: %.2f  # tokens / second during prompt processing\n",
            1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
-    fprintf(stream, "ts_sample: %.2f  # tokens / second during sampling\n",
-            1.0e6 * ctx->n_sample / ctx->t_sample_us);
 }

 // For internal test use
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@ -1,5 +1,5 @@
 #include "ggml.h"
-#include "llama-sampling.h"
+#include "llama.h"

 #ifdef NDEBUG
 #undef NDEBUG
@ -20,7 +20,7 @@ static void dump(const llama_token_data_array * candidates) {

 static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
    const size_t n_vocab = probs.size();
-    llama_sampling smpl(1234, n_vocab);
+    llama_sampling * smpl = llama_sampling_init(n_vocab);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
@ -30,9 +30,9 @@ static void test_top_k(const std::vector<float> & probs, const std::vector<float
    }

    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    llama_sample_softmax_impl(smpl, &candidates_p);
+    llama_sampling_softmax(smpl, &candidates_p);
    DUMP(&candidates_p);
-    llama_sample_top_k_impl(smpl, &candidates_p, k, 1);
+    llama_sampling_top_k(smpl, &candidates_p, k, 1);
    DUMP(&candidates_p);

    GGML_ASSERT(candidates_p.size == expected_probs.size());
@ -43,7 +43,7 @@ static void test_top_k(const std::vector<float> & probs, const std::vector<float

 static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
    const size_t n_vocab = probs.size();
-    llama_sampling smpl(1234, n_vocab);
+    llama_sampling * smpl = llama_sampling_init(n_vocab);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
@ -53,9 +53,9 @@ static void test_top_p(const std::vector<float> & probs, const std::vector<float
    }

    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    llama_sample_softmax_impl(smpl, &candidates_p);
+    llama_sampling_softmax(smpl, &candidates_p);
    DUMP(&candidates_p);
-    llama_sample_top_p_impl(smpl, &candidates_p, p, 1);
+    llama_sampling_top_p(smpl, &candidates_p, p, 1);
    DUMP(&candidates_p);

    GGML_ASSERT(candidates_p.size == expected_probs.size());
@ -66,7 +66,7 @@ static void test_top_p(const std::vector<float> & probs, const std::vector<float

 static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
    const size_t n_vocab = probs.size();
-    llama_sampling smpl(1234, n_vocab);
+    llama_sampling * smpl = llama_sampling_init(n_vocab);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
@ -77,7 +77,7 @@ static void test_tfs(const std::vector<float> & probs, const std::vector<float>

    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
    DUMP(&candidates_p);
-    llama_sample_tail_free_impl(smpl, &candidates_p, z, 1);
+    llama_sampling_tail_free(smpl, &candidates_p, z, 1);
    DUMP(&candidates_p);

    GGML_ASSERT(candidates_p.size == expected_probs.size());
@ -88,7 +88,7 @@ static void test_tfs(const std::vector<float> & probs, const std::vector<float>

 static void test_min_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
    const size_t n_vocab = probs.size();
-    llama_sampling smpl(1234, n_vocab);
+    llama_sampling * smpl = llama_sampling_init(n_vocab);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
@ -99,9 +99,9 @@ static void test_min_p(const std::vector<float> & probs, const std::vector<float

    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
    DUMP(&candidates_p);
-    llama_sample_min_p_impl(smpl, &candidates_p, p, 1);
+    llama_sampling_min_p(smpl, &candidates_p, p, 1);
    DUMP(&candidates_p);
-    llama_sample_softmax_impl(smpl, &candidates_p);
+    llama_sampling_softmax(smpl, &candidates_p);

    GGML_ASSERT(candidates_p.size == expected_probs.size());
    for (size_t i = 0; i < candidates_p.size; i++) {
@ -111,7 +111,7 @@ static void test_min_p(const std::vector<float> & probs, const std::vector<float

 static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
    const size_t n_vocab = probs.size();
-    llama_sampling smpl(1234, n_vocab);
+    llama_sampling * smpl = llama_sampling_init(n_vocab);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
@ -122,7 +122,7 @@ static void test_typical(const std::vector<float> & probs, const std::vector<flo

    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
    DUMP(&candidates_p);
-    llama_sample_typical_impl(smpl, &candidates_p, p, 1);
+    llama_sampling_typical(smpl, &candidates_p, p, 1);
    DUMP(&candidates_p);

    GGML_ASSERT(candidates_p.size == expected_probs.size());
@ -138,7 +138,7 @@ static void test_repetition_penalties(
    GGML_ASSERT(probs.size() == expected_probs.size());

    const size_t n_vocab = probs.size();
-    llama_sampling smpl(1234, n_vocab);
+    llama_sampling * smpl = llama_sampling_init(n_vocab);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
@ -148,10 +148,10 @@ static void test_repetition_penalties(
    }

    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    llama_sample_softmax_impl(smpl, &candidates_p);
+    llama_sampling_softmax(smpl, &candidates_p);
    DUMP(&candidates_p);
-    llama_sample_repetition_penalties_impl(smpl, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
-    llama_sample_softmax_impl(smpl, &candidates_p);
+    llama_sampling_repetition_penalties(smpl, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
+    llama_sampling_softmax(smpl, &candidates_p);
    DUMP(&candidates_p);

    GGML_ASSERT(candidates_p.size == expected_probs.size());
@ -162,7 +162,7 @@ static void test_repetition_penalties(

 static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p
 ) {
-    llama_sampling smpl(1234, n_vocab);
+    llama_sampling * smpl = llama_sampling_init(n_vocab);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
@ -178,16 +178,16 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler

    for (auto s : samplers_sequence) {
        switch (s){
-            case 'k': llama_sample_top_k_impl(smpl, &candidates_p, top_k, 1); break;
+            case 'k': llama_sampling_top_k(smpl, &candidates_p, top_k, 1); break;
            case 'f': GGML_ASSERT(false && "tail_free test not implemented");   break;
            case 'y': GGML_ASSERT(false && "typical test not implemented");     break;
-            case 'p': llama_sample_top_p_impl(smpl, &candidates_p, top_p, 1); break;
-            case 'm': llama_sample_min_p_impl(smpl, &candidates_p, min_p, 1); break;
+            case 'p': llama_sampling_top_p(smpl, &candidates_p, top_p, 1); break;
+            case 'm': llama_sampling_min_p(smpl, &candidates_p, min_p, 1); break;
            case 't': GGML_ASSERT(false && "temperature test not implemented"); break;
            default : GGML_ASSERT(false && "Unknown sampler");                  break;
        }

-        llama_sample_softmax_impl(smpl, &candidates_p); // make sure tokens are sorted for tests
+        llama_sampling_softmax(smpl, &candidates_p); // make sure tokens are sorted for tests

        const int size = candidates_p.size;