llama : remove llama_constraint

ggml-ci
2024-09-05 16:49:14 +03:00 · 2024-09-05 16:49:14 +03:00 · 0b6dfcebb2
commit 0b6dfcebb2
parent a2d8b27a4b
19 changed files with 1020 additions and 1055 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -841,15 +841,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.defrag_thold = std::stof(argv[i]);
        return true;
    }
-    if (arg == "--samplers" || arg == "--constraints") {
+    if (arg == "--samplers") {
        CHECK_ARG
-        const auto constraint_names = string_split(argv[i], ';');
-        sparams.constraints = gpt_constraint_types_from_names(constraint_names, true);
+        const auto sampler_names = string_split(argv[i], ';');
+        sparams.samplers = gpt_sampler_types_from_names(sampler_names, true);
        return true;
    }
    if (arg == "--sampling-seq") {
        CHECK_ARG
-        sparams.constraints = gpt_constraint_types_from_chars(argv[i]);
+        sparams.samplers = gpt_sampler_types_from_chars(argv[i]);
        return true;
    }
    if (arg == "--top-p") {
@ -1706,13 +1706,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    const auto & sparams = params.sparams;

-    std::string constraint_type_chars;
-    std::string constraint_type_names;
-    for (const auto & constraint : sparams.constraints) {
-        constraint_type_chars += gpt_constraint_type_to_chr(constraint);
-        constraint_type_names += gpt_constraint_type_to_str(constraint) + ";";
+    std::string sampler_type_chars;
+    std::string sampler_type_names;
+    for (const auto & sampler : sparams.samplers) {
+        sampler_type_chars += gpt_sampler_type_to_chr(sampler);
+        sampler_type_names += gpt_sampler_type_to_str(sampler) + ";";
    }
-    constraint_type_names.pop_back();
+    sampler_type_names.pop_back();

    struct option_info {
        LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5)
@ -1826,9 +1826,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "sampling" });
    options.push_back({ "*",           "-s,    --seed SEED",            "RNG seed (default: %d, use random seed for < 0)", sparams.seed });
    options.push_back({ "*",           "       --samplers SAMPLERS",    "samplers that will be used for generation in the order, separated by \';\'\n"
-                                                                        "(default: %s)", constraint_type_names.c_str() });
+                                                                        "(default: %s)", sampler_type_names.c_str() });
    options.push_back({ "*",           "       --sampling-seq SEQUENCE",
-                                                                        "simplified sequence for samplers that will be used (default: %s)", constraint_type_chars.c_str() });
+                                                                        "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() });
    options.push_back({ "*",           "       --ignore-eos",           "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" });
    options.push_back({ "*",           "       --penalize-nl",          "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" });
    options.push_back({ "*",           "       --temp T",               "temperature (default: %.1f)", (double)sparams.temp });
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -2,14 +2,127 @@

 #include "common.h"

+// fixed-capacity ring buffer with an interface similar to std::deque
+//
+// - push_back() on a full buffer overwrites (drops) the oldest element
+// - accessors throw std::runtime_error when the buffer is empty or the index is out of range
+//
+// TODO: deduplicate with llama-impl.h
+template<typename T>
+struct ring_buffer {
+    // allocates storage for cap elements up front; the buffer never grows afterwards
+    ring_buffer(size_t cap) : capacity(cap), data(cap) {}
+
+    // oldest stored element
+    T & front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    const T & front() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    // most recently pushed element
+    T & back() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[last_index()];
+    }
+
+    const T & back() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[last_index()];
+    }
+
+    // append value; when the buffer is full the oldest element is overwritten
+    void push_back(const T & value) {
+        if (capacity == 0) {
+            // the index arithmetic below is modulo capacity - avoid dividing by zero
+            throw std::runtime_error("ring buffer: capacity is zero");
+        }
+        if (sz == capacity) {
+            // advance the start when buffer is full
+            first = (first + 1) % capacity;
+        } else {
+            sz++;
+        }
+        data[pos] = value;
+        pos = (pos + 1) % capacity;
+    }
+
+    // remove the oldest element and return a copy of it
+    T pop_front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        T value = data[first];
+        first = (first + 1) % capacity;
+        sz--;
+        return value;
+    }
+
+    // reverse access: rat(0) is the most recently pushed element, rat(size() - 1) the oldest
+    const T & rat(size_t i) const {
+        if (i >= sz) {
+            throw std::runtime_error("ring buffer: index out of bounds");
+        }
+        return data[(first + sz - i - 1) % capacity];
+    }
+
+    // copy of the stored elements, ordered from oldest to newest
+    std::vector<T> to_vector() const {
+        std::vector<T> result;
+        result.reserve(sz);
+        for (size_t i = 0; i < sz; i++) {
+            result.push_back(data[(first + i) % capacity]);
+        }
+        return result;
+    }
+
+    void clear() {
+        // here only reset the status of the buffer
+        sz = 0;
+        first = 0;
+        pos = 0;
+    }
+
+    bool empty() const {
+        return sz == 0;
+    }
+
+    size_t size() const {
+        return sz;
+    }
+
+    // index of the newest element; pos is the *next* write slot, so step back by one (with wrap-around)
+    // only valid when sz > 0, which implies capacity > 0
+    size_t last_index() const {
+        return (pos + capacity - 1) % capacity;
+    }
+
+    size_t capacity = 0;
+    size_t sz = 0;    // number of stored elements
+    size_t first = 0; // index of the oldest element
+    size_t pos = 0;   // index of the next write
+    std::vector<T> data;
+};
+
 struct gpt_sampler {
    gpt_sampler_params params;

-    struct llama_constraint * bias;
-    struct llama_constraint * pnlt;
-    struct llama_constraint * grmr;
+    struct llama_sampler * bias;
+    struct llama_sampler * pnlt;
+    struct llama_sampler * grmr;

-    struct llama_sampler * smpl;
+    struct llama_sampler * chain;
+
+    ring_buffer<llama_token> prev;
+
+    std::vector<llama_token_data> cur;
+
+    llama_token_data_array cur_p;
+
+    void set_logits(struct llama_context * ctx, int idx) {
+        const auto * logits = llama_get_logits_ith(ctx, idx);
+
+        const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+        cur.resize(n_vocab);
+
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+        }
+
+        cur_p = { cur.data(), cur.size(), LLAMA_TOKEN_NULL, false };
+    }
 };

 std::string gpt_sampler_params::print() const {
@ -29,28 +142,26 @@ std::string gpt_sampler_params::print() const {
 std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
    std::string result = "\tlogits";

-    for (int i = 0; i < llama_sampler_n_constraints(gsmpl->smpl); i++) {
-        const auto * cnstr = llama_sampler_constraint_get(gsmpl->smpl, i);
-        result += std::string(" -> ") + llama_constraint_name(cnstr) + " ";
+    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
+        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+        result += std::string(" -> ") + llama_sampler_name(smpl) + " ";
    }

    return result;
 }

 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
-    llama_sampler_params lparams = llama_sampler_default_params();
+    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

-    lparams.seed   = params.seed;
-    lparams.n_prev = params.n_prev;
-    lparams.type   = params.temp <= 0.0f ? LLAMA_SAMPLER_TYPE_GREEDY : LLAMA_SAMPLER_TYPE_DIST;
+    lparams.no_timing = false;

    auto * result = new gpt_sampler {
        /* .params = */ params,
-        /* .bias   = */ llama_constraint_init_logit_bias(
+        /* .bias   = */ llama_sampler_init_logit_bias(
            model,
            params.logit_bias.size(),
            params.logit_bias.data()),
-        /* .pnlt   = */ llama_constraint_init_penalties(
+        /* .pnlt   = */ llama_sampler_init_penalties(
            model,
            params.penalty_last_n,
            params.penalty_repeat,
@ -58,45 +169,53 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
            params.penalty_present,
            params.penalize_nl,
            params.ignore_eos),
-        /* .grmr   = */ llama_constraint_init_grammar(model, params.grammar.c_str(), "root"),
-        /* .smpl   = */ llama_sampler_init(model, lparams)
+        /* .grmr   = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
+        /* .chain  = */ llama_sampler_chain_init(lparams),
+        /* .prev   = */ ring_buffer<llama_token>(params.n_prev),
+        /* .cur    = */ {},
+        /* .cur_p  = */ {},
    };

    if (params.temp > 0.0f) {
        if (params.mirostat == 0) {
-            for (const auto & cnstr : params.constraints) {
+            for (const auto & cnstr : params.samplers) {
                switch (cnstr) {
-                    case GPT_CONSTRAINT_TYPE_TOP_K:
-                        llama_sampler_constraint_add(result->smpl, llama_constraint_init_top_k    (params.top_k));
+                    case GPT_SAMPLER_TYPE_TOP_K:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
                        break;
-                    case GPT_CONSTRAINT_TYPE_TOP_P:
-                        llama_sampler_constraint_add(result->smpl, llama_constraint_init_top_p    (params.top_p, params.min_keep));
+                    case GPT_SAMPLER_TYPE_TOP_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
                        break;
-                    case GPT_CONSTRAINT_TYPE_MIN_P:
-                        llama_sampler_constraint_add(result->smpl, llama_constraint_init_min_p    (params.min_p, params.min_keep));
+                    case GPT_SAMPLER_TYPE_MIN_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
                        break;
-                    case GPT_CONSTRAINT_TYPE_TFS_Z:
-                        llama_sampler_constraint_add(result->smpl, llama_constraint_init_tail_free(params.tfs_z, params.min_keep));
+                    case GPT_SAMPLER_TYPE_TFS_Z:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
                        break;
-                    case GPT_CONSTRAINT_TYPE_TYPICAL_P:
-                        llama_sampler_constraint_add(result->smpl, llama_constraint_init_typical  (params.typ_p, params.min_keep));
+                    case GPT_SAMPLER_TYPE_TYPICAL_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
                        break;
-                    case GPT_CONSTRAINT_TYPE_TEMPERATURE:
-                        llama_sampler_constraint_add(result->smpl, llama_constraint_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    case GPT_SAMPLER_TYPE_TEMPERATURE:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                        break;
                    default:
-                        GGML_ASSERT(false && "unknown constraint type");
+                        GGML_ASSERT(false && "unknown sampler type");
                }
            }
        } else if (params.mirostat == 1) {
-            llama_sampler_constraint_add(result->smpl, llama_constraint_init_temp(params.temp));
-            llama_sampler_constraint_add(result->smpl, llama_constraint_init_mirostat(model, params.mirostat_tau, params.mirostat_eta));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(model, params.mirostat_tau, params.mirostat_eta));
        } else if (params.mirostat == 2) {
-            llama_sampler_constraint_add(result->smpl, llama_constraint_init_temp(params.temp));
-            llama_sampler_constraint_add(result->smpl, llama_constraint_init_mirostat_v2(params.mirostat_tau, params.mirostat_eta));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.mirostat_tau, params.mirostat_eta));
        } else {
            GGML_ASSERT(false && "unknown mirostat version");
        }
+        llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
+        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+    } else {
+        llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
+        llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
    }

    return result;
@ -104,11 +223,11 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st

 void gpt_sampler_free(struct gpt_sampler * gsmpl) {
    if (gsmpl) {
-        llama_constraint_free(gsmpl->bias);
-        llama_constraint_free(gsmpl->pnlt);
-        llama_constraint_free(gsmpl->grmr);
+        llama_sampler_free(gsmpl->bias);
+        llama_sampler_free(gsmpl->pnlt);
+        llama_sampler_free(gsmpl->grmr);

-        llama_sampler_free(gsmpl->smpl);
+        llama_sampler_free(gsmpl->chain);

        delete gsmpl;
    }
@ -117,69 +236,66 @@ void gpt_sampler_free(struct gpt_sampler * gsmpl) {
 struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
-    return new gpt_sampler {
+    // clones the bias/penalty/grammar samplers and the chain via llama_sampler_clone() and
+    // copies the token history and candidate buffers; release the result with gpt_sampler_free()
+    auto * result = new gpt_sampler {
        /* .params = */ gsmpl->params,
-        /* .bias   = */ llama_constraint_clone(gsmpl->bias),
-        /* .pnlt   = */ llama_constraint_clone(gsmpl->pnlt),
-        /* .grmr   = */ llama_constraint_clone(gsmpl->grmr),
-        /* .smpl   = */ llama_sampler_clone   (gsmpl->smpl)
+        /* .bias   = */ llama_sampler_clone(gsmpl->bias),
+        /* .pnlt   = */ llama_sampler_clone(gsmpl->pnlt),
+        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
+        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
+        /* .prev   = */ gsmpl->prev,
+        /* .cur    = */ gsmpl->cur,
+        /* .cur_p  = */ gsmpl->cur_p,
    };
+
+    // cur_p is a view over cur (see set_logits): the member-wise copy above still points at the
+    // source sampler's buffer, so re-point it at the clone's own copy of the candidates
+    if (gsmpl->cur_p.data == gsmpl->cur.data()) {
+        result->cur_p.data = result->cur.data();
+    }
+
+    return result;
 }

 void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool apply_grammar) {
    if (apply_grammar) {
-        llama_constraint_accept(gsmpl->grmr, token);
+        llama_sampler_accept(gsmpl->grmr, token);
    }

-    llama_sampler_accept(gsmpl->smpl, token);
+    llama_sampler_accept(gsmpl->chain, token);
+
+    gsmpl->prev.push_back(token);
 }

 void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
-    llama_constraint_reset(gsmpl->grmr);
+    llama_sampler_reset(gsmpl->grmr);

-    llama_sampler_reset(gsmpl->smpl);
-}
-
-void gpt_sampler_set_logits(struct gpt_sampler * gsmpl, const float * logits) {
-    llama_sampler_set_logits(gsmpl->smpl, logits);
+    llama_sampler_reset(gsmpl->chain);
 }

 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
-    return llama_sampler_get_candidates(gsmpl->smpl);
+    return &gsmpl->cur_p;
 }

 llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
-    return llama_sampler_last(gsmpl->smpl);
+    return gsmpl->prev.rat(0);
 }

-void gpt_print_timings(struct llama_context * ctx, struct gpt_sampler * gsmpl) {
-    llama_print_timings(ctx, gsmpl ? gsmpl->smpl : nullptr);
-}
-
-llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_token_data_array * cur_p) {
-    return llama_sampler_sample(gsmpl->smpl, cur_p);
+void gpt_print_timings(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
+    llama_print_timings(ctx, gsmpl ? gsmpl->chain : nullptr);
 }

 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
-    auto & bias = gsmpl->bias;
-    auto & pnlt = gsmpl->pnlt;
-    auto & grmr = gsmpl->grmr;
-    auto & smpl = gsmpl->smpl;
+    auto & bias  = gsmpl->bias;
+    auto & pnlt  = gsmpl->pnlt;
+    auto & grmr  = gsmpl->grmr;
+    auto & chain = gsmpl->chain;

-    const auto * logits = llama_get_logits_ith(ctx, idx);
+    gsmpl->set_logits(ctx, idx);

-    llama_sampler_set_logits(smpl, logits);
+    auto & cur_p = gsmpl->cur_p;

-    auto * cur_p = llama_sampler_get_candidates(smpl);
-
-    llama_constraint_apply(bias, cur_p);
-    llama_constraint_apply(pnlt, cur_p);
+    llama_sampler_apply(bias, &cur_p);
+    llama_sampler_apply(pnlt, &cur_p);

    if (grammar_first) {
-        llama_constraint_apply(grmr, cur_p);
+        llama_sampler_apply(grmr, &cur_p);
    }

-    llama_sampler_apply(smpl, cur_p);
+    llama_sampler_apply(chain, &cur_p);

-    const llama_token id = llama_sampler_sample(smpl, cur_p);
+    const llama_token id = cur_p.data[cur_p.selected].id;
+
+    GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - check your sampling configuration");

    if (grammar_first) {
        return id;
@ -188,9 +304,9 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
    // check if it the sampled token fits the grammar
    {
        llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
+        llama_token_data_array single_token_data_array = { &single_token_data, 1, LLAMA_TOKEN_NULL, false };

-        llama_constraint_apply(grmr, &single_token_data_array);
+        llama_sampler_apply(grmr, &single_token_data_array);

        // check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
@ -199,28 +315,22 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
        }
    }

-    // if the token is not valid, sample again, first apply the grammar constraints and then sample
-    llama_sampler_set_logits(smpl, logits);
+    // if the token is not valid, sample again, first apply the grammar samplers and then sample
+    gsmpl->set_logits(ctx, idx);

-    llama_constraint_apply(bias, cur_p);
-    llama_constraint_apply(pnlt, cur_p);
-    llama_constraint_apply(grmr, cur_p);
+    llama_sampler_apply(bias, &cur_p);
+    llama_sampler_apply(pnlt, &cur_p);
+    llama_sampler_apply(grmr, &cur_p);

-    llama_sampler_apply(smpl, cur_p);
+    llama_sampler_apply(chain, &cur_p);

-    return llama_sampler_sample(smpl, cur_p);
-}
+    GGML_ASSERT(cur_p.data[cur_p.selected].id != LLAMA_TOKEN_NULL && "null token in the sampling history - check your sampling configuration");

-void gpt_sampler_apply_grammar(struct gpt_sampler * gsmpl, llama_token_data_array * cur_p) {
-    GGML_ASSERT(cur_p != nullptr);
-
-    llama_constraint_apply(gsmpl->grmr, cur_p);
+    return cur_p.data[cur_p.selected].id;
 }

 std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
-    auto & smpl = gsmpl->smpl;
-
-    n = std::min(n, llama_sampler_n_prev(smpl));
+    n = std::min(n, (int) gsmpl->prev.size());

    if (n <= 0) {
        return "";
@ -230,7 +340,7 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
    result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab

    for (int i = n - 1; i >= 0; i--) {
-        const llama_token id = llama_sampler_prev(smpl, i);
+        const llama_token id = gsmpl->prev.rat(i);

        GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");

@ -240,95 +350,95 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
    return result;
 }

-char gpt_constraint_type_to_chr(enum gpt_constraint_type cnstr) {
+char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
    switch (cnstr) {
-        case GPT_CONSTRAINT_TYPE_TOP_K:       return 'k';
-        case GPT_CONSTRAINT_TYPE_TFS_Z:       return 'f';
-        case GPT_CONSTRAINT_TYPE_TYPICAL_P:   return 'y';
-        case GPT_CONSTRAINT_TYPE_TOP_P:       return 'p';
-        case GPT_CONSTRAINT_TYPE_MIN_P:       return 'm';
-        case GPT_CONSTRAINT_TYPE_TEMPERATURE: return 't';
+        case GPT_SAMPLER_TYPE_TOP_K:       return 'k';
+        case GPT_SAMPLER_TYPE_TFS_Z:       return 'f';
+        case GPT_SAMPLER_TYPE_TYPICAL_P:   return 'y';
+        case GPT_SAMPLER_TYPE_TOP_P:       return 'p';
+        case GPT_SAMPLER_TYPE_MIN_P:       return 'm';
+        case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
        default : return '?';
    }
 }

-std::string gpt_constraint_type_to_str(enum gpt_constraint_type cnstr) {
+std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
    switch (cnstr) {
-        case GPT_CONSTRAINT_TYPE_TOP_K:       return "top_k";
-        case GPT_CONSTRAINT_TYPE_TFS_Z:       return "tfs_z";
-        case GPT_CONSTRAINT_TYPE_TYPICAL_P:   return "typ_p";
-        case GPT_CONSTRAINT_TYPE_TOP_P:       return "top_p";
-        case GPT_CONSTRAINT_TYPE_MIN_P:       return "min_p";
-        case GPT_CONSTRAINT_TYPE_TEMPERATURE: return "temperature";
+        case GPT_SAMPLER_TYPE_TOP_K:       return "top_k";
+        case GPT_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
+        case GPT_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
+        case GPT_SAMPLER_TYPE_TOP_P:       return "top_p";
+        case GPT_SAMPLER_TYPE_MIN_P:       return "min_p";
+        case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
        default : return "";
    }
 }

-std::vector<gpt_constraint_type> gpt_constraint_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
-    std::unordered_map<std::string, gpt_constraint_type> constraint_canonical_name_map {
-        { "top_k",       GPT_CONSTRAINT_TYPE_TOP_K },
-        { "top_p",       GPT_CONSTRAINT_TYPE_TOP_P },
-        { "typ_p",       GPT_CONSTRAINT_TYPE_TYPICAL_P },
-        { "min_p",       GPT_CONSTRAINT_TYPE_MIN_P },
-        { "tfs_z",       GPT_CONSTRAINT_TYPE_TFS_Z },
-        { "temperature", GPT_CONSTRAINT_TYPE_TEMPERATURE },
+std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+    std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
+        { "top_k",       GPT_SAMPLER_TYPE_TOP_K },
+        { "top_p",       GPT_SAMPLER_TYPE_TOP_P },
+        { "typ_p",       GPT_SAMPLER_TYPE_TYPICAL_P },
+        { "min_p",       GPT_SAMPLER_TYPE_MIN_P },
+        { "tfs_z",       GPT_SAMPLER_TYPE_TFS_Z },
+        { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
    };

-    // since constraints names are written multiple ways
+    // since samplers names are written multiple ways
    // make it ready for both system names and input names
-    std::unordered_map<std::string, gpt_constraint_type> constraint_alt_name_map {
-        { "top-k",       GPT_CONSTRAINT_TYPE_TOP_K },
-        { "top-p",       GPT_CONSTRAINT_TYPE_TOP_P },
-        { "nucleus",     GPT_CONSTRAINT_TYPE_TOP_P },
-        { "typical-p",   GPT_CONSTRAINT_TYPE_TYPICAL_P },
-        { "typical",     GPT_CONSTRAINT_TYPE_TYPICAL_P },
-        { "typ-p",       GPT_CONSTRAINT_TYPE_TYPICAL_P },
-        { "typ",         GPT_CONSTRAINT_TYPE_TYPICAL_P },
-        { "min-p",       GPT_CONSTRAINT_TYPE_MIN_P },
-        { "tfs-z",       GPT_CONSTRAINT_TYPE_TFS_Z },
-        { "tfs",         GPT_CONSTRAINT_TYPE_TFS_Z },
-        { "temp",        GPT_CONSTRAINT_TYPE_TEMPERATURE },
+    std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
+        { "top-k",       GPT_SAMPLER_TYPE_TOP_K },
+        { "top-p",       GPT_SAMPLER_TYPE_TOP_P },
+        { "nucleus",     GPT_SAMPLER_TYPE_TOP_P },
+        { "typical-p",   GPT_SAMPLER_TYPE_TYPICAL_P },
+        { "typical",     GPT_SAMPLER_TYPE_TYPICAL_P },
+        { "typ-p",       GPT_SAMPLER_TYPE_TYPICAL_P },
+        { "typ",         GPT_SAMPLER_TYPE_TYPICAL_P },
+        { "min-p",       GPT_SAMPLER_TYPE_MIN_P },
+        { "tfs-z",       GPT_SAMPLER_TYPE_TFS_Z },
+        { "tfs",         GPT_SAMPLER_TYPE_TFS_Z },
+        { "temp",        GPT_SAMPLER_TYPE_TEMPERATURE },
    };

-    std::vector<gpt_constraint_type> constraints;
-    constraints.reserve(names.size());
+    std::vector<gpt_sampler_type> samplers;
+    samplers.reserve(names.size());

    for (const auto & name : names) {
-        auto constraint = constraint_canonical_name_map.find(name);
-        if (constraint != constraint_canonical_name_map.end()) {
-            constraints.push_back(constraint->second);
+        auto sampler = sampler_canonical_name_map.find(name);
+        if (sampler != sampler_canonical_name_map.end()) {
+            samplers.push_back(sampler->second);
        } else {
            if (allow_alt_names) {
-                constraint = constraint_alt_name_map.find(name);
-                if (constraint != constraint_alt_name_map.end()) {
-                    constraints.push_back(constraint->second);
+                sampler = sampler_alt_name_map.find(name);
+                if (sampler != sampler_alt_name_map.end()) {
+                    samplers.push_back(sampler->second);
                }
            }
        }
    }

-    return constraints;
+    return samplers;
 }

-std::vector<gpt_constraint_type> gpt_constraint_types_from_chars(const std::string & chars) {
-    std::unordered_map<char, gpt_constraint_type> constraint_name_map {
-        { gpt_constraint_type_to_chr(GPT_CONSTRAINT_TYPE_TOP_K),       GPT_CONSTRAINT_TYPE_TOP_K },
-        { gpt_constraint_type_to_chr(GPT_CONSTRAINT_TYPE_TFS_Z),       GPT_CONSTRAINT_TYPE_TFS_Z },
-        { gpt_constraint_type_to_chr(GPT_CONSTRAINT_TYPE_TYPICAL_P),   GPT_CONSTRAINT_TYPE_TYPICAL_P },
-        { gpt_constraint_type_to_chr(GPT_CONSTRAINT_TYPE_TOP_P),       GPT_CONSTRAINT_TYPE_TOP_P },
-        { gpt_constraint_type_to_chr(GPT_CONSTRAINT_TYPE_MIN_P),       GPT_CONSTRAINT_TYPE_MIN_P },
-        { gpt_constraint_type_to_chr(GPT_CONSTRAINT_TYPE_TEMPERATURE), GPT_CONSTRAINT_TYPE_TEMPERATURE }
+std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
+    std::unordered_map<char, gpt_sampler_type> sampler_name_map {
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K),       GPT_SAMPLER_TYPE_TOP_K },
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z),       GPT_SAMPLER_TYPE_TFS_Z },
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P),   GPT_SAMPLER_TYPE_TYPICAL_P },
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P),       GPT_SAMPLER_TYPE_TOP_P },
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P),       GPT_SAMPLER_TYPE_MIN_P },
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
    };

-    std::vector<gpt_constraint_type> constraints;
-    constraints.reserve(chars.size());
+    std::vector<gpt_sampler_type> samplers;
+    samplers.reserve(chars.size());

    for (const auto & c : chars) {
-        const auto constraint = constraint_name_map.find(c);
-        if (constraint != constraint_name_map.end()) {
-            constraints.push_back(constraint->second);
+        const auto sampler = sampler_name_map.find(c);
+        if (sampler != sampler_name_map.end()) {
+            samplers.push_back(sampler->second);
        }
    }

-    return constraints;
+    return samplers;
 }
--- a/common/sampling.h
+++ b/common/sampling.h
@ -5,14 +5,14 @@
 #include <string>
 #include <vector>

-enum gpt_constraint_type {
-    GPT_CONSTRAINT_TYPE_NONE        = 0,
-    GPT_CONSTRAINT_TYPE_TOP_K       = 1,
-    GPT_CONSTRAINT_TYPE_TOP_P       = 2,
-    GPT_CONSTRAINT_TYPE_MIN_P       = 3,
-    GPT_CONSTRAINT_TYPE_TFS_Z       = 4,
-    GPT_CONSTRAINT_TYPE_TYPICAL_P   = 5,
-    GPT_CONSTRAINT_TYPE_TEMPERATURE = 6,
+enum gpt_sampler_type {
+    GPT_SAMPLER_TYPE_NONE        = 0,
+    GPT_SAMPLER_TYPE_TOP_K       = 1,
+    GPT_SAMPLER_TYPE_TOP_P       = 2,
+    GPT_SAMPLER_TYPE_MIN_P       = 3,
+    GPT_SAMPLER_TYPE_TFS_Z       = 4,
+    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
+    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
 };

 // sampling parameters
@ -21,7 +21,7 @@ struct gpt_sampler_params {

    int32_t n_prev            = 64;    // number of previous tokens to remember
    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep          = 0;     // 0 = disabled, otherwise constraints should return at least min_keep tokens
+    int32_t min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
    int32_t top_k             = 40;    // <= 0 to use vocab size
    float   top_p             = 0.95f; // 1.0 = disabled
    float   min_p             = 0.05f; // 0.0 = disabled
@ -40,13 +40,13 @@ struct gpt_sampler_params {
    bool    penalize_nl       = false; // consider newlines as a repeatable token
    bool    ignore_eos        = false;

-    std::vector<enum gpt_constraint_type> constraints = {
-        GPT_CONSTRAINT_TYPE_TOP_K,
-        GPT_CONSTRAINT_TYPE_TFS_Z,
-        GPT_CONSTRAINT_TYPE_TYPICAL_P,
-        GPT_CONSTRAINT_TYPE_TOP_P,
-        GPT_CONSTRAINT_TYPE_MIN_P,
-        GPT_CONSTRAINT_TYPE_TEMPERATURE
+    std::vector<enum gpt_sampler_type> samplers = {
+        GPT_SAMPLER_TYPE_TOP_K,
+        GPT_SAMPLER_TYPE_TFS_Z,
+        GPT_SAMPLER_TYPE_TYPICAL_P,
+        GPT_SAMPLER_TYPE_TOP_P,
+        GPT_SAMPLER_TYPE_MIN_P,
+        GPT_SAMPLER_TYPE_TEMPERATURE
    };

    std::string grammar; // optional BNF-like grammar to constrain sampling
@ -73,40 +73,36 @@ struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl);
 void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool apply_grammar);
 void gpt_sampler_reset (struct gpt_sampler * gsmpl);

-void gpt_sampler_apply_grammar(struct gpt_sampler * gsmpl, llama_token_data_array * cur_p);
-
-void gpt_sampler_set_logits(struct gpt_sampler * gsmpl, const float * logits);
-
 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);

-llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_token_data_array * cur_p);
+//llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_token_data_array * cur_p);

 llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);

-void gpt_print_timings(struct llama_context * ctx, struct gpt_sampler * gsmpl);
+void gpt_print_timings(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);

 // extended sampling implementation:
 //
 // - set logits
-// - apply the configured sampling constraints
+// - apply the configured sampler chain
 // - check if the token fits the grammar (if any)
 // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
 //
-// if grammar_first is true, the grammar is applied before the constraints (slower)
+// if grammar_first is true, the grammar is applied before the samplers (slower)
 // useful in cases where all the resulting candidates must fit the grammar
 //
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);

 // helpers

-// print the constraints into a string
+// print the sampler chain into a string
 std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);

 // get a string representation of the last accepted tokens
 std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);

-char        gpt_constraint_type_to_chr(enum gpt_constraint_type cnstr);
-std::string gpt_constraint_type_to_str(enum gpt_constraint_type cnstr);
+char        gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
+std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);

-std::vector<enum gpt_constraint_type> gpt_constraint_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<enum gpt_constraint_type> gpt_constraint_types_from_chars(const std::string & chars);
+std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@ -50,9 +50,9 @@ defer {
    llama_free(context)
 }

-var sparams = llama_sampler_params()
+var sparams = llama_sampler_chain_default_params()

-let smpl = llama_sampler_init(model, sparams)
+let smpl = llama_sampler_chain_init(sparams)
 guard smpl != nil else {
    print("Failed to initialize sampling")
    exit(1)
@ -61,9 +61,9 @@ defer {
    llama_sampler_free(smpl)
 }

-llama_sampler_constraint_add(smpl, llama_constraint_init_top_k(40));
-llama_sampler_constraint_add(smpl, llama_constraint_init_top_p(0.9, 1));
-llama_sampler_constraint_add(smpl, llama_constraint_init_temp (0.4));
+llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
+llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
+llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4));

 let n_ctx = llama_n_ctx(context)

@ -137,11 +137,9 @@ while n_cur <= n_len {
            continue
        }

-        var logits = llama_get_logits_ith(context, i_batch[i])
+        let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])

-        llama_sampler_set_logits(smpl, logits)
-
-        let new_token_id = llama_sampler_sample(smpl, nil)
+        llama_sampler_accept(smpl, new_token_id)

        // is it an end of stream? -> mark the stream as finished
        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@ -64,15 +64,15 @@ int main(int argc, char ** argv) {

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

-    auto sparams = llama_sampler_default_params();
+    auto sparams = llama_sampler_chain_default_params();

-    sparams.seed = params.sparams.seed;
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);

-    llama_sampler * smpl = llama_sampler_init(model, sparams);
-
-    llama_sampler_constraint_add(smpl, llama_constraint_init_top_k(params.sparams.top_k));
-    llama_sampler_constraint_add(smpl, llama_constraint_init_top_p(params.sparams.top_p, params.sparams.min_keep));
-    llama_sampler_constraint_add(smpl, llama_constraint_init_temp (params.sparams.temp));
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
+    llama_sampler_chain_add(smpl, llama_sampler_init_softmax());
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));

    if (ctx == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
@ -173,11 +173,9 @@ int main(int argc, char ** argv) {
                continue;
            }

-            const auto * logits = llama_get_logits_ith(ctx, i_batch[i]);
+            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);

-            llama_sampler_set_logits(smpl, logits);
-
-            const llama_token new_token_id = llama_sampler_sample(smpl, nullptr);
+            llama_sampler_accept(smpl, new_token_id);

            // is it an end of generation? -> mark the stream as finished
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@ -120,11 +120,9 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std

        llama_decode(ctx, bat);

-        const auto * logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
+        llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
+        llama_sampler_accept(smpl, token);

-        llama_sampler_set_logits(smpl, logits);
-
-        llama_token token = llama_sampler_sample(smpl, nullptr);
        if (token == eos_token) {
            break;
        }
@ -171,11 +169,11 @@ int main(int argc, char * argv[]) {
    // create generation context
    llama_context * ctx = llama_new_context_with_model(model, cparams);

-    auto sparams = llama_sampler_default_params();
+    auto sparams = llama_sampler_chain_default_params();

-    sparams.type = LLAMA_SAMPLER_TYPE_GREEDY;
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);

-    llama_sampler * smpl = llama_sampler_init(model, sparams);
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    // ### Embedding/Representation ###
    // samples taken from: https://github.com/ContextualAI/gritlm#basic
--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@ -394,12 +394,10 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
    if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
    if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");

-    const auto * logits = llama_get_logits_ith(context, batch->n_tokens - 1);
-
-    llama_sampler_set_logits(sampling, logits);
-
    // sample the most likely token
-    const auto new_token_id = llama_sampler_sample(sampling, nullptr);
+    const auto new_token_id = llama_sampler_sample(sampling, context, batch->n_tokens - 1);
+
+    llama_sampler_accept(sampling, new_token_id);

    const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
    if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@ -43,9 +43,9 @@ actor LlamaContext {
        self.tokens_list = []
        self.batch = llama_batch_init(512, 0, 1)
        self.temporary_invalid_cchars = []
-        var sparams = llama_sampler_default_params()
-        sparams.type = LLAMA_SAMPLER_TYPE_GREEDY
-        self.sampling = llama_sampler_init(context, sparams)
+        let sparams = llama_sampler_chain_default_params()
+        self.sampling = llama_sampler_chain_init(sparams)
+        llama_sampler_chain_add(self.sampling, llama_sampler_init_greedy())
    }

    deinit {
@ -148,12 +148,9 @@ actor LlamaContext {
    func completion_loop() -> String {
        var new_token_id: llama_token = 0

-        let n_vocab = llama_n_vocab(model)
-        let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
+        new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)

-        llama_sampler_set_logits(sampling, logits);
-
-        new_token_id = llama_sampler_sample(sampling, nil)
+        llama_sampler_accept(sampling, new_token_id)

        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
            print("\n")
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@ -83,11 +83,11 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    auto sparams = llama_sampler_default_params();
+    auto sparams = llama_sampler_chain_default_params();

-    sparams.type = LLAMA_SAMPLER_TYPE_GREEDY;
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);

-    llama_sampler * smpl = llama_sampler_init(model, sparams);
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    // tokenize the prompt
    std::vector<llama_token> tokens_list;
@ -220,12 +220,9 @@ int main(int argc, char ** argv) {
    while (n_cur <= n_len) {
        // sample the next token
        {
-            const auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
+            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);

-            llama_sampler_set_logits(smpl, logits);
-
-            // sample the most likely token
-            const llama_token new_token_id = llama_sampler_sample(smpl, nullptr);
+            llama_sampler_accept(smpl, new_token_id);

            // is it an end of generation?
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@ -38,10 +38,12 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    llama_sampler_params sparams = llama_sampler_default_params();
-    sparams.seed = params.sparams.seed;
+    auto sparams = llama_sampler_chain_default_params();

-    llama_sampler * smpl = llama_sampler_init(model, sparams);
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl, llama_sampler_init_softmax());
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));

    // tokenize prompt
    auto tokens = llama_tokenize(ctx, params.prompt, true);
@ -69,13 +71,11 @@ int main(int argc, char ** argv) {
    printf("\nfirst run: %s", params.prompt.c_str());

    for (auto i = 0; i < params.n_predict; i++) {
-        const auto * logits = llama_get_logits(ctx);
-
-        llama_sampler_set_logits(smpl, logits);
-
-        auto next_token     = llama_sampler_sample(smpl, nullptr);
+        auto next_token     = llama_sampler_sample(smpl, ctx, -1);
        auto next_token_str = llama_token_to_piece(ctx, next_token);

+        llama_sampler_accept(smpl, next_token);
+
        printf("%s", next_token_str.c_str());
        result0 += next_token_str;

@ -96,7 +96,10 @@ int main(int argc, char ** argv) {
    // make new context
    auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));

-    llama_sampler * smpl2 = llama_sampler_init(model, sparams);
+    llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl2, llama_sampler_init_softmax());
+    llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));

    printf("\nsecond run: %s", params.prompt.c_str());

@ -126,13 +129,11 @@ int main(int argc, char ** argv) {

    // second run
    for (auto i = 0; i < params.n_predict; i++) {
-        const auto * logits = llama_get_logits(ctx2);
-
-        llama_sampler_set_logits(smpl2, logits);
-
-        auto next_token     = llama_sampler_sample(smpl2, nullptr);
+        auto next_token     = llama_sampler_sample(smpl2, ctx2, -1);
        auto next_token_str = llama_token_to_piece(ctx2, next_token);

+        llama_sampler_accept(smpl2, next_token);
+
        printf("%s", next_token_str.c_str());
        result1 += next_token_str;

@ -157,7 +158,10 @@ int main(int argc, char ** argv) {
    // make new context
    auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));

-    llama_sampler * smpl3 = llama_sampler_init(model, sparams);
+    llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl3, llama_sampler_init_softmax());
+    llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));

    printf("\nsingle seq run: %s", params.prompt.c_str());

@ -215,13 +219,11 @@ int main(int argc, char ** argv) {

    // third run with seq 1 instead of 0
    for (auto i = 0; i < params.n_predict; i++) {
-        const auto * logits = llama_get_logits(ctx3);
-
-        llama_sampler_set_logits(smpl3, logits);
-
-        auto next_token     = llama_sampler_sample(smpl3, nullptr);
+        auto next_token     = llama_sampler_sample(smpl3, ctx3, -1);
        auto next_token_str = llama_token_to_piece(ctx3, next_token);

+        llama_sampler_accept(smpl3, next_token);
+
        printf("%s", next_token_str.c_str());
        result2 += next_token_str;

--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -1027,17 +1027,17 @@ struct server_context {
        }

        {
-            const auto & constraints = data.find("samplers");
-            if (constraints != data.end() && constraints->is_array()) {
-                std::vector<std::string> constraint_names;
-                for (const auto & name : *constraints) {
+            const auto & samplers = data.find("samplers");
+            if (samplers != data.end() && samplers->is_array()) {
+                std::vector<std::string> sampler_names;
+                for (const auto & name : *samplers) {
                    if (name.is_string()) {
-                        constraint_names.emplace_back(name);
+                        sampler_names.emplace_back(name);
                    }
                }
-                slot.sparams.constraints = gpt_constraint_types_from_names(constraint_names, false);
+                slot.sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
            } else {
-                slot.sparams.constraints = default_sparams.constraints;
+                slot.sparams.samplers = default_sparams.samplers;
            }
        }

@ -1253,10 +1253,10 @@ struct server_context {
    }

    json get_formated_generation(const server_slot & slot) const {
-        std::vector<std::string> constraints;
-        constraints.reserve(slot.sparams.constraints.size());
-        for (const auto & constraint : slot.sparams.constraints) {
-            constraints.emplace_back(gpt_constraint_type_to_str(constraint));
+        std::vector<std::string> samplers;
+        samplers.reserve(slot.sparams.samplers.size());
+        for (const auto & sampler : slot.sparams.samplers) {
+            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
        }

        return json {
@ -1290,7 +1290,7 @@ struct server_context {
            {"n_probs",                   slot.sparams.n_probs},
            {"min_keep",                  slot.sparams.min_keep},
            {"grammar",                   slot.sparams.grammar},
-            {"samplers",                  constraints},
+            {"samplers",                  samplers},
        };
    }

--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@ -55,11 +55,11 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    auto sparams = llama_sampler_default_params();
+    auto sparams = llama_sampler_chain_default_params();

-    sparams.type = LLAMA_SAMPLER_TYPE_GREEDY;
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);

-    llama_sampler * smpl = llama_sampler_init(model, sparams);
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    // tokenize the prompt

@ -116,12 +116,9 @@ int main(int argc, char ** argv) {
    while (n_cur <= n_predict) {
        // sample the next token
        {
-            const auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
+            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);

-            llama_sampler_set_logits(smpl, logits);
-
-            // sample the most likely token
-            const llama_token new_token_id = llama_sampler_sample(smpl, nullptr);
+            llama_sampler_accept(smpl, new_token_id);

            // is it an end of generation?
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@ -179,7 +179,7 @@ int main(int argc, char ** argv) {
    // target model sampling context (reuse the llama_context's sampling instance)
    struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams);

-    struct llama_constraint * softmax = llama_constraint_init_softmax();
+    struct llama_sampler * softmax = llama_sampler_init_softmax();

    // draft sequence data
    std::vector<seq_draft> drafts(n_seq_dft);
@ -255,7 +255,7 @@ int main(int argc, char ** argv) {

                        LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
                        float r = u_dist(rng);
-                        llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true };
+                        llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };

                        //GGML_ASSERT(dist_tgt.size <= dist_dft.size);

@ -625,7 +625,7 @@ int main(int argc, char ** argv) {
        gpt_sampler_free(drafts[s].smpl);
    }

-    llama_constraint_free(softmax);
+    llama_sampler_free(softmax);
    llama_batch_free(batch_dft);

    llama_free(ctx_tgt);
--- a/include/llama.h
+++ b/include/llama.h
@ -216,6 +216,7 @@ extern "C" {
        // TODO: consider SoA
        llama_token_data * data;
        size_t size;
+        int64_t selected;
        bool sorted;
    } llama_token_data_array;

@ -369,21 +370,9 @@ extern "C" {
        float bias;
    } llama_logit_bias;

-    enum llama_sampler_type {
-        LLAMA_SAMPLER_TYPE_GREEDY = 0,
-        LLAMA_SAMPLER_TYPE_DIST   = 1,
-    };
-
-    typedef struct llama_sampler_params {
-        uint32_t seed; // the seed used to initialize the rng of the sampler
-
-        int32_t n_prev; // size of ring buffer to keep previous accepted tokens (needed for llama_sampler_prev_ API)
-
-        // TODO: will be used by the llama_decode_with_sampler() API in the future
-        enum llama_sampler_type type;
-
+    typedef struct llama_sampler_chain_params {
        bool no_timing; // whether to measure performance timings
-    } llama_sampler_params;
+    } llama_sampler_chain_params;

    // performance timing information
    struct llama_timings {
@ -412,7 +401,7 @@ extern "C" {
    // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
    LLAMA_API struct llama_model_params          llama_model_default_params(void);
    LLAMA_API struct llama_context_params        llama_context_default_params(void);
-    LLAMA_API struct llama_sampler_params        llama_sampler_default_params(void);
+    LLAMA_API struct llama_sampler_chain_params  llama_sampler_chain_default_params(void);
    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);

    // Initialize the llama + ggml backend
@ -1003,70 +992,73 @@ extern "C" {
    //
    // Sampling API
    //
-    // - Constraints
-    //   The llama_constraint object works on a set of candidate tokens (llama_token_data_array), by modifying their
-    //   logits and probabilities inplace. The interface is abstracted so that users can implement custom constraints.
-    //
-    // - Samplers
-    //   The llama_sampler samples a token based on the candidate token probabilities. Before the actual sampling, the
-    //   sampler can apply a sequence of constraints in order to modify the probabilities of the candidates.
-    //
-    // The llama_sampler object contains the entire sampling information:
-    //
-    //   - RNG state (seed and generator)
-    //   - Custom set of constraints (see llama_sampler_constraint_add)
-    //   - Sampling method (greedy, dist)
-    //   - Previous tokens
-    //
    // In the future, it will be utilized offload the sampling to the backends (e.g. GPU).
    //
    // TODO: in the future, the entire API should be changed to accept llama_vocab, instead of llama_model

-    // constraints
+    typedef void * llama_sampler_context_t;

-    struct llama_constraint;
-
-    typedef void * llama_constraint_context_t;
-
-    // user code can implement the interface below in order to create custom llama_constraint
-    struct llama_constraint_i {
-        const char *              (*name)  (const struct llama_constraint * cnstr);                                 // can be NULL
-        void                      (*accept)(      struct llama_constraint * cnstr, llama_token token);              // can be NULL
-        void                      (*apply) (      struct llama_constraint * cnstr, llama_token_data_array * cur_p); // required
-        void                      (*reset) (      struct llama_constraint * cnstr);                                 // can be NULL
-        struct llama_constraint * (*clone) (const struct llama_constraint * cnstr);                                 // can be NULL if ctx is NULL
-        void                      (*free)  (      struct llama_constraint * cnstr);                                 // can be NULL if ctx is NULL
+    // user code can implement the interface below in order to create custom llama_sampler
+    struct llama_sampler_i {
+        const char *           (*name)  (const struct llama_sampler * smpl);                                 // can be NULL
+        void                   (*accept)(      struct llama_sampler * smpl, llama_token token);              // can be NULL
+        void                   (*apply) (      struct llama_sampler * smpl, llama_token_data_array * cur_p); // required
+        void                   (*reset) (      struct llama_sampler * smpl);                                 // can be NULL
+        struct llama_sampler * (*clone) (const struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL
+        void                   (*free)  (      struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL

        // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
-        //void (*apply_ggml) (struct llama_constraint * cnstr, ...);
+        //void (*apply_ggml) (struct llama_sampler * smpl, ...);
    };

-    struct llama_constraint {
-        struct llama_constraint_i  * iface;
-        llama_constraint_context_t   ctx;
+    struct llama_sampler {
+        struct llama_sampler_i  * iface;
+        llama_sampler_context_t   ctx;
    };

+    LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
+    LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
+    LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
+    LLAMA_API void                   llama_sampler_reset (      struct llama_sampler * smpl);
+    LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl);
+    // important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add)
+    LLAMA_API void                   llama_sampler_free  (      struct llama_sampler * smpl);
+
+    // llama_sampler_chain is a type of llama_sampler that can contain multiple llama_samplers
+
+    LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params);
+
+    // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
+    LLAMA_API void                   llama_sampler_chain_add(      struct llama_sampler * chain, struct llama_sampler * smpl);
+    LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
+    LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);
+
+    // available samplers:
+
+    LLAMA_API struct llama_sampler * llama_sampler_init_greedy     (void);
+    LLAMA_API struct llama_sampler * llama_sampler_init_dist       (uint32_t seed);
+
    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    LLAMA_API struct llama_constraint * llama_constraint_init_softmax    (void);
+    LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void);

    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API struct llama_constraint * llama_constraint_init_top_k      (int32_t k);
+    LLAMA_API struct llama_sampler * llama_sampler_init_top_k      (int32_t k);

    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API struct llama_constraint * llama_constraint_init_top_p      (float   p, int32_t min_keep);
+    LLAMA_API struct llama_sampler * llama_sampler_init_top_p      (float   p, int32_t min_keep);

    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
-    LLAMA_API struct llama_constraint * llama_constraint_init_min_p      (float   p, int32_t min_keep);
+    LLAMA_API struct llama_sampler * llama_sampler_init_min_p      (float   p, int32_t min_keep);

    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API struct llama_constraint * llama_constraint_init_tail_free  (float   z, int32_t min_keep);
+    LLAMA_API struct llama_sampler * llama_sampler_init_tail_free  (float   z, int32_t min_keep);

    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API struct llama_constraint * llama_constraint_init_typical    (float   p, int32_t min_keep);
-    LLAMA_API struct llama_constraint * llama_constraint_init_temp       (float   t);
+    LLAMA_API struct llama_sampler * llama_sampler_init_typical    (float   p, int32_t min_keep);
+    LLAMA_API struct llama_sampler * llama_sampler_init_temp       (float   t);

    /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
-    LLAMA_API struct llama_constraint * llama_constraint_init_temp_ext   (float   t, float   delta, float exponent);
+    LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext   (float   t, float   delta, float exponent);

    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
@ -1074,7 +1066,7 @@ extern "C" {
    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API struct llama_constraint * llama_constraint_init_mirostat(
+    LLAMA_API struct llama_sampler * llama_sampler_init_mirostat(
            const struct llama_model * model,
                               float   tau,
                               float   eta);
@ -1084,16 +1076,16 @@ extern "C" {
    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API struct llama_constraint * llama_constraint_init_mirostat_v2(
+    LLAMA_API struct llama_sampler * llama_sampler_init_mirostat_v2(
                               float   tau,
                               float   eta);

-    LLAMA_API struct llama_constraint * llama_constraint_init_grammar(
+    LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
            const struct llama_model * model,
                          const char * grammar_str,
                          const char * grammar_root);

-    LLAMA_API struct llama_constraint * llama_constraint_init_penalties(
+    LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
            const struct llama_model * model,
                             int32_t   penalty_last_n,  // last n tokens to penalize (0 = disable penalty, -1 = context size)
                               float   penalty_repeat,  // 1.0 = disabled
@ -1102,57 +1094,14 @@ extern "C" {
                                bool   penalize_nl,     // consider newlines as a repeatable token
                                bool   ignore_eos);     // ignore the end-of-sequence token

-    LLAMA_API struct llama_constraint * llama_constraint_init_logit_bias(
+    LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
            const struct llama_model * model,
                             int32_t   n_logit_bias,
              const llama_logit_bias * logit_bias);

-    LLAMA_API struct llama_constraint * llama_constraint_clone(const struct llama_constraint * cnstr);
-
-    // important: do not call if the constraint has been added to a llama_sampler (via llama_sampler_constraint_add)
-    LLAMA_API void llama_constraint_free(struct llama_constraint * cnstr);
-
-    LLAMA_API const char * llama_constraint_name  (const struct llama_constraint * cnstr);
-    LLAMA_API void         llama_constraint_accept(      struct llama_constraint * cnstr, llama_token token);
-    LLAMA_API void         llama_constraint_apply (      struct llama_constraint * cnstr, llama_token_data_array * cur_p);
-    LLAMA_API void         llama_constraint_reset (      struct llama_constraint * cnstr);
-
-    // samplers
-
-    LLAMA_API struct llama_sampler * llama_sampler_init  (const struct llama_model   * model, struct llama_sampler_params params);
-    LLAMA_API void                   llama_sampler_free  (      struct llama_sampler * smpl);
-    LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl);
-    LLAMA_API void                   llama_sampler_reset (      struct llama_sampler * smpl);
-    LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
-    LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
-
-    LLAMA_API void llama_sampler_set_logits(struct llama_sampler * smpl, const float * logits);
-
-    LLAMA_API llama_token_data_array * llama_sampler_get_candidates(struct llama_sampler * smpl);
-
-    // important: takes ownership of the constraint object and will free it in llama_sampler_free
-    LLAMA_API void                      llama_sampler_constraint_add(      struct llama_sampler * smpl, struct llama_constraint * cnstr);
-    LLAMA_API int                       llama_sampler_n_constraints (const struct llama_sampler * smpl);
-    LLAMA_API struct llama_constraint * llama_sampler_constraint_get(const struct llama_sampler * smpl, int32_t i);
-
-
-    LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, llama_token_data_array * cur_p);
-
-    /// @details Get the number of accepted tokens so far (max of n_prev)
-    LLAMA_API int llama_sampler_n_prev(const struct llama_sampler * smpl);
-
-    /// @details Get the ith accepted token
-    /// @param ith [0, n_prev), ith == 0 is the last accepted token.
-    /// returns LLAMA_TOKEN_NULL if ith is out of bounds
-    LLAMA_API llama_token llama_sampler_prev(const struct llama_sampler * smpl, int32_t ith);
-
-    /// @details Get the last accepted token
-    /// Same as llama_sampler_prev(smpl, 0)
-    /// returns LLAMA_TOKEN_NULL if there are no accepted tokens
-    LLAMA_API llama_token llama_sampler_last(const struct llama_sampler * smpl);
+    LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);

    // TODO: extend in the future
-    //LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t i);
    //LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...);

    //
@ -1172,8 +1121,9 @@ extern "C" {
    // Performance information
    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);

-    LLAMA_API void llama_print_timings(struct llama_context * ctx, struct llama_sampler * smpl);
-    LLAMA_API void llama_reset_timings(struct llama_context * ctx, struct llama_sampler * smpl);
+    // note: `chain` must be a sampler created with llama_sampler_chain_init (NULL is also accepted). TODO: how to prevent misuse with other sampler types?
+    LLAMA_API void llama_print_timings(const struct llama_context * ctx, const struct llama_sampler * chain);
+    LLAMA_API void llama_reset_timings(      struct llama_context * ctx,       struct llama_sampler * chain);

    // Print system information
    LLAMA_API const char * llama_print_system_info(void);
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@ -32,6 +32,20 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
 // helpers
 //

+struct time_meas { // scoped timer: the destructor adds the ggml_time_us() delta since construction to t_acc
+    time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {} // disable == true stores -1 so nothing is accumulated
+
+    ~time_meas() {
+        if (t_start_us >= 0) { // -1 marks a disabled measurement
+            t_acc += ggml_time_us() - t_start_us;
+        }
+    }
+
+    const int64_t t_start_us; // value of ggml_time_us() at construction, or -1 when disabled
+
+    int64_t & t_acc; // accumulator owned by the caller; it is written in the destructor, so it must outlive this object
+};
+
 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
--- a/src/llama-sampling.h
+++ b/src/llama-sampling.h
@ -2,89 +2,26 @@

 #include "llama-grammar.h"

-#include <random>
 #include <unordered_map>

 struct llama_vocab;
 struct llama_grammar;

-using llama_token_cnt = std::unordered_map<llama_token, int>;
-
-// TODO: tmp exposed until test-sampling is fixed
-void llama_constraint_penalties_impl(
-       llama_token_data_array * cur_p,
-        const llama_token_cnt & token_count,
-                        float   penalty_repeat,
-                        float   penalty_freq,
-                        float   penalty_present);
-
-// constraints
-
-struct llama_constraint * llama_constraint_init_softmax_impl    ();
-struct llama_constraint * llama_constraint_init_top_k_impl      (int32_t k);
-struct llama_constraint * llama_constraint_init_top_p_impl      (float   p, size_t min_keep);
-struct llama_constraint * llama_constraint_init_min_p_impl      (float   p, size_t min_keep);
-struct llama_constraint * llama_constraint_init_tail_free_impl  (float   z, size_t min_keep);
-struct llama_constraint * llama_constraint_init_typical_impl    (float   p, size_t min_keep);
-struct llama_constraint * llama_constraint_init_temp_impl       (float   t);
-struct llama_constraint * llama_constraint_init_temp_ext_impl   (float   t, float  delta, float exponent);
-
-struct llama_constraint * llama_constraint_init_mirostat_impl(
-        const struct llama_vocab & vocab,
-                           float   tau,
-                           float   eta,
-                         int32_t   m);
-
-struct llama_constraint * llama_constraint_init_mirostat_v2_impl(
-                           float   tau,
-                           float   eta);
-
-struct llama_constraint * llama_constraint_init_grammar_impl(
-        const struct llama_vocab & vocab,
-                      const char * grammar_str,
-                      const char * grammar_root);
-
-struct llama_constraint * llama_constraint_init_penalties_impl(
-        const struct llama_vocab & vocab,
-                         int32_t   penalty_last_n,
-                           float   penalty_repeat,
-                           float   penalty_freq,
-                           float   penalty_present,
-                            bool   penalize_nl,
-                            bool   ignore_eos);
-
-    LLAMA_API struct llama_constraint * llama_constraint_init_logit_bias_impl(
-        const struct llama_vocab & vocab,
-                         int32_t   n_logit_bias,
-          const llama_logit_bias * logit_bias);
-
-struct llama_constraint * llama_constraint_clone_impl(const struct llama_constraint & cnstr);
-
-void llama_constraint_free_impl(struct llama_constraint * cnstr);
-
-const char * llama_constraint_name_impl  (const struct llama_constraint & cnstr);
-void         llama_constraint_accept_impl(      struct llama_constraint & cnstr, llama_token token);
-void         llama_constraint_apply_impl (      struct llama_constraint & cnstr, struct llama_token_data_array * cur_p);
-void         llama_constraint_reset_impl (      struct llama_constraint & cnstr);
-
 // samplers

-struct llama_sampler {
-    llama_sampler_params params;
+const char *           llama_sampler_name_impl  (const struct llama_sampler & smpl);
+void                   llama_sampler_accept_impl(      struct llama_sampler & smpl, llama_token token);
+void                   llama_sampler_apply_impl (      struct llama_sampler & smpl, struct llama_token_data_array * cur_p);
+void                   llama_sampler_reset_impl (      struct llama_sampler & smpl);
+struct llama_sampler * llama_sampler_clone_impl (const struct llama_sampler & smpl);
+void                   llama_sampler_free_impl  (      struct llama_sampler * smpl);

-    const struct llama_vocab * vocab;
+// sampler chain

-    // state
+struct llama_sampler_chain {
+    llama_sampler_chain_params params;

-    std::mt19937 rng;
-
-    ring_buffer<llama_token> prev;
-
-    std::vector<llama_constraint *> constraints;
-
-    std::vector<llama_token_data> cur;
-
-    llama_token_data_array cur_p;
+    std::vector<struct llama_sampler *> samplers;

    // timing

@ -93,18 +30,57 @@ struct llama_sampler {
    mutable int32_t n_sample;
 };

-struct llama_sampler * llama_sampler_init_impl  (const struct llama_vocab   & vocab, struct llama_sampler_params params);
-void                   llama_sampler_free_impl  (      struct llama_sampler * smpl);
-struct llama_sampler * llama_sampler_clone_impl (const struct llama_sampler & smpl);
-void                   llama_sampler_reset_impl (      struct llama_sampler & smpl);
-void                   llama_sampler_accept_impl(      struct llama_sampler & smpl, llama_token token);
-void                   llama_sampler_apply_impl (      struct llama_sampler & smpl, struct llama_token_data_array * cur_p);
+struct llama_sampler * llama_sampler_chain_init_impl(      struct llama_sampler_chain_params params);
+void                   llama_sampler_chain_add_impl (      struct llama_sampler_chain & chain, struct llama_sampler * smpl);
+struct llama_sampler * llama_sampler_chain_get_impl (const struct llama_sampler_chain & chain, int32_t i);
+int                    llama_sampler_chain_n_impl   (const struct llama_sampler_chain & chain);

-void                      llama_sampler_constraint_add_impl(      struct llama_sampler & smpl, struct llama_constraint * cnstr);
-int                       llama_sampler_n_constraints_impl (const struct llama_sampler & smpl);
-struct llama_constraint * llama_sampler_constraint_get_impl(const struct llama_sampler & smpl, int ith);
+using llama_token_cnt = std::unordered_map<llama_token, int>;

-llama_token llama_sampler_sample_impl(struct llama_token_data_array * cur_p, std::mt19937 & rng, enum llama_sampler_type type);
+// TODO: tmp exposed until test-sampling is fixed
+void llama_sampler_penalties_impl(
+       llama_token_data_array * cur_p,
+        const llama_token_cnt & token_count,
+                        float   penalty_repeat,
+                        float   penalty_freq,
+                        float   penalty_present);

-llama_token llama_sampler_prev_impl  (const struct llama_sampler & smpl, int ith);
-int         llama_sampler_n_prev_impl(const struct llama_sampler & smpl);
+struct llama_sampler * llama_sampler_init_greedy_impl   ();
+struct llama_sampler * llama_sampler_init_dist_impl     (uint32_t seed);
+struct llama_sampler * llama_sampler_init_softmax_impl  ();
+struct llama_sampler * llama_sampler_init_top_k_impl    (int32_t k);
+struct llama_sampler * llama_sampler_init_top_p_impl    (float   p, size_t min_keep);
+struct llama_sampler * llama_sampler_init_min_p_impl    (float   p, size_t min_keep);
+struct llama_sampler * llama_sampler_init_tail_free_impl(float   z, size_t min_keep);
+struct llama_sampler * llama_sampler_init_typical_impl  (float   p, size_t min_keep);
+struct llama_sampler * llama_sampler_init_temp_impl     (float   t);
+struct llama_sampler * llama_sampler_init_temp_ext_impl (float   t, float  delta, float exponent);
+
+struct llama_sampler * llama_sampler_init_mirostat_impl(
+        const struct llama_vocab & vocab,
+                           float   tau,
+                           float   eta,
+                         int32_t   m);
+
+struct llama_sampler * llama_sampler_init_mirostat_v2_impl(
+                           float   tau,
+                           float   eta);
+
+struct llama_sampler * llama_sampler_init_grammar_impl(
+        const struct llama_vocab & vocab,
+                      const char * grammar_str,
+                      const char * grammar_root);
+
+struct llama_sampler * llama_sampler_init_penalties_impl(
+        const struct llama_vocab & vocab,
+                         int32_t   penalty_last_n,
+                           float   penalty_repeat,
+                           float   penalty_freq,
+                           float   penalty_present,
+                            bool   penalize_nl,
+                            bool   ignore_eos);
+
+    LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias_impl(
+        const struct llama_vocab & vocab,
+                         int32_t   n_logit_bias,
+          const llama_logit_bias * logit_bias);
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -147,21 +147,6 @@ static void zeros(std::ofstream & file, size_t n) {
    }
 }

-struct time_meas {
-    time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
-
-    ~time_meas() {
-        if (t_start_us >= 0) {
-            t_acc += ggml_time_us() - t_start_us;
-        }
-    }
-
-    const int64_t t_start_us;
-
-    int64_t & t_acc;
-};
-
-
 LLAMA_ATTRIBUTE_FORMAT(1, 2)
 static std::string format(const char * fmt, ...) {
    va_list ap;
@ -17937,11 +17922,8 @@ struct llama_context_params llama_context_default_params() {
    return result;
 }

-struct llama_sampler_params llama_sampler_default_params() {
-    struct llama_sampler_params result = {
-        /*.seed                        =*/ LLAMA_DEFAULT_SEED,
-        /*.n_prev                      =*/ 256,
-        /*.type                        =*/ LLAMA_SAMPLER_TYPE_DIST,
+struct llama_sampler_chain_params llama_sampler_chain_default_params() {
+    struct llama_sampler_chain_params result = {
        /*.no_timing                   =*/ false, // TODO: change to true and set explicitly in examples
    };

@ -20610,98 +20592,24 @@ int32_t llama_chat_apply_template(
 // sampling
 //

-struct llama_constraint * llama_constraint_init_softmax(void) {
-    return llama_constraint_init_softmax_impl();
+const char * llama_sampler_name(const struct llama_sampler * smpl) { // forwards to llama_sampler_name_impl; smpl is dereferenced unchecked, so it must be non-null
+    return llama_sampler_name_impl(*smpl);
 }

-struct llama_constraint * llama_constraint_init_top_k(int32_t k) {
-    return llama_constraint_init_top_k_impl(k);
+void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) { // forwards token to llama_sampler_accept_impl; smpl must be non-null (dereferenced below)
+    llama_sampler_accept_impl(*smpl, token);
 }

-struct llama_constraint * llama_constraint_init_top_p(float p, int32_t min_keep) {
-    return llama_constraint_init_top_p_impl(p, min_keep);
+void llama_sampler_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { // forwards to llama_sampler_apply_impl; smpl must be non-null (dereferenced below)
+    llama_sampler_apply_impl(*smpl, cur_p); // cur_p is handed over unchanged; no null fallback is applied in this wrapper
 }

-struct llama_constraint * llama_constraint_init_min_p(float p, int32_t min_keep) {
-    return llama_constraint_init_min_p_impl(p, min_keep);
+void llama_sampler_reset(struct llama_sampler * smpl) { // forwards to llama_sampler_reset_impl; smpl must be non-null (dereferenced below)
+    llama_sampler_reset_impl(*smpl);
 }

-struct llama_constraint * llama_constraint_init_tail_free(float z, int32_t min_keep) {
-    return llama_constraint_init_tail_free_impl(z, min_keep);
-}
-
-struct llama_constraint * llama_constraint_init_typical(float p, int32_t min_keep) {
-    return llama_constraint_init_typical_impl(p, min_keep);
-}
-
-struct llama_constraint * llama_constraint_init_temp(float temp) {
-    return llama_constraint_init_temp_impl(temp);
-}
-
-struct llama_constraint * llama_constraint_init_temp_ext(float temp, float delta, float exponent) {
-    return llama_constraint_init_temp_ext_impl(temp, delta, exponent);
-}
-
-struct llama_constraint * llama_constraint_init_mirostat(const struct llama_model * model, float tau, float eta) {
-    return llama_constraint_init_mirostat_impl(model->vocab, tau, eta, 100);
-}
-
-struct llama_constraint * llama_constraint_init_mirostat_v2(float tau, float eta) {
-    return llama_constraint_init_mirostat_v2_impl(tau, eta);
-}
-
-struct llama_constraint * llama_constraint_init_grammar(const struct llama_model * model, const char * grammar_str, const char * grammar_root) {
-    return llama_constraint_init_grammar_impl(model->vocab, grammar_str, grammar_root);
-}
-
-struct llama_constraint * llama_constraint_init_penalties(
-        const struct llama_model * model,
-                         int32_t   penalty_last_n,
-                           float   penalty_repeat,
-                           float   penalty_freq,
-                           float   penalty_present,
-                            bool   penalize_nl,
-                            bool   ignore_eos) {
-    return llama_constraint_init_penalties_impl(model->vocab, penalty_last_n, penalty_repeat, penalty_freq, penalty_present, penalize_nl, ignore_eos);
-}
-
-LLAMA_API struct llama_constraint * llama_constraint_init_logit_bias(
-        const struct llama_model * model,
-                         int32_t   n_logit_bias,
-          const llama_logit_bias * logit_bias) {
-    return llama_constraint_init_logit_bias_impl(model->vocab, n_logit_bias, logit_bias);
-}
-
-struct llama_constraint * llama_constraint_clone(const struct llama_constraint * cnstr) {
-    return llama_constraint_clone_impl(*cnstr);
-}
-
-void llama_constraint_free(struct llama_constraint * cnstr) {
-    if (cnstr == nullptr) {
-        return;
-    }
-
-    llama_constraint_free_impl(cnstr);
-}
-
-const char * llama_constraint_name(const struct llama_constraint * cnstr) {
-    return llama_constraint_name_impl(*cnstr);
-}
-
-void llama_constraint_accept(struct llama_constraint * cnstr, llama_token token) {
-    llama_constraint_accept_impl(*cnstr, token);
-}
-
-void llama_constraint_apply(struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
-    llama_constraint_apply_impl(*cnstr, cur_p);
-}
-
-void llama_constraint_reset(struct llama_constraint * cnstr) {
-    llama_constraint_reset_impl(*cnstr);
-}
-
-struct llama_sampler * llama_sampler_init(const struct llama_model * model, struct llama_sampler_params params) {
-    return llama_sampler_init_impl(model->vocab, params);
+struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) { // returns the result of llama_sampler_clone_impl; smpl must be non-null (dereferenced below)
+    return llama_sampler_clone_impl(*smpl);
 }

 void llama_sampler_free(struct llama_sampler * smpl) {
@ -20712,86 +20620,110 @@ void llama_sampler_free(struct llama_sampler * smpl) {
    llama_sampler_free_impl(smpl);
 }

-struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
-    return llama_sampler_clone_impl(*smpl);
+struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) { // passes params by value to llama_sampler_chain_init_impl and returns its result
+    return llama_sampler_chain_init_impl(params);
 }

-void llama_sampler_reset(struct llama_sampler * smpl) {
-    llama_sampler_reset_impl(*smpl);
+void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) { // hands smpl to llama_sampler_chain_add_impl; chain must be non-null
+    llama_sampler_chain_add_impl(*(struct llama_sampler_chain *) chain->ctx, smpl); // unchecked cast: chain->ctx is assumed to point at a llama_sampler_chain
 }

-void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) {
-    llama_sampler_accept_impl(*smpl, token);
+struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) { // returns llama_sampler_chain_get_impl(chain, i); chain must be non-null
+    return llama_sampler_chain_get_impl(*(const struct llama_sampler_chain *) chain->ctx, i); // unchecked cast: chain->ctx is assumed to point at a llama_sampler_chain; i is forwarded unvalidated
 }

-void llama_sampler_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    time_meas tm(smpl->t_sample_us, smpl->params.no_timing);
-
-    if (cur_p == nullptr) {
-        cur_p = &smpl->cur_p;
-    }
-
-    llama_sampler_apply_impl(*smpl, cur_p);
+int llama_sampler_chain_n(const struct llama_sampler * chain) { // returns the result of llama_sampler_chain_n_impl; chain must be non-null
+    return llama_sampler_chain_n_impl(*(const struct llama_sampler_chain *) chain->ctx); // unchecked cast: chain->ctx is assumed to point at a llama_sampler_chain
 }

-void llama_sampler_set_logits(struct llama_sampler * smpl, const float * logits) {
-    const int n_vocab = smpl->vocab->n_vocab;
+struct llama_sampler * llama_sampler_init_greedy(void) { // returns the sampler created by llama_sampler_init_greedy_impl
+    return llama_sampler_init_greedy_impl();
+}

-    smpl->cur.resize(n_vocab);
+struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { // returns the sampler created by llama_sampler_init_dist_impl for the given seed
+    return llama_sampler_init_dist_impl(seed);
+}

+struct llama_sampler * llama_sampler_init_softmax(void) { // returns the sampler created by llama_sampler_init_softmax_impl
+    return llama_sampler_init_softmax_impl();
+}
+
+struct llama_sampler * llama_sampler_init_top_k(int32_t k) { // returns the sampler created by llama_sampler_init_top_k_impl; k is forwarded unchanged
+    return llama_sampler_init_top_k_impl(k);
+}
+
+struct llama_sampler * llama_sampler_init_top_p(float p, int32_t min_keep) { // returns the sampler created by llama_sampler_init_top_p_impl
+    return llama_sampler_init_top_p_impl(p, min_keep); // NOTE(review): min_keep is implicitly converted to the impl's size_t parameter, so a negative value would wrap to a huge count
+}
+
+struct llama_sampler * llama_sampler_init_min_p(float p, int32_t min_keep) { // returns the sampler created by llama_sampler_init_min_p_impl
+    return llama_sampler_init_min_p_impl(p, min_keep); // NOTE(review): min_keep is implicitly converted to the impl's size_t parameter, so a negative value would wrap to a huge count
+}
+
+struct llama_sampler * llama_sampler_init_tail_free(float z, int32_t min_keep) { // returns the sampler created by llama_sampler_init_tail_free_impl
+    return llama_sampler_init_tail_free_impl(z, min_keep); // NOTE(review): min_keep is implicitly converted to the impl's size_t parameter, so a negative value would wrap to a huge count
+}
+
+struct llama_sampler * llama_sampler_init_typical(float p, int32_t min_keep) { // returns the sampler created by llama_sampler_init_typical_impl
+    return llama_sampler_init_typical_impl(p, min_keep); // NOTE(review): min_keep is implicitly converted to the impl's size_t parameter, so a negative value would wrap to a huge count
+}
+
+struct llama_sampler * llama_sampler_init_temp(float temp) { // returns the sampler created by llama_sampler_init_temp_impl; temp is forwarded unchanged
+    return llama_sampler_init_temp_impl(temp);
+}
+
+struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) { // returns the sampler created by llama_sampler_init_temp_ext_impl; all arguments are forwarded in order
+    return llama_sampler_init_temp_ext_impl(temp, delta, exponent);
+}
+
+struct llama_sampler * llama_sampler_init_mirostat(const struct llama_model * model, float tau, float eta) { // returns the sampler created by llama_sampler_init_mirostat_impl; model must be non-null
+    return llama_sampler_init_mirostat_impl(model->vocab, tau, eta, 100); // the impl's fourth parameter `m` is fixed to 100 here and is not configurable through this entry point
+}
+
+struct llama_sampler * llama_sampler_init_mirostat_v2(float tau, float eta) { // returns the sampler created by llama_sampler_init_mirostat_v2_impl; unlike llama_sampler_init_mirostat no model is needed
+    return llama_sampler_init_mirostat_v2_impl(tau, eta);
+}
+
+struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * model, const char * grammar_str, const char * grammar_root) { // returns the sampler created by llama_sampler_init_grammar_impl; model must be non-null
+    return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root); // the model's vocab plus both strings are forwarded unchanged
+}
+
+struct llama_sampler * llama_sampler_init_penalties( // returns the sampler created by llama_sampler_init_penalties_impl; model must be non-null
+        const struct llama_model * model,
+                         int32_t   penalty_last_n,
+                           float   penalty_repeat,
+                           float   penalty_freq,
+                           float   penalty_present,
+                            bool   penalize_nl,
+                            bool   ignore_eos) {
+    return llama_sampler_init_penalties_impl(model->vocab, penalty_last_n, penalty_repeat, penalty_freq, penalty_present, penalize_nl, ignore_eos); // the model is replaced by its vocab; every other argument is forwarded in declaration order
+}
+
+LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( // returns the sampler created by llama_sampler_init_logit_bias_impl; model must be non-null
+        const struct llama_model * model,
+                         int32_t   n_logit_bias,
+          const llama_logit_bias * logit_bias) {
+    return llama_sampler_init_logit_bias_impl(model->vocab, n_logit_bias, logit_bias); // the model is replaced by its vocab; count and pointer are forwarded unchanged
+}
+
+llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) { // builds candidates from the logits of output idx of ctx, applies smpl and returns the id of the selected candidate
+    const auto * logits = llama_get_logits_ith(ctx, idx);
+
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+    // TODO: do not allocate each time
+    std::vector<llama_token_data> cur(n_vocab);
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        smpl->cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
     }
 
-    smpl->cur_p = { smpl->cur.data(), smpl->cur.size(), false };
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; // the -1 presumably seeds cur_p.selected as "nothing selected yet"
+
+    llama_sampler_apply(smpl, &cur_p);
+    GGML_ASSERT(cur_p.selected >= 0 && (size_t) cur_p.selected < cur_p.size); // smpl must end by selecting a candidate; otherwise the read below would index data[-1] out of bounds
+    return cur_p.data[cur_p.selected].id;
 }

-llama_token_data_array * llama_sampler_get_candidates(struct llama_sampler * smpl) {
-    return &smpl->cur_p;
-}
-
-void llama_sampler_constraint_add(struct llama_sampler * smpl, struct llama_constraint * cnstr) {
-    llama_sampler_constraint_add_impl(*smpl, cnstr);
-}
-
-int llama_sampler_n_constraints (const struct llama_sampler * smpl) {
-    return llama_sampler_n_constraints_impl(*smpl);
-}
-
-struct llama_constraint * llama_sampler_constraint_get(const struct llama_sampler * smpl, int32_t i) {
-    return llama_sampler_constraint_get_impl(*smpl, i);
-}
-
-llama_token llama_sampler_sample(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    time_meas tm(smpl->t_sample_us, smpl->params.no_timing);
-
-    if (cur_p == nullptr) {
-        cur_p = &smpl->cur_p;
-    }
-
-    auto res = llama_sampler_sample_impl(cur_p, smpl->rng, smpl->params.type);
-
-    smpl->n_sample++;
-
-    return res;
-}
-
-int llama_sampler_n_prev(const struct llama_sampler * smpl) {
-    return llama_sampler_n_prev_impl(*smpl);
-}
-
-llama_token llama_sampler_prev(const struct llama_sampler * smpl, int32_t ith) {
-    return llama_sampler_prev_impl(*smpl, ith);
-}
-
-llama_token llama_sampler_last(const struct llama_sampler * smpl) {
-    return llama_sampler_prev_impl(*smpl, 0);
-}
-
-//llama_token llama_sampler_sample(struct llama_sampler * smpl, const struct llama_context * ctx, int32_t i) {
-//    GGML_ABORT("not implemented");
-//}
-
 //
 // model split
 //
@ -20820,7 +20752,9 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
    return 0;
 }

-void llama_print_timings(struct llama_context * ctx, struct llama_sampler * smpl) {
+void llama_print_timings(const struct llama_context * ctx, const struct llama_sampler * chain) {
+    auto * smpl = chain ? (const struct llama_sampler_chain *) chain->ctx : nullptr;
+
    const llama_timings timings = {
        /*.t_start_ms   =*/ 1e-3 * ctx->t_start_us,
        /*.t_end_ms     =*/ 1.00 * ggml_time_ms(),
@ -20845,13 +20779,15 @@ void llama_print_timings(struct llama_context * ctx, struct llama_sampler * smpl
    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
 }

-void llama_reset_timings(struct llama_context * ctx, struct llama_sampler * smpl) {
+void llama_reset_timings(struct llama_context * ctx, struct llama_sampler * chain) { // restarts the timing counters of ctx and, when chain is non-null, clears the chain's sampling counters
    ctx->t_start_us  = ggml_time_us();
    ctx->t_eval_us   = ctx->n_eval   = 0;
    ctx->t_p_eval_us = ctx->n_p_eval = 0;

-    if (smpl) {
-        smpl->t_sample_us  = smpl->n_sample  = 0;
+    if (chain) { // a null chain is allowed: only the context counters are reset
+        auto * smpl = (struct llama_sampler_chain *) chain->ctx; // unchecked cast: chain is assumed to be a sampler chain, other sampler types are not detected
+
+        smpl->t_sample_us = smpl->n_sample = 0;
    }
 }

--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@ -21,8 +21,8 @@ static void dump(const llama_token_data_array * cur_p) {

 #define APPLY(__cnstr, __cur_p) do { \
    auto * cnstr = (__cnstr); \
-    llama_constraint_apply(cnstr, (__cur_p)); \
-    llama_constraint_free(cnstr); \
+    llama_sampler_apply(cnstr, (__cur_p)); /* run the sampler produced by the __cnstr expression once on __cur_p */ \
+    llama_sampler_free(cnstr); /* the macro frees the sampler itself, so callers pass a freshly created one */ \
 } while(0)

 static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
@ -35,10 +35,10 @@ static void test_top_k(const std::vector<float> & probs, const std::vector<float
        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
    }

-    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
-    APPLY(llama_constraint_init_softmax(), &cur_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), LLAMA_TOKEN_NULL, false };
+    APPLY(llama_sampler_init_softmax(), &cur_p);
    DUMP(&cur_p);
-    APPLY(llama_constraint_init_top_k(k), &cur_p);
+    APPLY(llama_sampler_init_top_k(k), &cur_p);
    DUMP(&cur_p);

    GGML_ASSERT(cur_p.size == expected_probs.size());
@ -57,10 +57,10 @@ static void test_top_p(const std::vector<float> & probs, const std::vector<float
        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
    }

-    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
-    APPLY(llama_constraint_init_softmax(), &cur_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), LLAMA_TOKEN_NULL, false };
+    APPLY(llama_sampler_init_softmax(), &cur_p);
    DUMP(&cur_p);
-    APPLY(llama_constraint_init_top_p(p, 1), &cur_p);
+    APPLY(llama_sampler_init_top_p(p, 1), &cur_p);
    DUMP(&cur_p);

    GGML_ASSERT(cur_p.size == expected_probs.size());
@ -79,9 +79,9 @@ static void test_tfs(const std::vector<float> & probs, const std::vector<float>
        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
    }

-    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    llama_token_data_array cur_p = { cur.data(), cur.size(), LLAMA_TOKEN_NULL, false };
    DUMP(&cur_p);
-    APPLY(llama_constraint_init_tail_free(z, 1), &cur_p);
+    APPLY(llama_sampler_init_tail_free(z, 1), &cur_p);
    DUMP(&cur_p);

    GGML_ASSERT(cur_p.size == expected_probs.size());
@ -100,11 +100,11 @@ static void test_min_p(const std::vector<float> & probs, const std::vector<float
        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
    }

-    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    llama_token_data_array cur_p = { cur.data(), cur.size(), LLAMA_TOKEN_NULL, false };
    DUMP(&cur_p);
-    APPLY(llama_constraint_init_min_p(p, 1), &cur_p);
+    APPLY(llama_sampler_init_min_p(p, 1), &cur_p);
    DUMP(&cur_p);
-    APPLY(llama_constraint_init_softmax(), &cur_p);
+    APPLY(llama_sampler_init_softmax(), &cur_p);

    GGML_ASSERT(cur_p.size == expected_probs.size());
    for (size_t i = 0; i < cur_p.size; i++) {
@ -122,9 +122,9 @@ static void test_typical(const std::vector<float> & probs, const std::vector<flo
        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
    }

-    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    llama_token_data_array cur_p = { cur.data(), cur.size(), LLAMA_TOKEN_NULL, false };
    DUMP(&cur_p);
-    APPLY(llama_constraint_init_typical(p, 1), &cur_p);
+    APPLY(llama_sampler_init_typical(p, 1), &cur_p);
    DUMP(&cur_p);

    GGML_ASSERT(cur_p.size == expected_probs.size());
@ -153,11 +153,11 @@ static void test_penalties(
        token_count[last_tokens[i]]++;
    }

-    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
-    APPLY(llama_constraint_init_softmax(), &cur_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), LLAMA_TOKEN_NULL, false };
+    APPLY(llama_sampler_init_softmax(), &cur_p);
    DUMP(&cur_p);
-    llama_constraint_penalties_impl(&cur_p, token_count, repeat_penalty, alpha_frequency, alpha_presence); // TODO: avoid
-    APPLY(llama_constraint_init_softmax(), &cur_p);
+    llama_sampler_penalties_impl(&cur_p, token_count, repeat_penalty, alpha_frequency, alpha_presence); // TODO: avoid
+    APPLY(llama_sampler_init_softmax(), &cur_p);
    DUMP(&cur_p);

    GGML_ASSERT(cur_p.size == expected_probs.size());
@ -175,23 +175,23 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
    }

-    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    llama_token_data_array cur_p = { cur.data(), cur.size(), LLAMA_TOKEN_NULL, false };

          llama_token min_token_id = 0;
    const llama_token max_token_id = n_vocab-1;

    for (auto s : samplers_sequence) {
        switch (s){
-            case 'k': APPLY(llama_constraint_init_top_k(top_k), &cur_p); break;
+            case 'k': APPLY(llama_sampler_init_top_k(top_k), &cur_p); break;
            case 'f': GGML_ABORT("tail_free test not implemented");
            case 'y': GGML_ABORT("typical test not implemented");
-            case 'p': APPLY(llama_constraint_init_top_p(top_p, 1), &cur_p); break;
-            case 'm': APPLY(llama_constraint_init_min_p(min_p, 1), &cur_p); break;
+            case 'p': APPLY(llama_sampler_init_top_p(top_p, 1), &cur_p); break;
+            case 'm': APPLY(llama_sampler_init_min_p(min_p, 1), &cur_p); break;
            case 't': GGML_ABORT("temperature test not implemented");
            default : GGML_ABORT("Unknown sampler");
        }

-        APPLY(llama_constraint_init_softmax(), &cur_p); // make sure tokens are sorted for tests
+        APPLY(llama_sampler_init_softmax(), &cur_p); // make sure tokens are sorted for tests

        const int size = cur_p.size;