Added --logit-bias and --no-penalize-nl, removed std::span

2023-04-28 03:12:49 +03:00 · 2023-04-28 03:12:49 +03:00 · 61f822f63b
commit 61f822f63b
parent f01c67fe55
6 changed files with 185 additions and 165 deletions
--- a/2
+++ b/2
@@ -35,7 +35,7 @@ endif

 # keep standard at C11 and C++11
 CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++20 -fPIC
+CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =

 # warnings
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -6,6 +6,8 @@
 #include <string>
 #include <iterator>
 #include <algorithm>
+#include <sstream>
+#include <iostream>

 #if defined (_WIN32)
 #include <fcntl.h>
@@ -138,18 +140,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.repeat_penalty = std::stof(argv[i]);
-        } else if (arg == "--alpha_frequency") {
+        } else if (arg == "--frequency_penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.alpha_frequency = std::stof(argv[i]);
-        } else if (arg == "--alpha_presence") {
+            params.frequency_penalty = std::stof(argv[i]);
+        } else if (arg == "--presence_penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.alpha_presence = std::stof(argv[i]);
+            params.presence_penalty = std::stof(argv[i]);
        } else if (arg == "--mirostat") {
            if (++i >= argc) {
                invalid_param = true;
@@ -227,7 +229,28 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        } else if (arg == "--perplexity") {
            params.perplexity = true;
        } else if (arg == "--ignore-eos") {
-            params.ignore_eos = true;
+            params.logit_bias[llama_token_eos()] = -INFINITY;
+        } else if (arg == "--no-penalize-nl") {
+            params.penalize_nl = false;
+        } else if (arg == "-l" || arg == "--logit-bias") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::stringstream ss(argv[i]);
+            llama_token key;
+            char sign;
+            std::string value_str;
+            try {
+                if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-' || sign == '=' || sign == ':')) {
+                    params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+                } else {
+                    throw std::exception();
+                }
+            } catch (const std::exception &e) {
+                invalid_param = true;
+                break;
+            }
        } else if (arg == "--n_parts") {
            if (++i >= argc) {
                invalid_param = true;
@@ -282,19 +305,23 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "  -f FNAME, --file FNAME\n");
    fprintf(stderr, "                        prompt file to start generation.\n");
    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
-    fprintf(stderr, "  --top_k N             top-k sampling (default: %d, disabled: 0)\n", params.top_k);
-    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f, disabled: 1.0)\n", (double)params.top_p);
-    fprintf(stderr, "  --tfs N               tail free sampling, parameter z (default: %.1f, disabled: 1.0)\n", (double)params.tfs_z);
-    fprintf(stderr, "  --typical N           locally typical sampling, parameter p (default: %.1f, disabled: 1.0)\n", (double)params.typical_p);
-    fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d, disabled: 0)\n", params.repeat_last_n);
-    fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f, disabled: 1.0)\n", (double)params.repeat_penalty);
-    fprintf(stderr, "  --alpha_presence N    repeat alpha presence (default: %.1f, disabled: 0.0)\n", (double)params.alpha_presence);
-    fprintf(stderr, "  --alpha_frequency N   repeat alpha frequency (default: %.1f, disabled: 0.0)\n", (double)params.alpha_frequency);
-    fprintf(stderr, "  --mirostat N          use mirostat sampling (default: %d, disabled: 0, mirostat: 1, mirostat 2.0: 2)\n", params.mirostat);
+    fprintf(stderr, "  --top_k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
+    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+    fprintf(stderr, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
+    fprintf(stderr, "  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
+    fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n);
+    fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    fprintf(stderr, "  --presence_penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+    fprintf(stderr, "  --frequency_penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    fprintf(stderr, "  --mirostat N          use mirostat sampling (default: %d, 0 = disabled, 1 = mirostat, 2 = mirostat 2.0)\n", params.mirostat);
    fprintf(stderr, "  --mirostat_eta N      mirostat learning rate (default: %.1f)\n", (double)params.mirostat_eta);
    fprintf(stderr, "  --mirostat_tau N      mirostat target entropy (default: %.1f)\n", (double)params.mirostat_tau);
+    fprintf(stderr, "  -l TOKEN+BIAS, --logit-bias TOKEN+BIAS");
+    fprintf(stderr, "                        modifies the likelihood of token appearing in the completion,\n");
+    fprintf(stderr, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello'\n");
    fprintf(stderr, "  -c N, --ctx_size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating\n");
+    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2+-inf)\n");
+    fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
    fprintf(stderr, "  --memory_f32          use f32 instead of f16 for memory key+value\n");
    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
    fprintf(stderr, "  --n_parts N           number of model parts (default: -1 = determine from dimensions)\n");
--- a/examples/common.h
+++ b/examples/common.h
@@ -8,6 +8,7 @@
 #include <vector>
 #include <random>
 #include <thread>
+#include <unordered_map>

 //
 // CLI argument parsing
@@ -23,18 +24,19 @@ struct gpt_params {
    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt

    // sampling parameters
-    int32_t top_k = 0;          // <= 0 to use vocab size
-    float   top_p = 1.0f;       // 1.0 = disabled
-    float   tfs_z = 1.0f;       // 1.0 = disabled
-    float   typical_p = 1.0f;   // 1.0 = disabled
-    float   temp = 1.0f;        // 1.0 = disabled
+    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+    int32_t top_k = 0;              // <= 0 to use vocab size
+    float   top_p = 1.0f;           // 1.0 = disabled
+    float   tfs_z = 1.0f;           // 1.0 = disabled
+    float   typical_p = 1.0f;       // 1.0 = disabled
+    float   temp = 1.0f;            // 1.0 = disabled
    float   repeat_penalty  = 1.0f; // 1.0 = disabled
-    int32_t repeat_last_n = -1;  // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   alpha_frequency = 0.0f; // 0.0 = disabled
-    float   alpha_presence = 0.0f;  // 0.0 = disabled
-    int     mirostat = 0;       // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau = 5.0f; // target entropy
-    float   mirostat_eta = 0.1f; // learning rate
+    int32_t repeat_last_n = -1;     // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   frequency_penalty = 0.0f; // 0.0 = disabled
+    float   presence_penalty = 0.0f;  // 0.0 = disabled
+    int     mirostat = 0;           // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   mirostat_tau = 5.0f;    // target entropy
+    float   mirostat_eta = 0.1f;    // learning rate

    std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
    std::string prompt = "";
@@ -54,7 +56,7 @@ struct gpt_params {
    bool interactive_first = false; // wait for user input immediately

    bool instruct          = false; // instruction mode (used for Alpaca models)
-    bool ignore_eos        = false; // do not stop generating after eos
+    bool penalize_nl       = true;  // consider newlines as a repeatable token
    bool perplexity        = false; // compute perplexity over the prompt
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -276,8 +276,8 @@ int main(int argc, char ** argv) {
            fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
        }
    }
-    fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, alpha_presence = %f, alpha_frequency = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_eta = %f, mirostat_tau = %f\n",
-        params.repeat_last_n, params.repeat_penalty, params.alpha_presence, params.alpha_frequency, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
+    fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_eta = %f, mirostat_tau = %f\n",
+            params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
    fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    fprintf(stderr, "\n\n");

@@ -394,11 +394,12 @@ int main(int argc, char ** argv) {
            const float   typical_p      = params.typical_p;
            const int32_t repeat_last_n  = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
            const float   repeat_penalty = params.repeat_penalty;
-            const float   alpha_presence = params.alpha_presence;
-            const float   alpha_frequency = params.alpha_frequency;
-            const int     mirostat   = params.mirostat;
+            const float   alpha_presence = params.presence_penalty;
+            const float   alpha_frequency = params.frequency_penalty;
+            const int     mirostat       = params.mirostat;
            const float   mirostat_tau   = params.mirostat_tau;
            const float   mirostat_eta   = params.mirostat_eta;
+            const bool    penalize_nl   = params.penalize_nl;

            // optionally save the session on first sample (for faster prompt loading next time)
            if (!path_session.empty() && need_to_save_session) {
@@ -412,8 +413,9 @@ int main(int argc, char ** argv) {
                auto logits = llama_get_logits(ctx);
                auto n_vocab = llama_n_vocab(ctx);

-                if (params.ignore_eos) {
-                    logits[llama_token_eos()] = -INFINITY;
+                // Apply params.logit_bias map
+                for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+                    logits[it->first] += it->second;
                }

                std::vector<llama_token_data> candidates;
@@ -425,6 +427,7 @@ int main(int argc, char ** argv) {
                llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

                // Apply penalties
+                float nl_logit = logits[llama_token_nl()];
                auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
                llama_sample_repetition_penalty(ctx, &candidates_p,
                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -432,7 +435,9 @@ int main(int argc, char ** argv) {
                llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
                    last_n_repeat, alpha_frequency, alpha_presence);
-
+                if (!penalize_nl) {
+                    logits[llama_token_nl()] = nl_logit;
+                }

                if (temp <= 0) {
                    // Greedy sampling
--- a/llama.cpp
+++ b/llama.cpp
@@ -28,7 +28,6 @@
 #include <atomic>
 #include <mutex>
 #include <sstream>
-#include <span>

 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -1484,26 +1483,23 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c

    const int64_t t_start_sample_us = ggml_time_us();

-    std::span<llama_token_data> tokens(candidates->data, candidates->size);
-
    // Sort the logits in descending order
    if (!candidates->sorted) {
-        std::sort(tokens.begin(), tokens.end(), [](const llama_token_data & a, const llama_token_data & b) {
+        std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
            return a.logit > b.logit;
        });
        candidates->sorted = true;
    }

-    float max_l = tokens[0].logit;
+    float max_l = candidates->data[0].logit;
    float cum_sum = 0.0f;
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        // printf("llama_sample_softmax: i: %d, logit: %f\n", i, tokens[i].logit);
-        float p = expf(tokens[i].logit - max_l);
-        tokens[i].p = p;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        float p = expf(candidates->data[i].logit - max_l);
+        candidates->data[i].p = p;
        cum_sum += p;
    }
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        tokens[i].p /= cum_sum;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        candidates->data[i].p /= cum_sum;
    }

    if (ctx) {
@@ -1511,48 +1507,46 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
    }
 }

-void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates_p, int k, size_t min_keep) {
+void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
    const int64_t t_start_sample_us = ggml_time_us();

    k = std::max(k, (int) min_keep);
-    k = std::min(k, (int) candidates_p->size);
-
-    std::span<llama_token_data> candidates(candidates_p->data, candidates_p->size);
+    k = std::min(k, (int) candidates->size);

    // Sort scores in descending order
-    if (!candidates_p->sorted) {
+    if (!candidates->sorted) {
        auto comp = [](const llama_token_data & a, const llama_token_data & b) {
            return a.logit > b.logit;
        };
-        if (k == (int) candidates_p->size) {
-            std::sort(candidates.begin(), candidates.end(), comp);
+        if (k == (int) candidates->size) {
+            std::sort(candidates->data, candidates->data + candidates->size, comp);
        } else {
-            std::partial_sort(candidates.begin(), candidates.begin() + k, candidates.end(), comp);
+            std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
        }
-        candidates_p->sorted = true;
+        candidates->sorted = true;
    }
-    candidates_p->size = k;
+    candidates->size = k;

    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    }
 }

-void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates_p, float p, size_t min_keep) {
+void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
    if (p >= 1.0f) {
        return;
    }

    const int64_t t_start_sample_us = ggml_time_us();

-    llama_sample_softmax(ctx, candidates_p);
+    llama_sample_softmax(ctx, candidates);

    // Compute the cumulative probabilities
    float cum_sum = 0.0f;
-    size_t last_idx = candidates_p->size;
+    size_t last_idx = candidates->size;

-    for (size_t i = 0; i < candidates_p->size; ++i) {
-        cum_sum += candidates_p->data[i].p;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        cum_sum += candidates->data[i].p;

        // Check if the running sum is greater than p or if we have kept at least min_keep tokens
        if (cum_sum > p && i >= min_keep) {
@@ -1562,29 +1556,28 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
    }

    // Resize the output vector to keep only the top-p tokens
-    candidates_p->size = last_idx;
+    candidates->size = last_idx;

    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    }
 }

-// https://www.trentonbricken.com/Tail-Free-Sampling/
-void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates_p, float z, size_t min_keep) {
-    if (z >= 1.0f || candidates_p->size <= 2) {
+void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
+    if (z >= 1.0f || candidates->size <= 2) {
        return;
    }

    const int64_t t_start_sample_us = ggml_time_us();

-    llama_sample_softmax(nullptr, candidates_p);
+    llama_sample_softmax(nullptr, candidates);

    // Compute the first and second derivatives
-    std::vector<float> first_derivatives(candidates_p->size - 1);
-    std::vector<float> second_derivatives(candidates_p->size - 2);
+    std::vector<float> first_derivatives(candidates->size - 1);
+    std::vector<float> second_derivatives(candidates->size - 2);

    for (size_t i = 0; i < first_derivatives.size(); ++i) {
-        first_derivatives[i] = candidates_p->data[i].p - candidates_p->data[i + 1].p;
+        first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
    }
    for (size_t i = 0; i < second_derivatives.size(); ++i) {
        second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
@@ -1602,7 +1595,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
    }

    float cum_sum = 0.0f;
-    size_t last_idx = candidates_p->size;
+    size_t last_idx = candidates->size;
    for (size_t i = 0; i < second_derivatives.size(); ++i) {
        cum_sum += second_derivatives[i];

@@ -1614,41 +1607,40 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
    }

    // Resize the output vector to keep only the tokens above the tail location
-    candidates_p->size = last_idx;
+    candidates->size = last_idx;

    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    }
 }

-// https://arxiv.org/pdf/2202.00666.pdf
-// https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
-void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates_p, float typical_p, size_t min_keep) {
-    if (typical_p >= 1.0f) {
+
+void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+    // Reference implementation:
+    // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
+    if (p >= 1.0f) {
        return;
    }

    const int64_t t_start_sample_us = ggml_time_us();

    // Compute the softmax of logits and calculate entropy
-    llama_sample_softmax(nullptr, candidates_p);
-
-    std::span<llama_token_data> candidates(candidates_p->data, candidates_p->size);
+    llama_sample_softmax(nullptr, candidates);

    float entropy = 0.0f;
-    for (const auto & candidate : candidates) {
-        entropy += -candidate.p * logf(candidate.p);
+    for (size_t i = 0; i < candidates->size; ++i) {
+        entropy += -candidates->data[i].p * logf(candidates->data[i].p);
    }

    // Compute the absolute difference between negative log probability and entropy for each candidate
    std::vector<float> shifted_scores;
-    for (const auto & candidate : candidates) {
-        float shifted_score = fabsf(-logf(candidate.p) - entropy);
+    for (size_t i = 0; i < candidates->size; ++i) {
+        float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
        shifted_scores.push_back(shifted_score);
    }

-    // Sort candidates based on the shifted_scores and their corresponding indices
-    std::vector<size_t> indices(candidates.size());
+    // Sort tokens based on the shifted_scores and their corresponding indices
+    std::vector<size_t> indices(candidates->size);
    std::iota(indices.begin(), indices.end(), 0);

    std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
@@ -1661,10 +1653,10 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c

    for (size_t i = 0; i < indices.size(); ++i) {
        size_t idx = indices[i];
-        cum_sum += candidates[idx].p;
+        cum_sum += candidates->data[idx].p;

        // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
-        if (cum_sum > typical_p && i >= min_keep - 1) {
+        if (cum_sum > p && i >= min_keep - 1) {
            last_idx = i + 1;
            break;
        }
@@ -1674,12 +1666,12 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
    std::vector<llama_token_data> new_candidates;
    for (size_t i = 0; i < last_idx; ++i) {
        size_t idx = indices[i];
-        new_candidates.push_back(candidates[idx]);
+        new_candidates.push_back(candidates->data[idx]);
    }

-    // Replace the data in candidates_p with the new_candidates data
-    std::copy(new_candidates.begin(), new_candidates.end(), candidates_p->data);
-    candidates_p->size = new_candidates.size();
+    // Replace the data in candidates with the new_candidates data
+    std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
+    candidates->size = new_candidates.size();

    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
@@ -1689,9 +1681,8 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
 void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
    const int64_t t_start_sample_us = ggml_time_us();

-    std::span<llama_token_data> candidates(candidates_p->data, candidates_p->size);
-    for (auto & candidate : candidates) {
-        candidate.logit /= temp;
+    for (size_t i = 0; i < candidates_p->size; ++i) {
+        candidates_p->data[i].logit /= temp;
    }

    if (ctx) {
@@ -1699,29 +1690,25 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
    }
 }

-void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates_p, llama_token * last_tokens_p, size_t last_tokens_size, float penalty) {
+void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty) {
    if (last_tokens_size == 0 || penalty == 1.0f) {
        return;
    }

    const int64_t t_start_sample_us = ggml_time_us();

-    // CTRL paper: https://arxiv.org/pdf/1909.05858.pdf
-    std::span<llama_token_data> candidates(candidates_p->data, candidates_p->size);
-    std::span<llama_token> last_tokens(last_tokens_p, last_tokens_size);
-
-    for (size_t i = 0; i < candidates.size(); ++i) {
-        auto token_iter = std::find(last_tokens.begin(), last_tokens.end(), candidates[i].id);
-        if (token_iter == last_tokens.end()) {
+    for (size_t i = 0; i < candidates->size; ++i) {
+        auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+        if (token_iter == last_tokens + last_tokens_size) {
            continue;
        }

        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
-        if (candidates[i].logit <= 0) {
-            candidates[i].logit *= penalty;
+        if (candidates->data[i].logit <= 0) {
+            candidates->data[i].logit *= penalty;
        } else {
-            candidates[i].logit /= penalty;
+            candidates->data[i].logit /= penalty;
        }

        // But it does not penalize tokens that logits are near zero, which is a problem.
@@ -1731,76 +1718,60 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
        // candidates[i].logit = std::log(probability);
    }

-    candidates_p->sorted = false;
+    candidates->sorted = false;

    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    }
 }

-void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates_p, llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
+void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
    if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
        return;
    }

    const int64_t t_start_sample_us = ggml_time_us();

-    std::span<llama_token_data> candidates(candidates_p->data, candidates_p->size);
-    std::span<llama_token> last_tokens(last_tokens_p, last_tokens_size);
-
    // Create a frequency map to count occurrences of each token in last_tokens
    std::unordered_map<llama_token, int> token_count;
-    for (const auto & token : last_tokens) {
-        token_count[token]++;
+    for (size_t i = 0; i < last_tokens_size; ++i) {
+        token_count[last_tokens_p[i]]++;
    }

    // Apply frequency and presence penalties to the candidates
-    for (size_t i = 0; i < candidates.size(); ++i) {
-        auto token_iter = token_count.find(candidates[i].id);
+    for (size_t i = 0; i < candidates->size; ++i) {
+        auto token_iter = token_count.find(candidates->data[i].id);
        if (token_iter == token_count.end()) {
            continue;
        }

        int count = token_iter->second;
-        candidates[i].logit -= count * alpha_frequency + float(count > 0) * alpha_presence;
+        candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
    }

-    candidates_p->sorted = false;
+    candidates->sorted = false;

    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    }
 }

-/// @brief Mirostat 1.0 implementation.
-/// @param candidates  A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-/// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
-/// @param N The size of the vocabulary. This is used in the calculation of the `k` value.
-/// @param k A reference to the integer variable used to store the calculated top-k value. The top-k value determines how many of the most probable tokens are considered for sampling.
-/// @param mu A reference to the floating-point variable that represents the maximum cross-entropy value. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates_p, float tau, float eta, int m, float N, int * k, float * mu) {
+
+llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float N, int * k, float * mu) {
    assert(ctx);

    int64_t t_start_sample_us;
    t_start_sample_us = ggml_time_us();

-    // https://arxiv.org/abs/2007.14966
-    // Algorithm 1
-    std::span<llama_token_data> candidates(candidates_p->data, candidates_p->size);
-
-    // printf("llama_sample_mirostat: candidates.size() = %d, m = %d, N = %f, tau = %f, eta = %f, *k = %d, *mu = %f\n", candidates.size(), m, N, tau, eta, *k, *mu);
-
-    llama_sample_softmax(nullptr, candidates_p);
+    llama_sample_softmax(nullptr, candidates);

    // Estimate s_hat using the most probable m tokens
    float s_hat = 0.0;
    float sum_ti_bi = 0.0;
    float sum_ti_sq = 0.0;
-    for (size_t i = 0; i < size_t(m - 1) && i < candidates.size() - 1; ++i) {
+    for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
        float t_i = logf(float(i + 2) / float(i + 1));
-        float b_i = logf(candidates[i].p / candidates[i + 1].p);
+        float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
        sum_ti_bi += t_i * b_i;
        sum_ti_sq += t_i * t_i;
    }
@@ -1808,25 +1779,23 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_

    // Compute k from the estimated s_hat and target surprise value
    float epsilon_hat = s_hat - 1;
-    // printf("llama_sample_mirostat: s_hat = %f, epsilon_hat = %f, *mu = %f, N = %f\n", s_hat, epsilon_hat, *mu, N);
    float new_k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
-    // printf("llama_sample_mirostat: new_k = %f\n", new_k);
-    *k = int(std::min(new_k, float(candidates.size())));
+    *k = int(std::min(new_k, float(candidates->size)));

    // Sample the next word X using top-k sampling
    // printf("llama_sample_mirostat *k = %d\n", *k);
-    llama_sample_top_k(nullptr, candidates_p, *k);
+    llama_sample_top_k(nullptr, candidates, *k);
    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    }
-    llama_token X = llama_sample_token(ctx, candidates_p);
+    llama_token X = llama_sample_token(ctx, candidates);
    t_start_sample_us = ggml_time_us();

    // Compute error as the difference between observed surprise and target surprise value
-    size_t X_idx = std::distance(candidates.begin(), std::find_if(candidates.begin(), candidates.end(), [&](const llama_token_data & candidate) {
+    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
        return candidate.id == X;
    }));
-    float observed_surprise = -log2f(candidates[X_idx].p);
+    float observed_surprise = -log2f(candidates->data[X_idx].p);
    float e = observed_surprise - tau;

    // Update mu using the learning rate and error
@ -1839,37 +1808,33 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
    return X;
 }

-llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates_p, float tau, float eta, float * mu) {
+llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
    assert(ctx);
    int64_t t_start_sample_us;
    t_start_sample_us = ggml_time_us();

-    // https://arxiv.org/abs/2007.14966
-    // Algorithm 2
-    std::span<llama_token_data> candidates(candidates_p->data, candidates_p->size);
-
-    llama_sample_softmax(ctx, candidates_p);
+    llama_sample_softmax(ctx, candidates);

    // Truncate the words with surprise values greater than mu
-    candidates_p->size = std::distance(candidates.begin(), std::find_if(candidates.begin(), candidates.end(), [&](const llama_token_data & candidate) {
+    candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
        return -log2f(candidate.p) > *mu;
    }));

    // Normalize the probabilities of the remaining words
-    llama_sample_softmax(ctx, candidates_p);
+    llama_sample_softmax(ctx, candidates);

    // Sample the next word X from the remaining words
    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    }
-    llama_token X = llama_sample_token(ctx, candidates_p);
+    llama_token X = llama_sample_token(ctx, candidates);
    t_start_sample_us = ggml_time_us();

    // Compute error as the difference between observed surprise and target surprise value
-    size_t X_idx = std::distance(candidates.begin(), std::find_if(candidates.begin(), candidates.end(), [&](const llama_token_data & candidate) {
+    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
        return candidate.id == X;
    }));
-    float observed_surprise = -log2f(candidates[X_idx].p);
+    float observed_surprise = -log2f(candidates->data[X_idx].p);
    float e = observed_surprise - tau;

    // Update mu using the learning rate and error
@ -1881,12 +1846,11 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
    return X;
 }

-llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates_p) {
+llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
    const int64_t t_start_sample_us = ggml_time_us();

    // Find max element
-    std::span<llama_token_data> candidates(candidates_p->data, candidates_p->size);
-    auto max_iter = std::max_element(candidates.begin(), candidates.end(), [](const llama_token_data & a, const llama_token_data & b) {
+    auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
        return a.logit < b.logit;
    });

@ -1898,24 +1862,22 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
    return result;
 }

-llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates_p) {
+llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
    assert(ctx);
    const int64_t t_start_sample_us = ggml_time_us();
-    llama_sample_softmax(nullptr, candidates_p);
-
-    std::span<llama_token_data> candidates(candidates_p->data, candidates_p->size);
+    llama_sample_softmax(nullptr, candidates);

    std::vector<float> probs;
-    probs.reserve(candidates.size());
-    for (auto & candidate : candidates) {
-        probs.push_back(candidate.p);
+    probs.reserve(candidates->size);
+    for (size_t i = 0; i < candidates->size; ++i) {
+        probs.push_back(candidates->data[i].p);
    }

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    auto & rng = ctx->rng;
    int idx = dist(rng);

-    llama_token result = candidates[idx].id;
+    llama_token result = candidates->data[idx].id;

    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    ctx->n_sample++;
@ -2691,6 +2653,10 @@ llama_token llama_token_eos() {
    return 2;
 }

+llama_token llama_token_nl() { // accessor for a hard-coded special-token id (declared under "Special tokens" in llama.h)
+    return 13; // presumably the newline token id in the LLaMA vocabulary — TODO confirm against the tokenizer
+}
+

 void llama_print_timings(struct llama_context * ctx) {
    const int64_t t_end_us = ggml_time_us();
--- a/llama.h
+++ b/llama.h
@ -185,18 +185,38 @@ extern "C" {
    // Special tokens
    LLAMA_API llama_token llama_token_bos();
    LLAMA_API llama_token llama_token_eos();
+    LLAMA_API llama_token llama_token_nl();

    // Sampling functions
-    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates_p, llama_token * last_tokens_p, size_t last_tokens_size, float penalty);
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates_p, llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+
+    /// @brief Repetition penalty
+    /// @details Repetition penalty described in the CTRL paper (https://arxiv.org/pdf/1909.05858.pdf), with a fix for negative logits.
+    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty);
+    /// @brief Frequency and presence repetition penalties
+    /// @details Frequency and presence penalties as described in the OpenAI API documentation: https://platform.openai.com/docs/api-reference/parameter-details
+    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);

    LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+
+    /// @brief Tail Free Sampling https://www.trentonbricken.com/Tail-Free-Sampling/
    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
+
+    /// @brief Locally Typical Sampling https://arxiv.org/pdf/2202.00666.pdf
    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
    LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);

+    /// @brief Mirostat implementation.
+    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+    /// @param ctx The llama context.
+    /// @param candidates A pointer to a `llama_token_data_array` holding the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+    /// @param N The size of the vocabulary. This is used in the calculation of the `k` value.
+    /// @param k A pointer to the integer variable used to store the calculated top-k value. The top-k value determines how many of the most probable tokens are considered for sampling.
+    /// @param mu A pointer to the floating-point variable that represents the maximum cross-entropy value. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
    LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float N, int * k, float * mu);
    LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
    LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);