Merge d3286d6eca into d11afd6652
Commit 0db0192200
21 changed files with 78 additions and 58 deletions
@@ -2350,15 +2350,17 @@ std::vector<llama_token> llama_tokenize(
         const struct llama_context * ctx,
         const std::string & text,
         bool add_special,
-        bool parse_special) {
-    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+        bool parse_special,
+        bool fix_double_bos) {
+    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special, fix_double_bos);
 }

 std::vector<llama_token> llama_tokenize(
         const struct llama_model * model,
         const std::string & text,
         bool add_special,
-        bool parse_special) {
+        bool parse_special,
+        bool fix_double_bos) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
@@ -2370,9 +2372,19 @@ std::vector<llama_token> llama_tokenize(
     } else {
         result.resize(n_tokens);
     }
+    if (fix_double_bos) {
+        llama_fix_double_bos(model, result);
+    }
     return result;
 }

+void llama_fix_double_bos(const struct llama_model * model, std::vector<llama_token> & prompt) {
+    const llama_token bos = llama_token_bos(model);
+    if (prompt.size() >= 2 && prompt[0] == bos && prompt[1] == bos) {
+        prompt.erase(prompt.begin(), prompt.begin() + 1);
+    }
+}
+
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
     const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
@@ -239,13 +239,18 @@ std::vector<llama_token> llama_tokenize(
         const struct llama_context * ctx,
         const std::string & text,
         bool add_special,
-        bool parse_special = false);
+        bool parse_special = false,
+        bool fix_double_bos = false);

 std::vector<llama_token> llama_tokenize(
         const struct llama_model * model,
         const std::string & text,
         bool add_special,
-        bool parse_special = false);
+        bool parse_special = false,
+        bool fix_double_bos = false);
+
+// if the first and the second token in the prompt are both BOS, remove the first token
+void llama_fix_double_bos(const struct llama_model * model, std::vector<llama_token> & prompt);

 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
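Note: the following is a minimal usage sketch of the new parameter and helper, not part of the patch. The prompt string and variable names are illustrative; only the two functions declared above are assumed.

    // Sketch only: ctx is assumed to come from the usual llama.cpp setup calls.
    // "<s>Hello" is a hypothetical prompt whose text already spells out the BOS piece.
    std::vector<llama_token> toks = ::llama_tokenize(
        ctx, "<s>Hello", /*add_special=*/true, /*parse_special=*/true, /*fix_double_bos=*/true);
    // With fix_double_bos == false the result would start with two BOS tokens:
    // one prepended because of add_special and one parsed from the literal "<s>" in the text.

    // The helper can also be called directly on an existing token list:
    std::vector<llama_token> prompt_tokens = toks;               // hypothetical token vector
    llama_fix_double_bos(llama_get_model(ctx), prompt_tokens);   // drops the first of two leading BOS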
@@ -71,7 +71,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompt

     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(model, params.prompt, true);
+    tokens_list = ::llama_tokenize(model, params.prompt, true, true, true);

     const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;

@@ -137,7 +137,7 @@ int main(int argc, char ** argv)
     // Tokenize the prompt :
     //---------------------------------

-    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true, true, true);

     const size_t max_context_size = llama_n_ctx( ctx );
     const size_t max_tokens_list_size = max_context_size - 4 ;
@@ -114,7 +114,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true, false);
+        auto inp = ::llama_tokenize(ctx, prompt, true, false, true);
         if (inp.size() > n_batch) {
             fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
@@ -400,7 +400,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     auto tim1 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true, true, true);

     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -248,8 +248,8 @@ int main(int argc, char ** argv) {
         suff_rm_leading_spc = false;
     }
     std::vector<llama_token> embd_inp;
-    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true, false);
+    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
     const int space_token = 29871;
     if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
         inp_sfx.erase(inp_sfx.begin());
@@ -280,10 +280,10 @@ int main(int argc, char ** argv) {
     if (ctx_guidance) {
         LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));

-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true, true);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());

-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
         LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());

         original_prompt_len = original_inp.size();
@@ -630,8 +630,8 @@ int main(int argc, char ** argv) {
                 suff_rm_leading_spc = false;
             }
             // tokenize new prefix and suffix
-            std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-            std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+            std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true, false);
+            std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
             if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
                 inp_sfx.erase(inp_sfx.begin());
             }
@@ -703,7 +703,7 @@ int main(int argc, char ** argv) {

             const size_t original_size = embd_inp.size();

-            const auto line_inp = ::llama_tokenize(ctx, buffer, false);
+            const auto line_inp = ::llama_tokenize(ctx, buffer, false, true, false);
             LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

             embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
@@ -35,7 +35,7 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {

 static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
     std::string str2 = str;
-    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true, add_bos);
     eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
     return true;
 }
@@ -156,14 +156,14 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         user_prompt = prompt.substr(image_pos + std::string("<image>").length());
         LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
         LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
@@ -173,7 +173,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
         user_prompt = prompt + "\nASSISTANT:";
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> inp;
     std::vector<llama_token> all;

-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
     all = inp;

     const int max_context_size = llama_n_ctx(ctx);
@@ -29,7 +29,7 @@ int main(int argc, char ** argv){

     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
     fprintf(stderr, "%s: tokenization done\n", __func__);


@@ -34,7 +34,7 @@ int main(int argc, char ** argv){

     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);

     llama_ngram_cache ngram_cache_context;
     llama_ngram_cache ngram_cache_dynamic;
@@ -42,7 +42,7 @@ int main(int argc, char ** argv){

     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);

     llama_ngram_cache ngram_cache_context;
     llama_ngram_cache ngram_cache_dynamic;
@@ -255,7 +255,7 @@ int main(int argc, char ** argv) {
             if (params.chatml) {
                 params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
             }
-            embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
+            embd_inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
         } else {
             LOG("use session tokens\n");
             embd_inp = session_tokens;
@@ -277,10 +277,10 @@ int main(int argc, char ** argv) {
     if (ctx_guidance) {
         LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));

-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true, true);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());

-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
         LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());

         original_prompt_len = original_inp.size();
@@ -339,15 +339,15 @@ int main(int argc, char ** argv) {
     }

     // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true, false);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true, false);

     LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
     LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());

     // chatml prefix & suffix
-    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
-    const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
+    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true, false);
+    const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true, false);

     LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
     LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
@@ -421,7 +421,7 @@ int main(int argc, char ** argv) {
         for (const auto & antiprompt : params.antiprompt) {
             LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
             if (params.verbose_prompt) {
-                auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
+                auto tmp = ::llama_tokenize(ctx, antiprompt, false, true, false);
                 for (int i = 0; i < (int) tmp.size(); i++) {
                     LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                 }
@@ -436,7 +436,7 @@ int main(int argc, char ** argv) {
     if (!params.input_prefix.empty()) {
         LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
         if (params.verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
+            auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
             }
@@ -446,7 +446,7 @@ int main(int argc, char ** argv) {
     if (!params.input_suffix.empty()) {
         LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
         if (params.verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
+            auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
             for (int i = 0; i < (int) tmp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
             }
@@ -519,7 +519,7 @@ int main(int argc, char ** argv) {

     antiprompt_ids.reserve(params.antiprompt.size());
     for (const std::string & antiprompt : params.antiprompt) {
-        antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
+        antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true, false));
     }

     struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
@@ -804,7 +804,7 @@ int main(int argc, char ** argv) {
     if (params.interactive) {
         if (!params.antiprompt.empty()) {
             // tokenize and inject first reverse prompt
-            const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
+            const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true, false);
             embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
             is_antiprompt = true;
         }
@@ -878,9 +878,9 @@ int main(int argc, char ** argv) {
                     process_escapes(buffer);
                 }

-                const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
-                const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
+                const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true, false);
+                const auto line_inp = ::llama_tokenize(ctx, buffer, false, false, false);
+                const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true, false);

                 LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

@@ -164,7 +164,7 @@ int main(int argc, char ** argv) {
     }

     std::vector<llama_token> tokens_system;
-    tokens_system = ::llama_tokenize(ctx, k_system, true);
+    tokens_system = ::llama_tokenize(ctx, k_system, true, true, true);
     const int32_t n_tokens_system = tokens_system.size();

     llama_seq_id g_seq_id = 0;
@@ -256,7 +256,7 @@ int main(int argc, char ** argv) {

             // do not prepend BOS because we have a system prompt!
             std::vector<llama_token> tokens_prompt;
-            tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
+            tokens_prompt = ::llama_tokenize(ctx, client.prompt, false, true, false);

             for (size_t i = 0; i < tokens_prompt.size(); ++i) {
                 llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
@@ -108,10 +108,10 @@ int main(int argc, char ** argv) {

     // tokenize the prompt
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true, true, true);

     // tokenize the prefix and use it as a sink
-    const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size();
+    const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true, true, true).size();

     const int n_tokens_all = tokens_list.size();

@@ -345,7 +345,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &

     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true, true, true);

     const int n_ctx = llama_n_ctx(ctx);

@@ -498,7 +498,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     auto tim1 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true, true, true);

     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -843,7 +843,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
         for (size_t j = 0; j < 4; j++) {
             hs_cur.ending[j] = prompt_lines[idx*6+2+j];
-            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
+            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true, true, true);
         }

         // determine the common prefix of the endings
@@ -1136,8 +1136,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
     fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);

     for (auto & task : data) {
-        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
-        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);
+        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true, true, true);
+        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true, true, true);

         task.common_prefix = 0;
         for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
@@ -1152,8 +1152,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             task.seq_tokens[0].size() - task.common_prefix +
             task.seq_tokens[1].size() - task.common_prefix;

-        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
-        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
+        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true, true, true).size();
+        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true, true, true).size();
     }

     fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
@@ -1359,7 +1359,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
            }
            return false;
        }
-       task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
+       task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true, true, true));
    }
    auto min_len = task.seq_tokens.front().size();
    for (auto& seq : task.seq_tokens) {
@@ -37,7 +37,7 @@ int main(int argc, char ** argv) {
     }

     // tokenize prompt
-    auto tokens = llama_tokenize(ctx, params.prompt, true);
+    auto tokens = llama_tokenize(ctx, params.prompt, true, true, true);

     // evaluate prompt
     llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
@@ -767,6 +767,9 @@ struct server_context {
         // but it's better compared to completely ignoring ChatML and other chat templates
         const bool TMP_FORCE_SPECIAL = true;

+        // If special tokens are added, also make sure that this doesn't cause 2 BOS tokens if the user also adds one:
+        const bool fix_double_bos = add_special;
+
         // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;
@@ -779,7 +782,7 @@ struct server_context {

                 std::vector<llama_token> p;
                 if (first) {
-                    p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+                    p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL, fix_double_bos);
                     first = false;
                 } else {
                     p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
@@ -796,7 +799,7 @@ struct server_context {
             }
         } else {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL, fix_double_bos);
         }

         return prompt_tokens;
@@ -1060,7 +1063,7 @@ struct server_context {
         system_tokens.clear();

         if (!system_prompt.empty()) {
-            system_tokens = ::llama_tokenize(ctx, system_prompt, true);
+            system_tokens = ::llama_tokenize(ctx, system_prompt, true, false, true);

             llama_batch_clear(batch);

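Note: a small sketch, not part of the patch, of the interaction the server comment above describes. It assumes the client's prompt text can itself contain the BOS piece (for example via a chat template); all values and the prompt string are illustrative.

    // fix_double_bos is tied to add_special: a duplicate BOS can only appear when the
    // tokenizer prepends BOS on top of one already spelled out in the text.
    const bool add_special    = true;           // tokenizer may prepend BOS
    const bool fix_double_bos = add_special;
    // Hypothetical client prompt that already embeds the BOS piece:
    std::string s = "<s><|im_start|>system\nYou are a helpful assistant.<|im_end|>";
    std::vector<llama_token> p = ::llama_tokenize(ctx, s, add_special, /*parse_special=*/true, fix_double_bos);
    // Without the fix, p would begin [BOS, BOS, ...]; with it, the duplicate is removed.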
@@ -66,7 +66,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompt

     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true, true, true);

     const int n_ctx = llama_n_ctx(ctx);
     const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
@@ -128,7 +128,7 @@ int main(int argc, char ** argv) {

     // Tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true, true);

     const int max_context_size = llama_n_ctx(ctx_tgt);
     const int max_tokens_list_size = max_context_size - 4;
@@ -28,7 +28,7 @@ int main(int argc, char ** argv) {

     std::vector<llama_token> tokens;

-    tokens = ::llama_tokenize(model, prompt, true, true);
+    tokens = ::llama_tokenize(model, prompt, true, true, true);

     for (int i = 0; i < (int) tokens.size(); i++) {
         if (printing_ids) {