tokenize: fix double BOS token

Johannes Gäßler 2024-05-07 11:12:53 +02:00
parent 858f6b73f6
commit d3286d6eca
21 changed files with 78 additions and 58 deletions
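
In short: the common-library llama_tokenize wrappers gain a fix_double_bos flag, and a new helper llama_fix_double_bos() drops the leading token when a tokenized prompt starts with two BOS tokens. A minimal sketch of the intended behaviour (illustrative only, not part of the diff; the prompt string and variable name are assumptions, and the model is assumed to prepend BOS when add_special is true):

    // Hypothetical example: the prompt text already spells out the BOS token,
    // so add_special == true would otherwise leave two BOS tokens at the front.
    std::vector<llama_token> toks = ::llama_tokenize(model, "<s>Hello world",
            /*add_special=*/true, /*parse_special=*/true, /*fix_double_bos=*/true);
    // With fix_double_bos == true the duplicate is stripped, leaving a single leading BOS.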

View file

@@ -2343,15 +2343,17 @@ std::vector<llama_token> llama_tokenize(
         const struct llama_context * ctx,
         const std::string & text,
         bool add_special,
-        bool parse_special) {
-    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+        bool parse_special,
+        bool fix_double_bos) {
+    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special, fix_double_bos);
 }
 
 std::vector<llama_token> llama_tokenize(
         const struct llama_model * model,
         const std::string & text,
         bool add_special,
-        bool parse_special) {
+        bool parse_special,
+        bool fix_double_bos) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
@@ -2363,9 +2365,19 @@ std::vector<llama_token> llama_tokenize(
     } else {
         result.resize(n_tokens);
     }
+    if (fix_double_bos) {
+        llama_fix_double_bos(model, result);
+    }
     return result;
 }
 
+void llama_fix_double_bos(const struct llama_model * model, std::vector<llama_token> & prompt) {
+    const llama_token bos = llama_token_bos(model);
+    if (prompt.size() >= 2 && prompt[0] == bos && prompt[1] == bos) {
+        prompt.erase(prompt.begin(), prompt.begin() + 1);
+    }
+}
+
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
     const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);

View file

@@ -238,13 +238,18 @@ std::vector<llama_token> llama_tokenize(
         const struct llama_context * ctx,
         const std::string & text,
         bool add_special,
-        bool parse_special = false);
+        bool parse_special = false,
+        bool fix_double_bos = false);
 
 std::vector<llama_token> llama_tokenize(
         const struct llama_model * model,
         const std::string & text,
         bool add_special,
-        bool parse_special = false);
+        bool parse_special = false,
+        bool fix_double_bos = false);
+
+// if the first and the second token in the prompt are both BOS, remove the first token
+void llama_fix_double_bos(const struct llama_model * model, std::vector<llama_token> & prompt);
 
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
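
For callers that tokenize without the new flag, a hedged usage sketch of the helper declared above (ctx and text are assumed to be a valid llama_context and an std::string; this is not code from the commit):

    const llama_model * model = llama_get_model(ctx);   // model that owns the vocabulary / BOS id
    std::vector<llama_token> prompt = ::llama_tokenize(ctx, text, /*add_special=*/true, /*parse_special=*/true);
    llama_fix_double_bos(model, prompt);                 // erases prompt[0] if both prompt[0] and prompt[1] are BOS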

View file

@@ -71,7 +71,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(model, params.prompt, true);
+    tokens_list = ::llama_tokenize(model, params.prompt, true, true, true);
     const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;

View file

@@ -137,7 +137,7 @@ int main(int argc, char ** argv)
     // Tokenize the prompt :
     //---------------------------------
-    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true, true, true);
     const size_t max_context_size = llama_n_ctx( ctx );
     const size_t max_tokens_list_size = max_context_size - 4 ;

View file

@@ -114,7 +114,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true, false);
+        auto inp = ::llama_tokenize(ctx, prompt, true, false, true);
         if (inp.size() > n_batch) {
             fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);

View file

@@ -378,7 +378,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     auto tim1 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true, true, true);
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

View file

@@ -248,8 +248,8 @@ int main(int argc, char ** argv) {
         suff_rm_leading_spc = false;
     }
     std::vector<llama_token> embd_inp;
-    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true, false);
+    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
     const int space_token = 29871;
     if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
         inp_sfx.erase(inp_sfx.begin());
@@ -280,10 +280,10 @@ int main(int argc, char ** argv) {
     if (ctx_guidance) {
         LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true, true);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
         LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
         original_prompt_len = original_inp.size();
@@ -630,8 +630,8 @@ int main(int argc, char ** argv) {
                 suff_rm_leading_spc = false;
             }
             // tokenize new prefix and suffix
-            std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
-            std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+            std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true, false);
+            std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
             if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
                 inp_sfx.erase(inp_sfx.begin());
             }
@@ -703,7 +703,7 @@ int main(int argc, char ** argv) {
             const size_t original_size = embd_inp.size();
-            const auto line_inp = ::llama_tokenize(ctx, buffer, false);
+            const auto line_inp = ::llama_tokenize(ctx, buffer, false, true, false);
             LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
             embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

View file

@@ -35,7 +35,7 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
 static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
     std::string str2 = str;
-    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true, add_bos);
     eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
     return true;
 }
@@ -156,14 +156,14 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         user_prompt = prompt.substr(image_pos + std::string("<image>").length());
         LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
         LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
@@ -173,7 +173,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
         user_prompt = prompt + "\nASSISTANT:";
         if (params->verbose_prompt) {
-            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
                 LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }

View file

@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> inp;
     std::vector<llama_token> all;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
     all = inp;
     const int max_context_size = llama_n_ctx(ctx);

View file

@@ -29,7 +29,7 @@ int main(int argc, char ** argv){
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
     fprintf(stderr, "%s: tokenization done\n", __func__);

View file

@@ -34,7 +34,7 @@ int main(int argc, char ** argv){
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
     llama_ngram_cache ngram_cache_context;
     llama_ngram_cache ngram_cache_dynamic;

View file

@@ -42,7 +42,7 @@ int main(int argc, char ** argv){
     // tokenize the prompt
    std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
     llama_ngram_cache ngram_cache_context;
     llama_ngram_cache ngram_cache_dynamic;

View file

@@ -255,7 +255,7 @@ int main(int argc, char ** argv) {
         if (params.chatml) {
             params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
         }
-        embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
+        embd_inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
     } else {
         LOG("use session tokens\n");
         embd_inp = session_tokens;
@@ -277,10 +277,10 @@ int main(int argc, char ** argv) {
     if (ctx_guidance) {
         LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true, true);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true, true);
         LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
         original_prompt_len = original_inp.size();
@@ -339,15 +339,15 @@ int main(int argc, char ** argv) {
     }
     // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true, false);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true, false);
     LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
     LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
     // chatml prefix & suffix
-    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
-    const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
+    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true, false);
+    const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true, false);
     LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
     LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
@@ -418,7 +418,7 @@ int main(int argc, char ** argv) {
         for (const auto & antiprompt : params.antiprompt) {
             LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
             if (params.verbose_prompt) {
-                auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
+                auto tmp = ::llama_tokenize(ctx, antiprompt, false, true, false);
                 for (int i = 0; i < (int) tmp.size(); i++) {
                     LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                 }
@@ -433,7 +433,7 @@ int main(int argc, char ** argv) {
         if (!params.input_prefix.empty()) {
             LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
             if (params.verbose_prompt) {
-                auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
+                auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true, true);
                 for (int i = 0; i < (int) tmp.size(); i++) {
                     LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                 }
@@ -443,7 +443,7 @@ int main(int argc, char ** argv) {
        if (!params.input_suffix.empty()) {
            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
            if (params.verbose_prompt) {
-                auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
+                auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
                for (int i = 0; i < (int) tmp.size(); i++) {
                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                }
@@ -516,7 +516,7 @@ int main(int argc, char ** argv) {
     antiprompt_ids.reserve(params.antiprompt.size());
     for (const std::string & antiprompt : params.antiprompt) {
-        antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
+        antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true, false));
     }
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
@@ -801,7 +801,7 @@ int main(int argc, char ** argv) {
     if (params.interactive) {
         if (!params.antiprompt.empty()) {
             // tokenize and inject first reverse prompt
-            const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
+            const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true, false);
             embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
             is_antiprompt = true;
         }
@@ -875,9 +875,9 @@ int main(int argc, char ** argv) {
                 process_escapes(buffer);
             }
-            const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-            const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
-            const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
+            const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true, false);
+            const auto line_inp = ::llama_tokenize(ctx, buffer, false, false, false);
+            const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true, false);
             LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

View file

@@ -164,7 +164,7 @@ int main(int argc, char ** argv) {
     }
     std::vector<llama_token> tokens_system;
-    tokens_system = ::llama_tokenize(ctx, k_system, true);
+    tokens_system = ::llama_tokenize(ctx, k_system, true, true, true);
     const int32_t n_tokens_system = tokens_system.size();
     llama_seq_id g_seq_id = 0;
@@ -256,7 +256,7 @@ int main(int argc, char ** argv) {
             // do not prepend BOS because we have a system prompt!
             std::vector<llama_token> tokens_prompt;
-            tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
+            tokens_prompt = ::llama_tokenize(ctx, client.prompt, false, true, false);
             for (size_t i = 0; i < tokens_prompt.size(); ++i) {
                 llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);

View file

@@ -108,10 +108,10 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true, true, true);
     // tokenize the prefix and use it as a sink
-    const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size();
+    const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true, true, true).size();
     const int n_tokens_all = tokens_list.size();

View file

@@ -345,7 +345,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true, true, true);
     const int n_ctx = llama_n_ctx(ctx);
@@ -498,7 +498,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     auto tim1 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true, true, true);
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -843,7 +843,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
         for (size_t j = 0; j < 4; j++) {
             hs_cur.ending[j] = prompt_lines[idx*6+2+j];
-            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
+            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true, true, true);
         }
         // determine the common prefix of the endings
@@ -1136,8 +1136,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
     fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
     for (auto & task : data) {
-        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
-        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);
+        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true, true, true);
+        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true, true, true);
         task.common_prefix = 0;
         for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
@@ -1152,8 +1152,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             task.seq_tokens[0].size() - task.common_prefix +
             task.seq_tokens[1].size() - task.common_prefix;
-        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
-        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
+        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true, true, true).size();
+        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true, true, true).size();
     }
     fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
@@ -1359,7 +1359,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
             }
             return false;
         }
-        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
+        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true, true, true));
     }
     auto min_len = task.seq_tokens.front().size();
     for (auto& seq : task.seq_tokens) {

View file

@@ -37,7 +37,7 @@ int main(int argc, char ** argv) {
     }
     // tokenize prompt
-    auto tokens = llama_tokenize(ctx, params.prompt, true);
+    auto tokens = llama_tokenize(ctx, params.prompt, true, true, true);
     // evaluate prompt
     llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));

View file

@@ -765,6 +765,9 @@ struct server_context {
         // but it's better compared to completely ignoring ChatML and other chat templates
         const bool TMP_FORCE_SPECIAL = true;
+        // If special tokens are added, also make sure that this doesn't cause 2 BOS tokens if the user also adds one:
+        const bool fix_double_bos = add_special;
         // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;
@@ -777,7 +780,7 @@ struct server_context {
                 std::vector<llama_token> p;
                 if (first) {
-                    p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+                    p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL, fix_double_bos);
                     first = false;
                 } else {
                     p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
@@ -794,7 +797,7 @@ struct server_context {
             }
         } else {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL, fix_double_bos);
         }
         return prompt_tokens;
@@ -1058,7 +1061,7 @@ struct server_context {
         system_tokens.clear();
         if (!system_prompt.empty()) {
-            system_tokens = ::llama_tokenize(ctx, system_prompt, true);
+            system_tokens = ::llama_tokenize(ctx, system_prompt, true, false, true);
             llama_batch_clear(batch);

View file

@@ -66,7 +66,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true, true, true);
     const int n_ctx = llama_n_ctx(ctx);
     const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());

View file

@@ -128,7 +128,7 @@ int main(int argc, char ** argv) {
     // Tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true);
+    inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true, true);
     const int max_context_size = llama_n_ctx(ctx_tgt);
     const int max_tokens_list_size = max_context_size - 4;

View file

@@ -28,7 +28,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> tokens;
-    tokens = ::llama_tokenize(model, prompt, true, true);
+    tokens = ::llama_tokenize(model, prompt, true, true, true);
     for (int i = 0; i < (int) tokens.size(); i++) {
         if (printing_ids) {