Respect tokenizer.ggml.add_bos_token value when tokenizing (#4040)
* gguf-py: gguf-dump: Respect --no-tensor flag in JSON mode. * Respect add_bos_token GGUF metadata value * gguf-py: Try to fix SpecialVocab giving up too easily for the Nth time
This commit is contained in:
parent
8da46278e1
commit
91f6499393
12 changed files with 85 additions and 29 deletions
|
@ -501,6 +501,7 @@ struct llama_server_context
|
|||
bool multimodal = false;
|
||||
bool clean_kv_cache = true;
|
||||
bool all_slots_are_idle = false;
|
||||
bool add_bos_token = true;
|
||||
|
||||
int32_t id_gen;
|
||||
int32_t n_ctx; // total context for all clients / slots
|
||||
|
@ -573,6 +574,8 @@ struct llama_server_context
|
|||
|
||||
n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
add_bos_token = llama_should_add_bos_token(model);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -864,7 +867,7 @@ struct llama_server_context
|
|||
}
|
||||
|
||||
void update_system_prompt() {
|
||||
system_tokens = ::llama_tokenize(ctx, system_prompt, true);
|
||||
system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
|
||||
|
||||
llama_batch_clear(batch);
|
||||
|
||||
|
@ -1552,7 +1555,7 @@ struct llama_server_context
|
|||
}
|
||||
else
|
||||
{
|
||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
|
||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
|
||||
}
|
||||
|
||||
slot.num_prompt_tokens = prompt_tokens.size();
|
||||
|
@ -1629,7 +1632,7 @@ struct llama_server_context
|
|||
const bool has_images = process_images(slot);
|
||||
|
||||
// process the prefix of first image
|
||||
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, true) : prompt_tokens;
|
||||
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
|
||||
for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
|
||||
{
|
||||
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue