Respect tokenizer.ggml.add_bos_token value when tokenizing (#4040)

* gguf-py: gguf-dump: Respect --no-tensor flag in JSON mode.

* Respect add_bos_token GGUF metadata value

* gguf-py: Try to fix SpecialVocab giving up too easily for the Nth time
Kerfuffle 2023-11-16 19:14:37 -07:00 committed by GitHub
parent 8da46278e1
commit 91f6499393
12 changed files with 85 additions and 29 deletions
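
For context, a minimal usage sketch of the pattern the hunks below converge on. This is not code from the commit; it only reuses calls that appear in the diff (llama_should_add_bos_token, llama_get_model, and the ::llama_tokenize overload used in the server hunk):

#include "common.h"
#include "llama.h"

// Hedged sketch, not part of this change: ask the model whether a BOS token
// should be prepended (per tokenizer.ggml.add_bos_token) instead of
// hard-coding the old "is this an SPM vocab?" check, then tokenize with it.
static std::vector<llama_token> tokenize_prompt(llama_context * ctx, const std::string & text) {
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
    return ::llama_tokenize(ctx, text, add_bos);
}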

@@ -230,7 +230,7 @@ int main(int argc, char ** argv) {
LOG_TEE("\n");
LOG_TEE("%s\n", get_system_info(params).c_str());
}
-const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
+const bool add_bos = llama_should_add_bos_token(model);
LOG("add_bos: %d\n", add_bos);
bool suff_rm_leading_spc = params.escape;
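
The removed line above is the heuristic this commit retires: BOS was added purely because the vocabulary was SPM. As a rough sketch of the semantics the new helper is meant to have (an assumption, not the commit's implementation; get_add_bos_metadata is a hypothetical accessor standing in for "read tokenizer.ggml.add_bos_token, or -1 if the model file lacks the key"):

#include "llama.h"

// Assumed semantics only; get_add_bos_metadata() is hypothetical.
static bool should_add_bos(const struct llama_model * model) {
    const int add_bos = get_add_bos_metadata(model); // hypothetical: -1 means key absent
    if (add_bos != -1) {
        return add_bos != 0;   // explicit GGUF metadata wins
    }
    // otherwise fall back to the old behaviour: only SPM vocabularies get a BOS
    return llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
}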


@@ -208,9 +208,10 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
int n_past = 0;
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
+const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
// llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
-eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, true);
+eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);

@@ -229,7 +229,7 @@ int main(int argc, char ** argv) {
}
}
-const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
+const bool add_bos = llama_should_add_bos_token(model);
LOG("add_bos: %d\n", add_bos);
std::vector<llama_token> embd_inp;

@@ -149,8 +149,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
-const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
-const bool add_bos = is_spm;
+const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
@@ -288,8 +287,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
-const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
-const bool add_bos = is_spm;
+const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
auto tim1 = std::chrono::high_resolution_clock::now();
@@ -481,7 +479,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
// This is needed as usual for LLaMA models
-const bool add_bos = is_spm;
+const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
// Number of tasks to use when computing the score
if ( params.hellaswag_tasks < hs_task_count ) {

@@ -501,6 +501,7 @@ struct llama_server_context
bool multimodal = false;
bool clean_kv_cache = true;
bool all_slots_are_idle = false;
+bool add_bos_token = true;
int32_t id_gen;
int32_t n_ctx; // total context for all clients / slots
@@ -573,6 +574,8 @@ struct llama_server_context
n_ctx = llama_n_ctx(ctx);
+add_bos_token = llama_should_add_bos_token(model);
return true;
}
@@ -864,7 +867,7 @@ struct llama_server_context
}
void update_system_prompt() {
-system_tokens = ::llama_tokenize(ctx, system_prompt, true);
+system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
llama_batch_clear(batch);
@@ -1552,7 +1555,7 @@ struct llama_server_context
}
else
{
-prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
+prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
}
slot.num_prompt_tokens = prompt_tokens.size();
@@ -1629,7 +1632,7 @@ struct llama_server_context
const bool has_images = process_images(slot);
// process the prefix of first image
-std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, true) : prompt_tokens;
+std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
{
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false);
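
Taken together, the server hunks route every tokenization site through the new add_bos_token member: the system prompt carries the BOS when the metadata asks for one, and a slot prompt only carries it when no system prompt is set. A self-contained illustration of that policy (hedged: tokenize_slot_prompt and its parameters are illustrative names; only the llama_* calls come from the diff):

#include "common.h"
#include "llama.h"

// Illustrative sketch of the server's BOS policy after this change; not an
// excerpt from server.cpp.
static std::vector<llama_token> tokenize_slot_prompt(
        llama_context * ctx,
        const std::string & system_prompt,
        const std::string & slot_prompt) {
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
    if (!system_prompt.empty()) {
        // BOS (if the metadata requests one) goes on the system prompt only.
        std::vector<llama_token> tokens = ::llama_tokenize(ctx, system_prompt, add_bos);
        std::vector<llama_token> prompt = ::llama_tokenize(ctx, slot_prompt, false);
        tokens.insert(tokens.end(), prompt.begin(), prompt.end());
        return tokens;
    }
    // No system prompt: the slot prompt gets the BOS, again only if the
    // metadata asks for it.
    return ::llama_tokenize(ctx, slot_prompt, add_bos);
}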