BERT tokenizer fixes (#6498)
Key changes: * BERT conversion: fix abuse of LlamaHfVocab, do not set BOS or EOS * Nomic Embed conversion: pad vocab instead of slicing embedding tensor * llama_tokenize: handle added special tokens like HF does
This commit is contained in:
parent
c4a3a4ff47
commit
1b67731e18
20 changed files with 221 additions and 194 deletions
|
@ -689,6 +689,7 @@ struct server_context {
|
|||
n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
add_bos_token = llama_should_add_bos_token(model);
|
||||
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -758,7 +759,7 @@ struct server_context {
|
|||
metrics.init();
|
||||
}
|
||||
|
||||
std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const {
|
||||
std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const {
|
||||
// TODO: currently, we tokenize using special tokens by default
|
||||
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
|
||||
// but it's better compared to completely ignoring ChatML and other chat templates
|
||||
|
@ -776,7 +777,7 @@ struct server_context {
|
|||
|
||||
std::vector<llama_token> p;
|
||||
if (first) {
|
||||
p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
|
||||
first = false;
|
||||
} else {
|
||||
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
|
||||
|
@ -793,7 +794,7 @@ struct server_context {
|
|||
}
|
||||
} else {
|
||||
auto s = json_prompt.template get<std::string>();
|
||||
prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
|
||||
}
|
||||
|
||||
return prompt_tokens;
|
||||
|
@ -1058,7 +1059,7 @@ struct server_context {
|
|||
system_tokens.clear();
|
||||
|
||||
if (!system_prompt.empty()) {
|
||||
system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
|
||||
system_tokens = ::llama_tokenize(ctx, system_prompt, true);
|
||||
|
||||
llama_batch_clear(batch);
|
||||
|
||||
|
@ -1914,7 +1915,7 @@ struct server_context {
|
|||
prefix_tokens.push_back(llama_token_middle(model));
|
||||
prompt_tokens = prefix_tokens;
|
||||
} else {
|
||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
|
||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
|
||||
}
|
||||
|
||||
slot.n_past = 0;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue