BERT tokenizer fixes (#6498)
Key changes: * BERT conversion: fix abuse of LlamaHfVocab, do not set BOS or EOS * Nomic Embed conversion: pad vocab instead of slicing embedding tensor * llama_tokenize: handle added special tokens like HF does
This commit is contained in:
parent
c4a3a4ff47
commit
1b67731e18
20 changed files with 221 additions and 194 deletions
|
@ -239,6 +239,7 @@ int main(int argc, char ** argv) {
|
|||
LOG_TEE("%s\n", get_system_info(params).c_str());
|
||||
}
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||
LOG("add_bos: %d\n", add_bos);
|
||||
|
||||
bool suff_rm_leading_spc = params.escape;
|
||||
|
@ -279,10 +280,10 @@ int main(int argc, char ** argv) {
|
|||
if (ctx_guidance) {
|
||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
||||
|
||||
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
|
||||
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
|
||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
||||
|
||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
||||
|
||||
original_prompt_len = original_inp.size();
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue