From 1d2f3ad471e7682816bb741eeb6d635d559fbcbd Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Sat, 25 May 2024 21:30:26 +0200 Subject: [PATCH] Better name functions to append token/bos/eos --- llama.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/llama.cpp b/llama.cpp index bb5cd9fe2..95992de50 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12365,7 +12365,11 @@ struct llm_tokenizer_bpe { } } - bool add_special_bos(std::vector & output) const { + void append(const llama_vocab::id token_id, std::vector & output) const { + output.push_back(token_id); + } + + bool append_bos(std::vector & output) const { if (special_add_bos) { GGML_ASSERT(vocab.special_bos_id != -1); output.push_back(vocab.special_bos_id); @@ -12374,7 +12378,7 @@ struct llm_tokenizer_bpe { return false; } - bool add_special_eos(std::vector & output) const { + bool append_eos(std::vector & output) const { if (special_add_eos) { GGML_ASSERT(vocab.special_eos_id != -1); output.push_back(vocab.special_eos_id); @@ -12383,7 +12387,7 @@ struct llm_tokenizer_bpe { return false; } - void check_add_special(const std::vector & output) const { + void check_double_bos_eos(const std::vector & output) const { if (special_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) { LLAMA_LOG_WARN( "%s: Added a BOS token to the prompt as specified by the model but the prompt " @@ -12890,7 +12894,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & llm_tokenizer_bpe tokenizer(vocab); if (add_special) { - tokenizer.add_special_bos(output); + tokenizer.append_bos(output); } for (const auto & fragment : fragment_buffer) { @@ -12902,13 +12906,13 @@ static std::vector llama_tokenize_internal(const llama_vocab & #endif tokenizer.tokenize(raw_text, output); } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) - output.push_back(fragment.token); + tokenizer.append(fragment.token, output); } } if (add_special) { - tokenizer.add_special_eos(output); - tokenizer.check_add_special(output); + tokenizer.append_eos(output); + tokenizer.check_double_bos_eos(output); } } break; case LLAMA_VOCAB_TYPE_WPM: