Use better names for the functions that append token/BOS/EOS
This commit is contained in:
parent
51e933a962
commit
1d2f3ad471
1 changed file with 11 additions and 7 deletions
18
llama.cpp
18
llama.cpp
|
@ -12365,7 +12365,11 @@ struct llm_tokenizer_bpe {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool add_special_bos(std::vector<llama_vocab::id> & output) const {
|
void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) const {
|
||||||
|
output.push_back(token_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool append_bos(std::vector<llama_vocab::id> & output) const {
|
||||||
if (special_add_bos) {
|
if (special_add_bos) {
|
||||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||||
output.push_back(vocab.special_bos_id);
|
output.push_back(vocab.special_bos_id);
|
||||||
|
@ -12374,7 +12378,7 @@ struct llm_tokenizer_bpe {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool add_special_eos(std::vector<llama_vocab::id> & output) const {
|
bool append_eos(std::vector<llama_vocab::id> & output) const {
|
||||||
if (special_add_eos) {
|
if (special_add_eos) {
|
||||||
GGML_ASSERT(vocab.special_eos_id != -1);
|
GGML_ASSERT(vocab.special_eos_id != -1);
|
||||||
output.push_back(vocab.special_eos_id);
|
output.push_back(vocab.special_eos_id);
|
||||||
|
@ -12383,7 +12387,7 @@ struct llm_tokenizer_bpe {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void check_add_special(const std::vector<llama_vocab::id> & output) const {
|
void check_double_bos_eos(const std::vector<llama_vocab::id> & output) const {
|
||||||
if (special_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
|
if (special_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
|
||||||
LLAMA_LOG_WARN(
|
LLAMA_LOG_WARN(
|
||||||
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
|
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
|
||||||
|
@ -12890,7 +12894,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||||
llm_tokenizer_bpe tokenizer(vocab);
|
llm_tokenizer_bpe tokenizer(vocab);
|
||||||
|
|
||||||
if (add_special) {
|
if (add_special) {
|
||||||
tokenizer.add_special_bos(output);
|
tokenizer.append_bos(output);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (const auto & fragment : fragment_buffer) {
|
for (const auto & fragment : fragment_buffer) {
|
||||||
|
@ -12902,13 +12906,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||||
#endif
|
#endif
|
||||||
tokenizer.tokenize(raw_text, output);
|
tokenizer.tokenize(raw_text, output);
|
||||||
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
||||||
output.push_back(fragment.token);
|
tokenizer.append(fragment.token, output);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (add_special) {
|
if (add_special) {
|
||||||
tokenizer.add_special_eos(output);
|
tokenizer.append_eos(output);
|
||||||
tokenizer.check_add_special(output);
|
tokenizer.check_double_bos_eos(output);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case LLAMA_VOCAB_TYPE_WPM:
|
case LLAMA_VOCAB_TYPE_WPM:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue