Enable rtrim when pre-inserting BOS

This commit is contained in:
jaime-m-p 2024-05-21 02:11:28 +02:00
parent 3d490e8529
commit 9b21dc3aef

View file

@@ -12498,15 +12498,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 // tokenizer.encode('', add_special_tokens=True) returns [1]
 // tokenizer.encode('', add_special_tokens=False) returns []
-if (add_special && vocab.special_add_bos != 0) {
-GGML_ASSERT(vocab.special_bos_id != -1);
-output.push_back(vocab.special_bos_id);
-}
 static const bool rtrim = true; //TODO: as param
 bool is_prev_special = false;
 bool special_token_rtrim = false;
+if (add_special && vocab.special_add_bos != 0) {
+GGML_ASSERT(vocab.special_bos_id != -1);
+output.push_back(vocab.special_bos_id);
+is_prev_special = true;
+}
 for (const auto & fragment : fragment_buffer) {
 if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
 // without adding this leading whitespace, we do not get the same results as the original tokenizer