llama : alternative merge ignore logic

This commit is contained in:
Georgi Gerganov 2024-05-11 11:10:23 +03:00
parent 0c9a0aef4c
commit b8d3cd5337
No known key found for this signature in database
GPG key ID: 449E073F9DC10735

View file

@@ -12299,26 +12299,17 @@ struct llm_tokenizer_bpe {
symbols_final.clear();
for (auto & word : word_collection) {
if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
llm_symbol sym;
sym.text = word.c_str();
sym.n = word.size();
sym.prev = final_prev_index;
sym.next = -1;
if (final_prev_index != -1) {
symbols_final[final_prev_index].next = symbols_final.size();
}
symbols_final.emplace_back(sym);
final_prev_index = symbols_final.size() - 1;
continue;
}
work_queue = llm_bigram_bpe::queue();
symbols.clear();
int index = 0;
size_t offset = 0;
if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
offset = word.size();
}
while (offset < word.size()) {
llm_symbol sym;
size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));