llama : alternative merge ignore logic
This commit is contained in:
parent
0c9a0aef4c
commit
b8d3cd5337
1 changed file with 5 additions and 14 deletions
19
llama.cpp
19
llama.cpp
|
@@ -12299,26 +12299,17 @@ struct llm_tokenizer_bpe {
|
|||
symbols_final.clear();
|
||||
|
||||
for (auto & word : word_collection) {
|
||||
if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
|
||||
llm_symbol sym;
|
||||
sym.text = word.c_str();
|
||||
sym.n = word.size();
|
||||
sym.prev = final_prev_index;
|
||||
sym.next = -1;
|
||||
if (final_prev_index != -1) {
|
||||
symbols_final[final_prev_index].next = symbols_final.size();
|
||||
}
|
||||
symbols_final.emplace_back(sym);
|
||||
final_prev_index = symbols_final.size() - 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
work_queue = llm_bigram_bpe::queue();
|
||||
symbols.clear();
|
||||
|
||||
int index = 0;
|
||||
size_t offset = 0;
|
||||
|
||||
if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
|
||||
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
|
||||
offset = word.size();
|
||||
}
|
||||
|
||||
while (offset < word.size()) {
|
||||
llm_symbol sym;
|
||||
size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue