llama : alternative merge ignore logic
parent 0c9a0aef4c
commit b8d3cd5337
1 changed file with 5 additions and 14 deletions
llama.cpp | 19 +++++--------------
@@ -12299,26 +12299,17 @@ struct llm_tokenizer_bpe {
         symbols_final.clear();
 
         for (auto & word : word_collection) {
-            if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
-                llm_symbol sym;
-                sym.text = word.c_str();
-                sym.n = word.size();
-                sym.prev = final_prev_index;
-                sym.next = -1;
-                if (final_prev_index != -1) {
-                    symbols_final[final_prev_index].next = symbols_final.size();
-                }
-                symbols_final.emplace_back(sym);
-                final_prev_index = symbols_final.size() - 1;
-                continue;
-            }
-
             work_queue = llm_bigram_bpe::queue();
             symbols.clear();
 
             int index = 0;
             size_t offset = 0;
 
+            if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+                offset = word.size();
+            }
+
             while (offset < word.size()) {
                 llm_symbol sym;
                 size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
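For reference, a minimal standalone sketch (not the llama.cpp code) of what the new logic does: when ignore_merges is set and a pre-tokenized word is already a single vocabulary entry, it is emitted as one symbol up front and the per-character split (and hence all later BPE merging) is skipped. The token_to_id contents and the simplified utf8_len helper below are illustrative assumptions, not the library's implementation.

// Standalone sketch of the "ignore merges" check, assuming a toy vocab map.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Number of bytes in the UTF-8 sequence starting at byte c (simplified stand-in).
static size_t utf8_len(char c) {
    const unsigned char b = static_cast<unsigned char>(c);
    if ((b & 0x80) == 0x00) return 1;
    if ((b & 0xE0) == 0xC0) return 2;
    if ((b & 0xF0) == 0xE0) return 3;
    return 4;
}

int main() {
    // Hypothetical tiny vocab: these words already map to a single token id.
    const std::unordered_map<std::string, int> token_to_id = {{"hello", 1}, {"world", 2}};
    const bool ignore_merges = true;

    for (const std::string word : {std::string("hello"), std::string("köln")}) {
        std::vector<std::string> symbols;
        size_t offset = 0;

        // New logic: a word that is already a token becomes one symbol, and the
        // while-loop below is skipped because offset == word.size().
        if (ignore_merges && token_to_id.find(word) != token_to_id.end()) {
            symbols.push_back(word);
            offset = word.size();
        }

        // Otherwise split into UTF-8 characters, the usual starting point for BPE merges.
        while (offset < word.size()) {
            const size_t char_len = std::min(word.size() - offset, utf8_len(word[offset]));
            symbols.push_back(word.substr(offset, char_len));
            offset += char_len;
        }

        std::cout << word << " ->";
        for (const auto & s : symbols) std::cout << " [" << s << "]";
        std::cout << "\n";
    }
}

As the diff shows, the removed block appended such words to symbols_final directly and had to maintain final_prev_index links; the replacement achieves the same early exit by pre-filling symbols and advancing offset, so the word flows through the normal symbols path with no extra bookkeeping.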