supporting more diverse tokenizers
commit efb5dac337
parent 1a941869cb
1 changed file with 1 addition and 1 deletion
@@ -1924,7 +1924,7 @@ struct llama_tokenizer {
             if (token == vocab_.token_to_id.end()) {
                 // output any symbols that did not form tokens as bytes.
                 for (int j = 0; j < (int) symbol.n; ++j) {
-                    llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
                     output.push_back(token_id);
                 }
             } else {
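For context: the old line assumed every raw byte maps to token id "byte value + 3", which matches the original LLaMA/SentencePiece vocabulary layout where ids 0-2 are reserved for special tokens (<unk>, <s>, </s>) and the 256 byte tokens follow. The new line asks the vocabulary itself via token_to_id, so tokenizers that arrange their byte/character tokens differently also work. Below is a minimal standalone sketch contrasting the two strategies; it is not llama.cpp's actual code, and the names Vocab, TokenId, and the byte_fallback_* helpers are hypothetical.

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>

using TokenId = int32_t;

// Hypothetical stand-in for llama_vocab: just the string -> id map.
struct Vocab {
    std::unordered_map<std::string, TokenId> token_to_id;
};

// Old strategy: assume byte tokens sit at a fixed offset (byte value + 3).
// Only valid for vocabularies laid out exactly like the original LLaMA one.
TokenId byte_fallback_fixed_offset(char c) {
    return static_cast<TokenId>(static_cast<uint8_t>(c)) + 3;
}

// New strategy: look the single-character string up in the vocabulary,
// supporting any tokenizer that defines its own byte/character tokens.
// Throws std::out_of_range if the character has no token at all.
TokenId byte_fallback_lookup(const Vocab & vocab, char c) {
    return vocab.token_to_id.at(std::string(1, c));
}

int main() {
    Vocab vocab;
    vocab.token_to_id["a"] = 42;  // arbitrary id, not byte value + 3

    std::cout << "fixed offset: " << byte_fallback_fixed_offset('a') << "\n";  // 97 + 3 = 100
    std::cout << "vocab lookup: " << byte_fallback_lookup(vocab, 'a') << "\n"; // 42
}

One behavioral difference worth noting: at() throws std::out_of_range when a character has no vocabulary entry, whereas the old arithmetic always produced an id, so inputs whose bytes are absent from the vocabulary now fail loudly instead of silently mapping to a possibly wrong token.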