llama : fix order of pre-tokenizers

This commit is contained in:
Georgi Gerganov 2024-07-19 13:21:38 +03:00 committed by GitHub
parent 7fc85054bf
commit 447c08092d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -5517,14 +5517,14 @@ static void llm_load_vocab(
                 tokenizer_pre == "viking") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
                 vocab.tokenizer_clean_spaces = false;
-            } else if (
-                tokenizer_pre == "jais") {
-                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
             } else if (
                 tokenizer_pre == "tekken") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
                 vocab.tokenizer_ignore_merges = true;
                 vocab.tokenizer_add_bos = true;
+            } else if (
+                tokenizer_pre == "jais") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }