llama : add pre-tokenizer regexes for BLOOM and gpt3-finnish (#8850)

2024-08-15 10:17:12 +03:00 · 2024-08-15 10:17:12 +03:00 · 6bda7ce6c3
commit 6bda7ce6c3
parent d5492f0525
5 changed files with 19 additions and 1 deletions
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@ -410,6 +410,8 @@ struct llm_tokenizer_bpe {
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_PORO:
+            case LLAMA_VOCAB_PRE_TYPE_BLOOM:
+            case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
                regex_exprs = {
                    " ?[^(\\s|.,!?…。，、।۔،)]+",
                };
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -5467,6 +5467,12 @@ static void llm_load_vocab(
            } else if (
                tokenizer_pre == "codeshell") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+            } else if (
+                tokenizer_pre == "bloom") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM;
+            } else if (
+                tokenizer_pre == "gpt3-finnish") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }