convert : add Poro-34B-chat tokenizer support (#7713)

* support for Poro chat pre-tokenizer
* add support for Poro pre-tokenizer
* Update convert-hf-to-gguf-update.py

  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Change Poro-34B-chat to poro-chat
* Change Poro-34B-chat to poro-chat
* Update convert-hf-to-gguf-update.py
* Update llama.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-06-14 13:16:49 +03:00 · 2024-06-14 13:16:49 +03:00 · 41b9260f18
commit 41b9260f18
parent 172c825684
4 changed files with 13 additions and 0 deletions
--- a/llama.cpp
+++ b/llama.cpp
@@ -4713,6 +4713,9 @@ static void llm_load_vocab(
            } else if (
                tokenizer_pre == "smaug-bpe") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+            } else if (
+                tokenizer_pre == "poro-chat") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }
@@ -13028,6 +13031,11 @@ struct llm_tokenizer_bpe {
                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                        });
                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_PORO:
+                        word_collection = unicode_regex_split(text, {
+                            " ?[^(\\s|.,!?…。，、।۔،)]+",
+                        });
+                        break;
                    default:
                        // default regex for BPE tokenization pre-processing
                        word_collection = unicode_regex_split(text, {