From 0ee896d149ec02ffe472a467a33709c2acce9f91 Mon Sep 17 00:00:00 2001 From: nopperl <54780682+nopperl@users.noreply.github.com> Date: Mon, 22 Jul 2024 11:47:35 +0000 Subject: [PATCH] fix punctuation regex in chameleon pre-tokenizer (@compilade) Co-authored-by: compilade --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index db41ad629..18aea5ad2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15852,7 +15852,7 @@ struct llm_tokenizer_bpe { "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", // Image tokens "([\\t\\n]| | )", // directly from tokenizer.json "\\p{N}", // Individual digits - "[\\p{P}\\$\\+<=>\\^~\\|`]+", // Punctuation + "[\\p{P}!-/:-@\\[-`{-~]", // Punctuation, Isolated "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", }; break;