From 3275e60f575fb1141e4706da2e9dc810d7d47457 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 2 May 2024 11:52:50 +0300
Subject: [PATCH] falcon : fix regex

---
 llama.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 0ef2e6277..a6cb3a8a9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12212,14 +12212,13 @@ struct llm_tokenizer_bpe {
                             "\\s?\\p{L}+",
                             "\\s?\\p{P}+",
                             "[一-龥ࠀ-一가-퟿]+",
-                            "\\p{N}+",
+                            "\\p{N}",
                         });
                         break;
                     case LLAMA_VOCAB_PRE_TYPE_FALCON:
                         word_collection = unicode_regex_split(text, {
                             "[\\p{P}\\$\\+<=>\\^~\\|]+",
                             "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                            "\\p{N}+",
                             "[0-9][0-9][0-9]",
                         });
                         break;