From 4434c9d6c2888b192cc791492ccce74c2b141bb2 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 27 Apr 2024 11:33:16 +0300
Subject: [PATCH] minor

---
 llama.cpp                     |  8 ++++----
 tests/test-tokenizer-0-bpe.py | 15 +++++++++++++++
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 32a94041a..b3b022a67 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12035,7 +12035,7 @@ struct llm_tokenizer_bpe {
                             //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
 
                             // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
-                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
 
                             // TODO: this is not the same as the original regex:
                             // - need to use ReFlex and update unicode.cpp to support the regex above
@@ -12052,7 +12052,7 @@ struct llm_tokenizer_bpe {
                             "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
                             "\\s+$",
                             "[一-龥ࠀ-一가-퟿]+",
-                            "\\p{N}+"
+                            "\\p{N}+",
                         });
                         break;
                     case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
@@ -12061,7 +12061,7 @@ struct llm_tokenizer_bpe {
                             "\\s?\\p{L}+",
                             "\\s?\\p{P}+",
                             "[一-龥ࠀ-一가-퟿]+",
-                            "\\p{N}+"
+                            "\\p{N}+",
                         });
                         break;
                     default:
@@ -12070,7 +12070,7 @@ struct llm_tokenizer_bpe {
                             "[\\p{P}\\$\\+<=>\\^~\\|]+",
                             "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                             "\\p{N}+",
-                            "[0-9][0-9][0-9]"
+                            "[0-9][0-9][0-9]",
                         });
                         break;
                 }
diff --git a/tests/test-tokenizer-0-bpe.py b/tests/test-tokenizer-0-bpe.py
index 38aa33c46..3e9fea2a2 100644
--- a/tests/test-tokenizer-0-bpe.py
+++ b/tests/test-tokenizer-0-bpe.py
@@ -96,6 +96,21 @@ if fname_tok:
     # write to file
     with open(fname_out, 'w', encoding='utf-8') as f:
         for x in res:
+            # LLaMA v3 for some reason strips the space for these tokens (and others)
+            # if x == 662:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 1174:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 2564:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 758:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 949:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 5354:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # else:
+            #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
             f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
         print('len(res): ', len(res))
         print('len(lines): ', len(lines))
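
Note on the adapted pattern: a quick way to sanity-check it is to run it through
Python's third-party "regex" module, which supports the \p{L}/\p{N} properties
that the stdlib "re" lacks. This is a minimal sketch, not part of the patch; the
module choice and the sample text are assumptions, and the C++ literal's doubled
backslashes become single ones in a raw Python string.

    # Sketch only: approximate the split that unicode_regex_split() performs
    # for LLAMA_VOCAB_PRE_TYPE_LLAMA3 with the adapted pattern above.
    import regex  # third-party: pip install regex

    ADAPTED = (
        r"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])"
        r"|[^\r\n\p{L}\p{N}]?\p{L}+"
        r"|\p{N}{1,3}"
        r"| ?[^\s\p{L}\p{N}]+[\r\n]*"
        r"|\s*[\r\n]+"
        r"|\s+(?!\S)"
        r"|\s+"
    )

    # The explicit [sS]/[tT]/... classes stand in for the "(?i:...)" group of
    # the original tokenizer.json regex, so upper-case contractions still split.
    print(regex.findall(ADAPTED, "Hello world's test 12345 HELLO'S!!"))
    # ['Hello', ' world', "'s", ' test', ' ', '123', '45', ' HELLO', "'S", '!!']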
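
Note on the commented-out block in test-tokenizer-0-bpe.py: if that debugging
branch is ever enabled, the elif chain collapses to a set lookup. A minimal
sketch, not part of the patch: the set name is made up, the ids are only the
ones listed in the comments (the "(and others)" remains unresolved), and it
relies on the script's existing res, f, and tokenizer variables.

    # Hypothetical consolidation of the commented-out debugging branch.
    # Token ids whose decoded text LLaMA v3 reportedly strips a leading space
    # from; taken from the comments above, known to be incomplete.
    SPACE_STRIPPED_IDS = {662, 758, 949, 1174, 2564, 5354}

    for x in res:
        pad = ' ' if x in SPACE_STRIPPED_IDS else ''
        f.write(str(x) + ' \'' + pad + tokenizer.decode(x) + '\'\n')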