minor

2024-04-27 11:33:16 +03:00 · 2024-04-27 11:33:16 +03:00 · 4434c9d6c2
commit 4434c9d6c2
parent ad929833cb
2 changed files with 19 additions and 4 deletions
--- a/llama.cpp
+++ b/llama.cpp
@@ -12035,7 +12035,7 @@ struct llm_tokenizer_bpe {
                            //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",

                            // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
-                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",

                            // TODO: this is not the same as the original regex:
                            //       - need to use ReFlex and update unicode.cpp to support the regex above
@@ -12052,7 +12052,7 @@ struct llm_tokenizer_bpe {
                            "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
                            "\\s+$",
                            "[一-龥ࠀ-一가-퟿]+",
-                            "\\p{N}+"
+                            "\\p{N}+",
                        });
                        break;
                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
@@ -12061,7 +12061,7 @@ struct llm_tokenizer_bpe {
                            "\\s?\\p{L}+",
                            "\\s?\\p{P}+",
                            "[一-龥ࠀ-一가-퟿]+",
-                            "\\p{N}+"
+                            "\\p{N}+",
                        });
                        break;
                    default:
@@ -12070,7 +12070,7 @@ struct llm_tokenizer_bpe {
                            "[\\p{P}\\$\\+<=>\\^~\\|]+",
                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                            "\\p{N}+",
-                            "[0-9][0-9][0-9]"
+                            "[0-9][0-9][0-9]",
                        });
                        break;
                }
--- a/tests/test-tokenizer-0-bpe.py
+++ b/tests/test-tokenizer-0-bpe.py
@@ -96,6 +96,21 @@ if fname_tok:
        # write to file
        with open(fname_out, 'w', encoding='utf-8') as f:
            for x in res:
+                # LLaMA v3 for some reason strips the space for these tokens (and others)
+                # if x == 662:
+                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+                # elif x == 1174:
+                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+                # elif x == 2564:
+                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+                # elif x == 758:
+                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+                # elif x == 949:
+                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+                # elif x == 5354:
+                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+                # else:
+                #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
        print('len(res): ', len(res))
        print('len(lines): ', len(lines))