From 4434c9d6c2888b192cc791492ccce74c2b141bb2 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 27 Apr 2024 11:33:16 +0300
Subject: [PATCH] minor

---
 llama.cpp                     |  8 ++++----
 tests/test-tokenizer-0-bpe.py | 15 +++++++++++++++
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 32a94041a..b3b022a67 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12035,7 +12035,7 @@ struct llm_tokenizer_bpe {
                             //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
 
                             // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
-                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
 
                             // TODO: this is not the same as the original regex:
                             // - need to use ReFlex and update unicode.cpp to support the regex above
@@ -12052,7 +12052,7 @@ struct llm_tokenizer_bpe {
                             "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
                             "\\s+$",
                             "[一-龥ࠀ-一가-퟿]+",
-                            "\\p{N}+"
+                            "\\p{N}+",
                         });
                         break;
                     case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
@@ -12061,7 +12061,7 @@ struct llm_tokenizer_bpe {
                             "\\s?\\p{L}+",
                             "\\s?\\p{P}+",
                             "[一-龥ࠀ-一가-퟿]+",
-                            "\\p{N}+"
+                            "\\p{N}+",
                         });
                         break;
                     default:
@@ -12070,7 +12070,7 @@ struct llm_tokenizer_bpe {
                             "[\\p{P}\\$\\+<=>\\^~\\|]+",
                             "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                             "\\p{N}+",
-                            "[0-9][0-9][0-9]"
+                            "[0-9][0-9][0-9]",
                         });
                         break;
                 }
diff --git a/tests/test-tokenizer-0-bpe.py b/tests/test-tokenizer-0-bpe.py
index 38aa33c46..3e9fea2a2 100644
--- a/tests/test-tokenizer-0-bpe.py
+++ b/tests/test-tokenizer-0-bpe.py
@@ -96,6 +96,21 @@ if fname_tok:
     # write to file
     with open(fname_out, 'w', encoding='utf-8') as f:
         for x in res:
+            # LLaMA v3 for some reason strips the space for these tokens (and others)
+            # if x == 662:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 1174:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 2564:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 758:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 949:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 5354:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # else:
+            #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
             f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
         print('len(res): ', len(res))
         print('len(lines): ', len(lines))
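
Note on the adapted pattern: a quick way to sanity-check it is to run it through
Python's third-party "regex" module, which supports the \p{L}/\p{N} properties
that the stdlib "re" lacks. This is a minimal sketch, not part of the patch; the
module choice and the sample text are assumptions, and the C++ literal's doubled
backslashes become single ones in a raw Python string.

    # Sketch only: approximate the split that unicode_regex_split() performs
    # for LLAMA_VOCAB_PRE_TYPE_LLAMA3 with the adapted pattern above.
    import regex  # third-party: pip install regex

    ADAPTED = (
        r"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])"
        r"|[^\r\n\p{L}\p{N}]?\p{L}+"
        r"|\p{N}{1,3}"
        r"| ?[^\s\p{L}\p{N}]+[\r\n]*"
        r"|\s*[\r\n]+"
        r"|\s+(?!\S)"
        r"|\s+"
    )

    # The explicit [sS]/[tT]/... classes stand in for the "(?i:...)" group of
    # the original tokenizer.json regex, so upper-case contractions still split.
    print(regex.findall(ADAPTED, "Hello world's test 12345 HELLO'S!!"))
    # ['Hello', ' world', "'s", ' test', ' ', '123', '45', ' HELLO', "'S", '!!']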
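
Note on the commented-out block in test-tokenizer-0-bpe.py: if that debugging
branch is ever enabled, the elif chain collapses to a set lookup. A minimal
sketch, not part of the patch: the set name is made up, the ids are only the
ones listed in the comments (the "(and others)" remains unresolved), and it
relies on the script's existing res, f, and tokenizer variables.

    # Hypothetical consolidation of the commented-out debugging branch.
    # Token ids whose decoded text LLaMA v3 reportedly strips a leading space
    # from; taken from the comments above, known to be incomplete.
    SPACE_STRIPPED_IDS = {662, 758, 949, 1174, 2564, 5354}

    for x in res:
        pad = ' ' if x in SPACE_STRIPPED_IDS else ''
        f.write(str(x) + ' \'' + pad + tokenizer.decode(x) + '\'\n')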