This commit is contained in:
Georgi Gerganov 2024-04-27 11:33:16 +03:00
parent ad929833cb
commit 4434c9d6c2
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
2 changed files with 19 additions and 4 deletions

View file

@ -12035,7 +12035,7 @@ struct llm_tokenizer_bpe {
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
// adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989 // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
// TODO: this is not the same as the original regex: // TODO: this is not the same as the original regex:
// - need to use ReFlex and update unicode.cpp to support the regex above // - need to use ReFlex and update unicode.cpp to support the regex above
@ -12052,7 +12052,7 @@ struct llm_tokenizer_bpe {
"\\s?[!-/:-~---‟ -。]+", "\\s?[!-/:-~---‟ -。]+",
"\\s+$", "\\s+$",
"[一-龥ࠀ-一가-퟿]+", "[一-龥ࠀ-一가-퟿]+",
"\\p{N}+" "\\p{N}+",
}); });
break; break;
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
@ -12061,7 +12061,7 @@ struct llm_tokenizer_bpe {
"\\s?\\p{L}+", "\\s?\\p{L}+",
"\\s?\\p{P}+", "\\s?\\p{P}+",
"[一-龥ࠀ-一가-퟿]+", "[一-龥ࠀ-一가-퟿]+",
"\\p{N}+" "\\p{N}+",
}); });
break; break;
default: default:
@ -12070,7 +12070,7 @@ struct llm_tokenizer_bpe {
"[\\p{P}\\$\\+<=>\\^~\\|]+", "[\\p{P}\\$\\+<=>\\^~\\|]+",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
"\\p{N}+", "\\p{N}+",
"[0-9][0-9][0-9]" "[0-9][0-9][0-9]",
}); });
break; break;
} }

View file

@ -96,6 +96,21 @@ if fname_tok:
# write to file # write to file
with open(fname_out, 'w', encoding='utf-8') as f: with open(fname_out, 'w', encoding='utf-8') as f:
for x in res: for x in res:
# LLaMA v3 for some reason strips the space for these tokens (and others)
# if x == 662:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# elif x == 1174:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# elif x == 2564:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# elif x == 758:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# elif x == 949:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# elif x == 5354:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# else:
# f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
print('len(res): ', len(res)) print('len(res): ', len(res))
print('len(lines): ', len(lines)) print('len(lines): ', len(lines))