minor
This commit is contained in:
parent
ad929833cb
commit
4434c9d6c2
2 changed files with 19 additions and 4 deletions
|
@ -12035,7 +12035,7 @@ struct llm_tokenizer_bpe {
|
|||
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
|
||||
// adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
|
||||
// TODO: this is not the same as the original regex:
|
||||
// - need to use ReFlex and update unicode.cpp to support the regex above
|
||||
|
@ -12052,7 +12052,7 @@ struct llm_tokenizer_bpe {
|
|||
"\\s?[!-/:-~！-／：-～‘-‟　-。]+",
|
||||
"\\s+$",
|
||||
"[一-龥ࠀ-一가-퟿]+",
|
||||
"\\p{N}+"
|
||||
"\\p{N}+",
|
||||
});
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
|
||||
|
@ -12061,7 +12061,7 @@ struct llm_tokenizer_bpe {
|
|||
"\\s?\\p{L}+",
|
||||
"\\s?\\p{P}+",
|
||||
"[一-龥ࠀ-一가-퟿]+",
|
||||
"\\p{N}+"
|
||||
"\\p{N}+",
|
||||
});
|
||||
break;
|
||||
default:
|
||||
|
@ -12070,7 +12070,7 @@ struct llm_tokenizer_bpe {
|
|||
"[\\p{P}\\$\\+<=>\\^~\\|]+",
|
||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||
"\\p{N}+",
|
||||
"[0-9][0-9][0-9]"
|
||||
"[0-9][0-9][0-9]",
|
||||
});
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -96,6 +96,21 @@ if fname_tok:
|
|||
# write to file
|
||||
with open(fname_out, 'w', encoding='utf-8') as f:
|
||||
for x in res:
|
||||
# LLaMA v3 for some reason strips the space for these tokens (and others)
|
||||
# if x == 662:
|
||||
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
|
||||
# elif x == 1174:
|
||||
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
|
||||
# elif x == 2564:
|
||||
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
|
||||
# elif x == 758:
|
||||
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
|
||||
# elif x == 949:
|
||||
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
|
||||
# elif x == 5354:
|
||||
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
|
||||
# else:
|
||||
# f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
|
||||
f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
|
||||
print('len(res): ', len(res))
|
||||
print('len(lines): ', len(lines))
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue