minor
This commit is contained in:
parent
ad929833cb
commit
4434c9d6c2
2 changed files with 19 additions and 4 deletions
|
@ -12035,7 +12035,7 @@ struct llm_tokenizer_bpe {
|
||||||
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||||
|
|
||||||
// adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
|
// adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
|
||||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||||
|
|
||||||
// TODO: this is not the same as the original regex:
|
// TODO: this is not the same as the original regex:
|
||||||
// - need to use ReFlex and update unicode.cpp to support the regex above
|
// - need to use ReFlex and update unicode.cpp to support the regex above
|
||||||
|
@ -12052,7 +12052,7 @@ struct llm_tokenizer_bpe {
|
||||||
"\\s?[!-/:-~!-/:-~‘-‟ -。]+",
|
"\\s?[!-/:-~!-/:-~‘-‟ -。]+",
|
||||||
"\\s+$",
|
"\\s+$",
|
||||||
"[一-龥ࠀ-一가-]+",
|
"[一-龥ࠀ-一가-]+",
|
||||||
"\\p{N}+"
|
"\\p{N}+",
|
||||||
});
|
});
|
||||||
break;
|
break;
|
||||||
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
|
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
|
||||||
|
@ -12061,7 +12061,7 @@ struct llm_tokenizer_bpe {
|
||||||
"\\s?\\p{L}+",
|
"\\s?\\p{L}+",
|
||||||
"\\s?\\p{P}+",
|
"\\s?\\p{P}+",
|
||||||
"[一-龥ࠀ-一가-]+",
|
"[一-龥ࠀ-一가-]+",
|
||||||
"\\p{N}+"
|
"\\p{N}+",
|
||||||
});
|
});
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
@ -12070,7 +12070,7 @@ struct llm_tokenizer_bpe {
|
||||||
"[\\p{P}\\$\\+<=>\\^~\\|]+",
|
"[\\p{P}\\$\\+<=>\\^~\\|]+",
|
||||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||||
"\\p{N}+",
|
"\\p{N}+",
|
||||||
"[0-9][0-9][0-9]"
|
"[0-9][0-9][0-9]",
|
||||||
});
|
});
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -96,6 +96,21 @@ if fname_tok:
|
||||||
# write to file
|
# write to file
|
||||||
with open(fname_out, 'w', encoding='utf-8') as f:
|
with open(fname_out, 'w', encoding='utf-8') as f:
|
||||||
for x in res:
|
for x in res:
|
||||||
|
# LLaMA v3 for some reason strips the space for these tokens (and others)
|
||||||
|
# if x == 662:
|
||||||
|
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
|
||||||
|
# elif x == 1174:
|
||||||
|
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
|
||||||
|
# elif x == 2564:
|
||||||
|
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
|
||||||
|
# elif x == 758:
|
||||||
|
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
|
||||||
|
# elif x == 949:
|
||||||
|
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
|
||||||
|
# elif x == 5354:
|
||||||
|
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
|
||||||
|
# else:
|
||||||
|
# f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
|
||||||
f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
|
f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
|
||||||
print('len(res): ', len(res))
|
print('len(res): ', len(res))
|
||||||
print('len(lines): ', len(lines))
|
print('len(lines): ', len(lines))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue