Add BPE models for testing
This commit is contained in:
parent
e013b23102
commit
55e387b2d5
1 changed file with 10 additions and 2 deletions
|
@@ -156,6 +156,7 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
|
||||||
'<s>a', # Phi-3 fail
|
'<s>a', # Phi-3 fail
|
||||||
'<unk><|endoftext|><s>', # Phi-3 fail
|
'<unk><|endoftext|><s>', # Phi-3 fail
|
||||||
'a\na', # TODO: Bert fail
|
'a\na', # TODO: Bert fail
|
||||||
|
'a\xa0\xa0\x00b', # jina-v2-es
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@@ -328,8 +329,15 @@ if __name__ == "__main__":
|
||||||
# import os
|
# import os
|
||||||
# tokenizers = os.listdir(path_tokenizers)
|
# tokenizers = os.listdir(path_tokenizers)
|
||||||
tokenizers = [
|
tokenizers = [
|
||||||
"llama-spm", # SPM
|
# "llama-spm", # SPM
|
||||||
"phi-3", # SPM
|
# "phi-3", # SPM
|
||||||
|
# "bert-bge", # WPM
|
||||||
|
# "jina-v2-en", # WPM
|
||||||
|
"gpt-2", # BPE
|
||||||
|
"llama-bpe", # BPE
|
||||||
|
"jina-v2-es", # BPE
|
||||||
|
"jina-v2-de", # BPE
|
||||||
|
"phi-2", # BPE
|
||||||
]
|
]
|
||||||
|
|
||||||
for tokenizer in tokenizers:
|
for tokenizer in tokenizers:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue