support bpe tokenizer in convert

Signed-off-by: ldwang <ftgreat@gmail.com>
Author: ldwang
Date:   2023-07-15 14:14:00 +08:00
parent d7aab2e900
commit ee6bc1426e

@@ -242,12 +242,8 @@ class SentencePieceVocab:
             byte_decoder = {v: k for k, v in byte_encoder.items()}
             for i, item in enumerate(tokenizer):
                 text: bytes
-                if i == 0:
-                    text = " \u2047 ".encode("utf-8")
-                    score = 0.0
-                else:
-                    text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
-                    score: float = -i
+                text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
+                score: float = -i
                 yield text, score
         else:
             for i in range(tokenizer.vocab_size()):
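With the special case for token 0 (the " \u2047 " placeholder with score 0.0) removed, every BPE vocabulary entry now takes the same path: the token string is mapped back to raw bytes through the byte decoder, and its score is set to -i so that vocabulary rank is preserved. Below is a minimal standalone sketch of that decoding step; it assumes byte_encoder is built from HuggingFace's tokenization_gpt2.bytes_to_unicode() (not visible in this hunk) and uses a hypothetical example token.

from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

byte_encoder = bytes_to_unicode()                       # byte value -> printable unicode char
byte_decoder = {v: k for k, v in byte_encoder.items()}  # printable unicode char -> byte value

def token_to_bytes(item: str) -> bytes:
    # Undo the GPT-2 byte-to-unicode escaping so a vocab entry becomes
    # the raw byte sequence that the converter writes out for the model.
    return b''.join(byte_decoder[ch].to_bytes(1, byteorder='big') for ch in item)

# "Ġ" is the GPT-2 escape for the space byte 0x20, so "Ġhello" -> b' hello'.
print(token_to_bytes("Ġhello"))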