From ee6bc1426e607f89e629060d8acdbf6be3f500ce Mon Sep 17 00:00:00 2001
From: ldwang
Date: Sat, 15 Jul 2023 14:14:00 +0800
Subject: [PATCH] support bpe tokenizer in convert

Signed-off-by: ldwang
---
 convert.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/convert.py b/convert.py
index 6d5db5368..45e59b933 100644
--- a/convert.py
+++ b/convert.py
@@ -242,12 +242,8 @@ class SentencePieceVocab:
             byte_decoder = {v: k for k, v in byte_encoder.items()}
             for i, item in enumerate(tokenizer):
                 text: bytes
-                if i == 0:
-                    text = " \u2047 ".encode("utf-8")
-                    score = 0.0
-                else:
-                    text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
-                    score: float = -i
+                text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
+                score: float = -i
                 yield text, score
         else:
             for i in range(tokenizer.vocab_size()):
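
Note: for context, the byte-level decoding that this patch now applies to every BPE token
(including index 0, which previously got a hard-coded " \u2047 " placeholder) can be tried
in isolation with the sketch below. The helper name bpe_token_to_bytes is illustrative
only, and the snippet assumes the transformers package is installed.

    from transformers.models.gpt2 import tokenization_gpt2

    # bytes_to_unicode() maps each byte value (int) to a printable unicode character;
    # inverting it lets us recover the raw bytes from the characters of a BPE token string.
    byte_encoder = tokenization_gpt2.bytes_to_unicode()
    byte_decoder = {v: k for k, v in byte_encoder.items()}

    def bpe_token_to_bytes(item: str) -> bytes:
        # Same expression the patch keeps as the single code path for all tokens.
        return b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])

    print(bpe_token_to_bytes("Ġhello"))  # b' hello' -- 'Ġ' is the GPT-2 encoding of a leading space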