From e6ea63ca7190ae39fa03f8468b4a8cfe51b0aeef Mon Sep 17 00:00:00 2001
From: 김승덕/Infrastructure Group(YA)
Date: Thu, 12 Oct 2023 12:44:33 +0900
Subject: [PATCH] simply ignore added tokens whose id < vocab size

---
 convert.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/convert.py b/convert.py
index 19840a09e..e9fe25b78 100755
--- a/convert.py
+++ b/convert.py
@@ -366,24 +366,14 @@ class SentencePieceVocab:
             added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             added_tokens = {}
-        items: list[tuple[str, int]] = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-
-        tokens_to_replace: dict[int, str] = {}
-        new_tokens: dict[int, str] = {}
-        for piece, idx in items:
-            if idx < vocab_size:
-                tokens_to_replace[idx] = piece
-            else:
-                new_tokens[idx] = piece
+        new_tokens: dict[int, str] = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
 
         expected_new_ids: list[int] = list(range(vocab_size, vocab_size + len(new_tokens)))
         actual_new_ids: list[int] = sorted(new_tokens.keys())
 
         if expected_new_ids != actual_new_ids:
             raise Exception(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
 
-        # Key is the original token ID, value is the replacement token piece.
-        self.tokens_to_replace = tokens_to_replace
         # Token pieces that were added to the base vocabulary.
         self.new_tokens_list: list[str] = [new_tokens[id] for id in actual_new_ids]
         self.vocab_size_base: int = vocab_size
@@ -394,7 +384,7 @@ class SentencePieceVocab:
     def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.sentencepiece_tokenizer
         for id in range(tokenizer.vocab_size()):
-            piece = tokenizer.id_to_piece(id) if id not in self.tokens_to_replace else self.tokens_to_replace[id]
+            piece = tokenizer.id_to_piece(id)
             text: bytes = piece.encode("utf-8")
             score: float = tokenizer.get_score(id)
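
For context, a minimal, self-contained sketch (not part of the patch) of the filtering rule the patched code applies: entries in added_tokens.json whose IDs already fall inside the base SentencePiece vocabulary are simply ignored, and the remaining IDs must form a contiguous range starting at vocab_size, otherwise the conversion aborts. The vocab size and the added_tokens contents below are made-up example values.

    # Hypothetical base SentencePiece vocab size and added_tokens.json content;
    # the first two entries duplicate IDs the base vocabulary already covers,
    # the last two are genuinely new.
    vocab_size = 32000
    added_tokens = {
        "<s>":          1,      # id < vocab_size -> ignored after this patch
        "</s>":         2,      # id < vocab_size -> ignored after this patch
        "<|im_start|>": 32000,  # id >= vocab_size -> kept
        "<|im_end|>":   32001,  # id >= vocab_size -> kept
    }

    # Same expression as the patched convert.py: keep only tokens beyond the base vocab.
    new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}

    # The surviving IDs must be exactly vocab_size, vocab_size + 1, ...
    expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
    actual_new_ids = sorted(new_tokens.keys())

    if expected_new_ids != actual_new_ids:
        raise Exception(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")

    print([new_tokens[id] for id in actual_new_ids])  # ['<|im_start|>', '<|im_end|>']

With the tokens_to_replace path removed, IDs below vocab_size are assumed to be resolvable by tokenizer.id_to_piece() directly, so sentencepiece_tokens() no longer needs a replacement lookup table.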