From 74204ccbaeb46d9fb9203c30b541653001a7a28c Mon Sep 17 00:00:00 2001
From: goerch
Date: Sun, 22 Oct 2023 20:35:50 +0200
Subject: [PATCH] Clarify logic in conversion

---
 convert-mpt-hf-to-gguf.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/convert-mpt-hf-to-gguf.py b/convert-mpt-hf-to-gguf.py
index 7b16f68bb..b06c2e713 100755
--- a/convert-mpt-hf-to-gguf.py
+++ b/convert-mpt-hf-to-gguf.py
@@ -132,15 +132,16 @@ added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 
 for i in range(vocab_size):
-    if i in reverse_vocab:
-        tokens.append(reverse_vocab[i])
-        if reverse_vocab[i] not in added_vocab:
-            toktypes.append(gguf.TokenType.NORMAL)
-        else:
-            toktypes.append(gguf.TokenType.USER_DEFINED)
-    else:
+    if i not in reverse_vocab:
         tokens.append(f"[PAD{i}]")
         toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        # NOTE: wouldn't we like to distinguish CONTROL tokens here?
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
 
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_types(toktypes)