model: dbrx: convert-hf-to-gguf.py fix 'token_embd.weight' has wrong shape, fix special tokens

commit 3a9dc2eee2
parent d7546fda64
Author: Pierrick HYMBERT
Date:   2024-04-07 17:21:35 +02:00


@@ -393,29 +393,36 @@ class Model(ABC):
     def _set_vocab_tiktoken(self):
         # https://github.com/openai/tiktoken
         dir_model = self.dir_model
+        hparams = self.hparams
         tokens: list[str] = []
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = tokenizer.vocab_size
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()}
-        added_vocab = tokenizer.get_added_vocab()
+        vocab_size = hparams["vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+        vocab = {}
 
         merges = []
         # FIXME REVIEW should we extract this from QwenModel to base Model class ?
         mergeable_ranks = tokenizer.encoding._mergeable_ranks
         for token, rank in mergeable_ranks.items():
-            reverse_vocab[QwenModel.token_bytes_to_string(token)] = rank
+            vocab[QwenModel.token_bytes_to_string(token)] = rank
             if len(token) == 1:
                 continue
             merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
             assert len(merged) == 2
             merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
 
+        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
+        added_vocab = tokenizer.get_added_vocab()
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
+
         for i in range(vocab_size):
-            if reverse_vocab[i] in added_vocab:
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
                 if tokenizer.added_tokens_decoder[i].special:
                     toktypes.append(gguf.TokenType.CONTROL)
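The shape fix in this hunk is the combination of two changes: `vocab_size` now comes from `hparams["vocab_size"]` (the row count of the checkpoint's embedding matrix) rather than `tokenizer.vocab_size`, and every id covered by neither the BPE ranks nor the added vocabulary is filled with a `[PADn]` placeholder, so the exported token list always has exactly `vocab_size` entries. A minimal standalone sketch of that padding logic with toy values (not taken from the commit; the dict union `vocab | added_vocab` needs Python 3.9+):

    vocab_size = 8
    vocab = {"a": 0, "b": 1, "c": 2}    # stringified BPE ranks
    added_vocab = {"<|endoftext|>": 5}  # added tokens can sit past the ranks
    reverse_vocab = {id_: tok for tok, id_ in (vocab | added_vocab).items()}

    tokens = []
    for i in range(vocab_size):
        if i not in reverse_vocab:
            tokens.append(f"[PAD{i}]")  # fill the gap so ids stay aligned
        else:
            tokens.append(reverse_vocab[i])

    print(tokens)
    # ['a', 'b', 'c', '[PAD3]', '[PAD4]', '<|endoftext|>', '[PAD6]', '[PAD7]']

Without the padding, ids 3, 4, 6 and 7 would be missing, the token list would be shorter than `token_embd.weight`, and loading the GGUF would fail with the shape error named in the commit title.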
@@ -425,15 +432,22 @@ class Model(ABC):
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)
 
-        # FIXME REVIEW should we introduce tiktoken in llama.cpp ?
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
         special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
-        special_vocab.merges = merges
         special_vocab.chat_template = tokenizer.default_chat_template
-        # FIXME REVIEW how to add special tokens https://huggingface.co/databricks/dbrx-instruct/blob/main/tiktoken.py#L193
+        special_vocab.merges = merges
+        tk_endoftext = tokenizer.encoding._special_tokens["<|endoftext|>"]
+        # only add special tokens when they were not already loaded from config.json
+        if len(special_vocab.special_token_ids) == 0:
+            special_vocab._set_special_token("bos", tk_endoftext)
+            special_vocab._set_special_token("eos", tk_endoftext)
+        # this one is usually not in config.json anyway
+        special_vocab._set_special_token("unk", tk_endoftext)
         special_vocab.add_to_gguf(self.gguf_writer)
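For the special tokens, DBRX's Hugging Face tokenizer wraps a tiktoken encoding, so the id of `<|endoftext|>` is read straight from tiktoken's private `_special_tokens` map and used as a fallback for bos/eos only when config.json supplied none; unk is set unconditionally since config.json rarely carries it. A standalone sketch of the same lookup and fallback rule, using stock tiktoken's cl100k_base as a stand-in for DBRX's encoding (assumption: the actual DBRX ids differ):

    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")
    tk_endoftext = enc._special_tokens["<|endoftext|>"]  # private map: token string -> id
    print(tk_endoftext)  # 100257 for cl100k_base

    special_ids: dict[str, int] = {}  # pretend nothing was loaded from config.json
    if len(special_ids) == 0:
        special_ids["bos"] = tk_endoftext
        special_ids["eos"] = tk_endoftext
    special_ids["unk"] = tk_endoftext  # always set; config.json rarely defines unk
    print(special_ids)  # {'bos': 100257, 'eos': 100257, 'unk': 100257}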