model: convert-hf-to-gguf.py remove tiktoken
This commit is contained in:
parent 06527c66c3
commit fc89feeddf
2 changed files with 0 additions and 65 deletions
@@ -393,66 +393,6 @@ class Model(ABC):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-    def _set_vocab_tiktoken(self):
-        # https://github.com/openai/tiktoken
-        dir_model = self.dir_model
-        hparams = self.hparams
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams["vocab_size"]
-        assert max(tokenizer.get_vocab().values()) < vocab_size
-        vocab = {}
-        merges = []
-
-        # FIXME REVIEW should we extract this from QwenModel to base Model class ?
-        mergeable_ranks = tokenizer.encoding._mergeable_ranks
-        for token, rank in mergeable_ranks.items():
-            vocab[QwenModel.token_bytes_to_string(token)] = rank
-            if len(token) == 1:
-                continue
-            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
-            assert len(merged) == 2
-            merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
-
-        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
-        added_vocab = tokenizer.get_added_vocab()
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in ({**vocab, **added_vocab}).items()}
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
-            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
-        special_vocab.chat_template = tokenizer.default_chat_template
-        special_vocab.merges = merges
-        tk_endoftext = tokenizer.encoding._special_tokens["<|endoftext|>"]
-
-        # only add special tokens when they were not already loaded from config.json
-        if len(special_vocab.special_token_ids) == 0:
-            special_vocab._set_special_token("bos", tk_endoftext)
-            special_vocab._set_special_token("eos", tk_endoftext)
-            # this one is usually not in config.json anyway
-            special_vocab._set_special_token("unk", tk_endoftext)
-
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-
 @Model.register("GPTNeoXForCausalLM")
 class GPTNeoXModel(Model):
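Not part of the diff, but useful context for the deletion above: _set_vocab_tiktoken recovered GPT-2-style merge rules from tiktoken's rank table via QwenModel.bpe. Below is a minimal self-contained sketch of that trick; the four-entry rank table is made up for illustration, and the bpe helper is a simplified stand-in for the one kept in convert-hf-to-gguf.py.

def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
    # Re-merge `token` from single bytes, always applying the lowest-ranked
    # pair first; stopping just below the token's own rank leaves exactly
    # the two parts whose merge produced it.
    parts = [bytes([b]) for b in token]
    while True:
        min_idx, min_rank = None, None
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx, min_rank = i, rank
        if min_rank is None or (max_rank is not None and min_rank >= max_rank):
            break
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
    return parts

# Hypothetical rank table; a real one comes from tokenizer.encoding._mergeable_ranks.
ranks = {b"a": 0, b"b": 1, b"ab": 2, b"abb": 3}
for token, rank in ranks.items():
    if len(token) == 1:
        continue  # single bytes have no merge rule
    left, right = bpe(ranks, token, max_rank=rank)
    print(f"{left!r} {right!r} -> {token!r}")  # b'a' b'b' -> b'ab', then b'ab' b'b' -> b'abb'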
@@ -1582,10 +1522,6 @@ class DbrxModel(Model):
 
         self.gguf_writer.add_tensor(new_name, data)
 
-    def set_vocab(self):
-        self._set_vocab_tiktoken()
-
-
 @Model.register("MiniCPMForCausalLM")
 class MiniCPMModel(Model):
     model_arch = gguf.MODEL_ARCH.MINICPM
@@ -1,4 +1,3 @@
 -r ./requirements-convert.txt
 torch~=2.1.1
 einops~=0.7.0
-tiktoken~=0.6.0
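Also for context, not part of the commit: the deleted id loop kept the GGUF token list dense by synthesizing "[PADi]" entries for unassigned ids. A self-contained sketch of that pattern, with a hypothetical vocab and plain strings standing in for the gguf.TokenType values used in the real code:

vocab = {"hello": 0, "world": 1}    # hypothetical base vocab (token -> id)
added_vocab = {"<|endoftext|>": 3}  # added tokens are not a subset of vocab here
vocab_size = 5                      # hparams["vocab_size"] in the real code

reverse_vocab = {id_: tok for tok, id_ in {**vocab, **added_vocab}.items()}
tokens, toktypes = [], []
for i in range(vocab_size):
    if i not in reverse_vocab:
        tokens.append(f"[PAD{i}]")       # gap: pad so every id has a token
        toktypes.append("USER_DEFINED")
    elif reverse_vocab[i] in added_vocab:
        tokens.append(reverse_vocab[i])  # the real code checks .special for CONTROL
        toktypes.append("CONTROL")
    else:
        tokens.append(reverse_vocab[i])  # ordinary vocab entry
        toktypes.append("NORMAL")

print(tokens)    # ['hello', 'world', '[PAD2]', '<|endoftext|>', '[PAD4]']
print(toktypes)  # ['NORMAL', 'NORMAL', 'USER_DEFINED', 'CONTROL', 'USER_DEFINED']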