From fc89feeddf7a3c28607ada5109f6870c29bfa843 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Thu, 11 Apr 2024 14:27:15 +0200
Subject: [PATCH] model: convert-hf-to-gguf.py remove tiktoken

---
 convert-hf-to-gguf.py                            | 64 -------------------
 .../requirements-convert-hf-to-gguf.txt          |  1 -
 2 files changed, 65 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 291a78e34..d28908586 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -393,66 +393,6 @@ class Model(ABC):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-    def _set_vocab_tiktoken(self):
-        # https://github.com/openai/tiktoken
-        dir_model = self.dir_model
-        hparams = self.hparams
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams["vocab_size"]
-        assert max(tokenizer.get_vocab().values()) < vocab_size
-        vocab = {}
-        merges = []
-
-        # FIXME REVIEW should we extract this from QwenModel to base Model class ?
-        mergeable_ranks = tokenizer.encoding._mergeable_ranks
-        for token, rank in mergeable_ranks.items():
-            vocab[QwenModel.token_bytes_to_string(token)] = rank
-            if len(token) == 1:
-                continue
-            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
-            assert len(merged) == 2
-            merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
-
-        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
-        added_vocab = tokenizer.get_added_vocab()
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in ({**vocab, **added_vocab}).items()}
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                if tokenizer.added_tokens_decoder[i].special:
-                    toktypes.append(gguf.TokenType.CONTROL)
-                else:
-                    toktypes.append(gguf.TokenType.USER_DEFINED)
-            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
-        special_vocab.chat_template = tokenizer.default_chat_template
-        special_vocab.merges = merges
-        tk_endoftext = tokenizer.encoding._special_tokens["<|endoftext|>"]
-
-        # only add special tokens when they were not already loaded from config.json
-        if len(special_vocab.special_token_ids) == 0:
-            special_vocab._set_special_token("bos", tk_endoftext)
-            special_vocab._set_special_token("eos", tk_endoftext)
-            # this one is usually not in config.json anyway
-            special_vocab._set_special_token("unk", tk_endoftext)
-
-        special_vocab.add_to_gguf(self.gguf_writer)
-
 
 @Model.register("GPTNeoXForCausalLM")
 class GPTNeoXModel(Model):
@@ -1582,10 +1522,6 @@ class DbrxModel(Model):
 
             self.gguf_writer.add_tensor(new_name, data)
 
-    def set_vocab(self):
-        self._set_vocab_tiktoken()
-
-
 @Model.register("MiniCPMForCausalLM")
 class MiniCPMModel(Model):
     model_arch = gguf.MODEL_ARCH.MINICPM
diff --git a/requirements/requirements-convert-hf-to-gguf.txt b/requirements/requirements-convert-hf-to-gguf.txt
index db8888cac..6ce840d73 100644
--- a/requirements/requirements-convert-hf-to-gguf.txt
+++ b/requirements/requirements-convert-hf-to-gguf.txt
@@ -1,4 +1,3 @@
 -r ./requirements-convert.txt
 torch~=2.1.1
 einops~=0.7.0
-tiktoken~=0.6.0