From b83fbc92873d7f5d0a703372eb4e15c451974ad2 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sat, 2 Mar 2024 23:39:19 -0500 Subject: [PATCH] convert : for Mamba, fall back to internal NeoX tokenizer The resulting models are exactly the same as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there. --- convert-hf-to-gguf.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index bed830ce6..a288d73f8 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1852,8 +1852,33 @@ class MambaModel(Model): vocab_size = self.hparams["vocab_size"] # Round vocab size to next multiple of 8 pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) - self.hparams["vocab_size"] = ((vocab_size + (pad_vocab - 1)) // pad_vocab) * pad_vocab - return self._set_vocab_gpt2() + # pad using ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + vocab_size = -(vocab_size // -pad_vocab) * pad_vocab + self.hparams["vocab_size"] = vocab_size + + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + else: + # Use the GPT-NeoX tokenizer when no tokenizer files are present + tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" + print(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") + neox_reader = gguf.GGUFReader(tokenizer_path, "r") + + field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) + self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1])) + field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST) + self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) + self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES) + self.gguf_writer.add_token_merges([bytes(field.parts[i]) 
for i in field.data]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID) + self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID) + self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID) + self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0]) def set_gguf_parameters(self): d_model = self.hparams["d_model"]