fix: Use gpt2 tokenizer for roberta and add eos/bos tokens

Branch: RobertaTokenizer

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2024-12-13 16:41:40 -07:00 · 2024-12-13 16:41:40 -07:00 · a2e03b826f
commit a2e03b826f
parent a76c56fa1a
1 changed file with 27 additions and 2 deletions
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -663,7 +663,11 @@ class Model:
            res = "minerva-7b"
        if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
            # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
-            res = "roberta-bpe"
+            # NOTE: The Roberta tokenizer is the same as GPT-2, but it always
            #   adds the cls/sep tokens as bos/eos. This is handled as a
            #   post-processor in tokenizers, so the chkhsh is different, but
            #   it still maps to gpt-2 internally.
            res = "gpt-2"
        if res is None:
            logger.warning("\n")
@@ -2544,7 +2548,7 @@ class InternLM2Model(Model):
            return [(self.map_tensor_name(name), data_torch)]
-@Model.register("BertModel", "CamembertModel", "RobertaModel")
+@Model.register("BertModel", "CamembertModel")
 class BertModel(Model):
    model_arch = gguf.MODEL_ARCH.BERT
@@ -2617,6 +2621,27 @@ class BertModel(Model):
        return [(self.map_tensor_name(name), data_torch)]
@Model.register("RobertaModel")
class RobertaModel(BertModel):
    """Converter registered for "RobertaModel" checkpoints.

    Reuses the BERT model architecture and only overrides vocabulary
    handling: when the model directory contains a ``tokenizer.json``, the
    GPT-2 vocab path is used instead of the parent's ``set_vocab``.
    """

    model_arch = gguf.MODEL_ARCH.BERT

    def set_vocab(self):
        """Write the vocab via the GPT-2 path when tokenizer.json exists.

        Falls back to the parent class implementation (and returns its
        result) when no ``tokenizer.json`` is found in the model directory.
        """
        tokenizer_json = self.dir_model / "tokenizer.json"
        if not tokenizer_json.exists():
            # No tokenizer.json available: defer entirely to the parent.
            return super().set_vocab()

        self._set_vocab_gpt2()

        # Record both the add-BOS and add-EOS flags as enabled.
        writer = self.gguf_writer
        writer.add_add_bos_token(True)
        writer.add_add_eos_token(True)

        # The token type count ("Sequence A" or "Sequence B") is needed to
        # validate the size of the token_type embeddings, even though all
        # zeros are currently passed to those embeddings.
        n_token_types = self.hparams.get("type_vocab_size", 1)
        writer.add_token_type_count(n_token_types)
@Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
    model_arch = gguf.MODEL_ARCH.NOMIC_BERT