fix: Use gpt2 tokenizer for roberta and add eos/bos tokens

Branch: RobertaTokenizer
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2024-12-13 16:41:40 -07:00 · 2024-12-13 16:41:40 -07:00 · a2e03b826f
commit a2e03b826f
parent a76c56fa1a
1 changed files with 27 additions and 2 deletions
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -663,7 +663,11 @@ class Model:
            res = "minerva-7b"
        if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
            # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
-            res = "roberta-bpe"
+            # NOTE: The Roberta tokenizer is the same as GPT-2, but it always
+            #   adds the cls/sep tokens as bos/eos. This is handled as a
+            #   post-processor in tokenizers, so the chkhsh is different, but
+            #   it still maps to gpt-2 internally.
+            res = "gpt-2"

        if res is None:
            logger.warning("\n")
@@ -2544,7 +2548,7 @@ class InternLM2Model(Model):
            return [(self.map_tensor_name(name), data_torch)]


-@Model.register("BertModel", "CamembertModel", "RobertaModel")
+@Model.register("BertModel", "CamembertModel")
 class BertModel(Model):
    model_arch = gguf.MODEL_ARCH.BERT

@@ -2617,6 +2621,27 @@ class BertModel(Model):
        return [(self.map_tensor_name(name), data_torch)]


+@Model.register("RobertaModel")
+class RobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def set_vocab(self):
+        """Support BPE tokenizers for roberta models"""
+        bpe_tok_path = self.dir_model / "tokenizer.json"
+        if bpe_tok_path.exists():  # tokenizer.json present: write the vocab via the GPT-2 path
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_add_bos_token(True)
+            self.gguf_writer.add_add_eos_token(True)
+
+            # Record the number of token types ("Sequence A" / "Sequence B") so
+            # the size of the token_type embeddings can be validated, though all
+            # zeros are currently passed as token types. Falls back to 1 if unset.
+            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+
+        else:  # no tokenizer.json: defer to the parent class (BertModel) vocab handling
+            return super().set_vocab()
+
+
 @Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
    model_arch = gguf.MODEL_ARCH.NOMIC_BERT