From 47a0a0cdffcce913a0c31b47db5eb1672ca55739 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?=
Date: Sun, 23 Jun 2024 10:57:07 +0200
Subject: [PATCH] gguf-py, convert-hf : conversion support for FLAN-T5 model family

---
 convert-hf-to-gguf.py          | 11 +++++++++++
 gguf-py/gguf/constants.py      |  7 +++++++
 gguf-py/gguf/tensor_mapping.py | 14 ++++++++++++--
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index b8fcff3d6..25cb2e935 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -2834,6 +2834,17 @@ class T5Model(Model):
         self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
         self.gguf_writer.add_file_type(self.ftype)
 
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # flan-t5-xxl contains a "decoder.embed_tokens.weight" tensor that is identical to the "shared.weight" tensor.
+        # To prevent errors caused by an unnecessary unmapped tensor, skip "decoder.embed_tokens.weight".
+        if name == "decoder.embed_tokens.weight":
+            logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
 
 ###### CONVERSION LOGIC ######
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index bdaec71b8..547b1f43e 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -218,6 +218,7 @@ class MODEL_TENSOR(IntEnum):
     DEC_CROSS_ATTN_OUT   = auto()
     DEC_CROSS_ATTN_REL_B = auto()
     DEC_FFN_NORM         = auto()
+    DEC_FFN_GATE         = auto()
     DEC_FFN_DOWN         = auto()
     DEC_FFN_UP           = auto()
     DEC_OUTPUT_NORM      = auto()
@@ -228,6 +229,7 @@ class MODEL_TENSOR(IntEnum):
     ENC_ATTN_OUT         = auto()
     ENC_ATTN_REL_B       = auto()
     ENC_FFN_NORM         = auto()
+    ENC_FFN_GATE         = auto()
     ENC_FFN_DOWN         = auto()
     ENC_FFN_UP           = auto()
     ENC_OUTPUT_NORM      = auto()
@@ -333,6 +335,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.DEC_CROSS_ATTN_OUT:   "dec.blk.{bid}.cross_attn_o",
     MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
     MODEL_TENSOR.DEC_FFN_NORM:         "dec.blk.{bid}.ffn_norm",
+    MODEL_TENSOR.DEC_FFN_GATE:         "dec.blk.{bid}.ffn_gate",
     MODEL_TENSOR.DEC_FFN_DOWN:         "dec.blk.{bid}.ffn_down",
     MODEL_TENSOR.DEC_FFN_UP:           "dec.blk.{bid}.ffn_up",
     MODEL_TENSOR.DEC_OUTPUT_NORM:      "dec.output_norm",
@@ -343,6 +346,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.ENC_ATTN_OUT:         "enc.blk.{bid}.attn_o",
     MODEL_TENSOR.ENC_ATTN_REL_B:       "enc.blk.{bid}.attn_rel_b",
     MODEL_TENSOR.ENC_FFN_NORM:         "enc.blk.{bid}.ffn_norm",
+    MODEL_TENSOR.ENC_FFN_GATE:         "enc.blk.{bid}.ffn_gate",
     MODEL_TENSOR.ENC_FFN_DOWN:         "enc.blk.{bid}.ffn_down",
     MODEL_TENSOR.ENC_FFN_UP:           "enc.blk.{bid}.ffn_up",
     MODEL_TENSOR.ENC_OUTPUT_NORM:      "enc.output_norm",
@@ -868,6 +872,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
     ],
     MODEL_ARCH.T5: [
         MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
         MODEL_TENSOR.DEC_ATTN_NORM,
         MODEL_TENSOR.DEC_ATTN_Q,
         MODEL_TENSOR.DEC_ATTN_K,
@@ -881,6 +886,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.DEC_CROSS_ATTN_OUT,
         MODEL_TENSOR.DEC_CROSS_ATTN_REL_B,
         MODEL_TENSOR.DEC_FFN_NORM,
+        MODEL_TENSOR.DEC_FFN_GATE,
         MODEL_TENSOR.DEC_FFN_DOWN,
         MODEL_TENSOR.DEC_FFN_UP,
         MODEL_TENSOR.DEC_OUTPUT_NORM,
@@ -891,6 +897,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.ENC_ATTN_OUT,
         MODEL_TENSOR.ENC_ATTN_REL_B,
         MODEL_TENSOR.ENC_FFN_NORM,
+        MODEL_TENSOR.ENC_FFN_GATE,
         MODEL_TENSOR.ENC_FFN_DOWN,
         MODEL_TENSOR.ENC_FFN_UP,
         MODEL_TENSOR.ENC_OUTPUT_NORM,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 973af64c2..2dc2d5231 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -467,8 +467,13 @@ class TensorNameMap:
             "decoder.block.{bid}.layer.2.layer_norm", # t5
         ),
 
+        MODEL_TENSOR.DEC_FFN_GATE: (
+            "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5
+        ),
+
         MODEL_TENSOR.DEC_FFN_UP: (
-            "decoder.block.{bid}.layer.2.DenseReluDense.wi", # t5
+            "decoder.block.{bid}.layer.2.DenseReluDense.wi",   # t5
+            "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5
         ),
 
         MODEL_TENSOR.DEC_FFN_DOWN: (
@@ -507,8 +512,13 @@ class TensorNameMap:
             "encoder.block.{bid}.layer.1.layer_norm", # t5
         ),
 
+        MODEL_TENSOR.ENC_FFN_GATE: (
+            "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5
+        ),
+
         MODEL_TENSOR.ENC_FFN_UP: (
-            "encoder.block.{bid}.layer.1.DenseReluDense.wi", # t5
+            "encoder.block.{bid}.layer.1.DenseReluDense.wi",   # t5
+            "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5
         ),
 
         MODEL_TENSOR.ENC_FFN_DOWN: (
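
Background on the mapping above: FLAN-T5, like T5 v1.1, replaces the classic T5
feed-forward (a single "DenseReluDense.wi" projection followed by ReLU) with a
gated-GELU block that has two input projections, which is why
"DenseReluDense.wi_0" maps to ffn_gate and "DenseReluDense.wi_1" to ffn_up.
The following is a minimal NumPy sketch contrasting the two variants; it is not
part of the patch, and the array names and helper functions are illustrative:

    import numpy as np

    def gelu_new(x: np.ndarray) -> np.ndarray:
        # tanh approximation of GELU ("gelu_new"), the activation FLAN-T5 uses
        return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x**3)))

    def t5_ffn(x: np.ndarray, wi: np.ndarray, wo: np.ndarray) -> np.ndarray:
        # classic T5 feed-forward: single input projection + ReLU
        # checkpoint tensor "DenseReluDense.wi" -> ffn_up
        return np.maximum(x @ wi, 0.0) @ wo

    def flan_t5_ffn(x: np.ndarray, wi_0: np.ndarray, wi_1: np.ndarray, wo: np.ndarray) -> np.ndarray:
        # gated feed-forward: GELU-activated gate branch times a linear branch
        # "DenseReluDense.wi_0" -> ffn_gate, "DenseReluDense.wi_1" -> ffn_up
        return (gelu_new(x @ wi_0) * (x @ wi_1)) @ wo

With these mappings, converting a FLAN-T5 checkpoint (e.g. running
convert-hf-to-gguf.py on the model directory) should resolve every
DenseReluDense tensor, while plain T5 checkpoints continue to map their single
"wi" to ffn_up.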