From e0b72290d086c16a728163fcc668fc0434a70be7 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart
Date: Tue, 10 Sep 2024 14:48:30 -0600
Subject: [PATCH] feat(convert_hf_to_gguf): Add GraniteMoeModel

GraniteMoe has the same configuration deltas as Granite

Branch: GraniteMoE

Signed-off-by: Gabe Goodhart
---
 convert_hf_to_gguf.py          | 13 +++++++++++--
 gguf-py/gguf/constants.py      |  2 +-
 gguf-py/gguf/tensor_mapping.py | 12 ++++++------
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index ff4c9226f..8a32bf510 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4102,14 +4102,23 @@ class GraniteModel(LlamaModel):
         # consistency
         if attention_scale := self.hparams.get("attention_multiplier"):
             self.gguf_writer.add_attention_scale(attention_scale)
+            logger.info("gguf: (granite) attention_scale = %s", attention_scale)
         if embedding_scale := self.hparams.get("embedding_multiplier"):
             self.gguf_writer.add_embedding_scale(embedding_scale)
+            logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
         if residual_scale := self.hparams.get("residual_multiplier"):
             self.gguf_writer.add_residual_scale(residual_scale)
-        if logits_scaling := self.hparams.get("logits_scaling"):
-            self.gguf_writer.add_logit_scale(logits_scaling)
+            logger.info("gguf: (granite) residual_scale = %s", residual_scale)
+        if logits_scale := self.hparams.get("logits_scaling"):
+            self.gguf_writer.add_logit_scale(logits_scale)
+            logger.info("gguf: (granite) logits_scale = %s", logits_scale)
 
 
+@Model.register("GraniteMoeForCausalLM")
+class GraniteMoeModel(GraniteModel):
+    """Conversion for IBM's GraniteMoeForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE_MOE
+
 ###### CONVERSION LOGIC ######
 
 # tree of lazy tensors
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 95a07f0a3..4e1091220 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -1253,7 +1253,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.ATTN_V,
         MODEL_TENSOR.ATTN_OUT,
         MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_GATE_INP,
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index db2d03e4d..fc5fb3010 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -251,11 +251,12 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.FFN_GATE_INP: (
-            "layers.{bid}.feed_forward.gate",             # mixtral
-            "model.layers.{bid}.block_sparse_moe.gate",   # mixtral
-            "model.layers.{bid}.mlp.gate",                # qwen2moe olmoe
-            "transformer.decoder_layer.{bid}.router",     # Grok
-            "transformer.blocks.{bid}.ffn.router.layer",  # dbrx
+            "layers.{bid}.feed_forward.gate",                   # mixtral
+            "model.layers.{bid}.block_sparse_moe.gate",         # mixtral
+            "model.layers.{bid}.mlp.gate",                      # qwen2moe olmoe
+            "transformer.decoder_layer.{bid}.router",           # Grok
+            "transformer.blocks.{bid}.ffn.router.layer",        # dbrx
+            "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
         ),
 
         MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@@ -329,7 +330,6 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.moe.linear",       # Grok (merged)
             "transformer.blocks.{bid}.ffn.experts.mlp.w1",      # dbrx
             "model.layers.{bid}.mlp.experts.gate_proj",         # qwen2moe olmoe (merged)
-            "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
         ),
 
         MODEL_TENSOR.FFN_GATE_SHEXP: (
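
Note for reviewers: `@Model.register("GraniteMoeForCausalLM")` plus `model_arch` is all the new class needs, because the converter resolves the conversion class from the `architectures` entry in the HF `config.json`, and GraniteMoeModel inherits GraniteModel's scale-parameter handling (the `*_multiplier` hparams above) unchanged. A condensed sketch of that decorator-registry pattern follows; the names (`_model_classes`, `register`) are illustrative stand-ins, not the verbatim upstream implementation:

```python
# Condensed sketch of the decorator-registry pattern convert_hf_to_gguf.py
# uses to pick a conversion class. Names here are illustrative, not verbatim.
from typing import Callable, TypeVar

ModelT = TypeVar("ModelT", bound=type)
_model_classes: dict[str, type] = {}


def register(*names: str) -> Callable[[ModelT], ModelT]:
    """Map one or more HF architecture names to a conversion class."""
    def wrapper(cls: ModelT) -> ModelT:
        for name in names:
            _model_classes[name] = cls
        return cls
    return wrapper


@register("GraniteMoeForCausalLM")
class GraniteMoeModel:  # stands in for the real GraniteModel subclass
    model_arch = "granitemoe"


# config.json lists the architecture; the converter resolves the class from it.
print(_model_classes["GraniteMoeForCausalLM"].model_arch)  # -> granitemoe
```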
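
The `constants.py` and `tensor_mapping.py` hunks reclassify the GraniteMoE router (`model.layers.{bid}.block_sparse_moe.router.layer`) from `FFN_GATE_EXP` (the per-expert gate projections) to `FFN_GATE_INP` (the expert-selection router), matching how mixtral, qwen2moe, Grok, and dbrx map their routers. A minimal spot-check of the mapping after this patch, assuming the `gguf-py` package from this tree is importable; the block count of 32 is an arbitrary placeholder, not a GraniteMoE config value:

```python
# Spot-check that the GraniteMoE router resolves to FFN_GATE_INP after this
# patch. Assumes gguf-py from this tree is on PYTHONPATH; block count (32)
# is an arbitrary placeholder, not taken from any GraniteMoE config.
from gguf.constants import MODEL_ARCH, MODEL_TENSOR
from gguf.tensor_mapping import get_tensor_name_map

tmap = get_tensor_name_map(MODEL_ARCH.GRANITE_MOE, 32)

# try_suffixes lets the lookup strip and re-append the ".weight" suffix.
hf_name = "model.layers.0.block_sparse_moe.router.layer.weight"
result = tmap.get_type_and_name(hf_name, try_suffixes=(".weight", ".bias"))

assert result is not None, f"no mapping for {hf_name}"
tensor_type, gguf_name = result
assert tensor_type == MODEL_TENSOR.FFN_GATE_INP, tensor_type
print(gguf_name)  # expected: blk.0.ffn_gate_inp.weight
```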