feat(convert_hf_to_gguf): Add GraniteMoeModel
GraniteMoe has the same configuration deltas as Granite.

Branch: GraniteMoE

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
This commit is contained in:
parent
8a4ca2313c
commit
e0b72290d0
3 changed files with 18 additions and 9 deletions
|
@ -4102,14 +4102,23 @@ class GraniteModel(LlamaModel):
|
||||||
# consistency
|
# consistency
|
||||||
if attention_scale := self.hparams.get("attention_multiplier"):
|
if attention_scale := self.hparams.get("attention_multiplier"):
|
||||||
self.gguf_writer.add_attention_scale(attention_scale)
|
self.gguf_writer.add_attention_scale(attention_scale)
|
||||||
|
logger.info("gguf: (granite) attention_scale = %s", attention_scale)
|
||||||
if embedding_scale := self.hparams.get("embedding_multiplier"):
|
if embedding_scale := self.hparams.get("embedding_multiplier"):
|
||||||
self.gguf_writer.add_embedding_scale(embedding_scale)
|
self.gguf_writer.add_embedding_scale(embedding_scale)
|
||||||
|
logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
|
||||||
if residual_scale := self.hparams.get("residual_multiplier"):
|
if residual_scale := self.hparams.get("residual_multiplier"):
|
||||||
self.gguf_writer.add_residual_scale(residual_scale)
|
self.gguf_writer.add_residual_scale(residual_scale)
|
||||||
if logits_scaling := self.hparams.get("logits_scaling"):
|
logger.info("gguf: (granite) residual_scale = %s", residual_scale)
|
||||||
self.gguf_writer.add_logit_scale(logits_scaling)
|
if logits_scale := self.hparams.get("logits_scaling"):
|
||||||
|
self.gguf_writer.add_logit_scale(logits_scale)
|
||||||
|
logger.info("gguf: (granite) logits_scale = %s", logits_scale)
|
||||||
|
|
||||||
|
|
||||||
|
@Model.register("GraniteMoeForCausalLM")
|
||||||
|
class GraniteMoeModel(GraniteModel):
|
||||||
|
"""Conversion for IBM's GraniteMoeForCausalLM"""
|
||||||
|
model_arch = gguf.MODEL_ARCH.GRANITE_MOE
|
||||||
|
|
||||||
###### CONVERSION LOGIC ######
|
###### CONVERSION LOGIC ######
|
||||||
|
|
||||||
# tree of lazy tensors
|
# tree of lazy tensors
|
||||||
|
|
|
@ -1253,7 +1253,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.ATTN_V,
|
MODEL_TENSOR.ATTN_V,
|
||||||
MODEL_TENSOR.ATTN_OUT,
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
MODEL_TENSOR.FFN_NORM,
|
MODEL_TENSOR.FFN_NORM,
|
||||||
MODEL_TENSOR.FFN_GATE_EXP,
|
MODEL_TENSOR.FFN_GATE_INP,
|
||||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||||
MODEL_TENSOR.FFN_UP_EXP,
|
MODEL_TENSOR.FFN_UP_EXP,
|
||||||
],
|
],
|
||||||
|
|
|
@ -251,11 +251,12 @@ class TensorNameMap:
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_GATE_INP: (
|
MODEL_TENSOR.FFN_GATE_INP: (
|
||||||
"layers.{bid}.feed_forward.gate", # mixtral
|
"layers.{bid}.feed_forward.gate", # mixtral
|
||||||
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
|
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
|
||||||
"model.layers.{bid}.mlp.gate", # qwen2moe olmoe
|
"model.layers.{bid}.mlp.gate", # qwen2moe olmoe
|
||||||
"transformer.decoder_layer.{bid}.router", # Grok
|
"transformer.decoder_layer.{bid}.router", # Grok
|
||||||
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
|
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
|
||||||
|
"model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
|
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
|
||||||
|
@ -329,7 +330,6 @@ class TensorNameMap:
|
||||||
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
|
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
|
||||||
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
|
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
|
||||||
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
|
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
|
||||||
"model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
|
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_GATE_SHEXP: (
|
MODEL_TENSOR.FFN_GATE_SHEXP: (
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue