From 65c5bb91abd6cdc00be7ed0d7c8d2d771a82f6d6 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart
Date: Mon, 16 Sep 2024 08:56:56 -0600
Subject: [PATCH] fix(convert_hf_to_gguf/gguf-py): _multiplier -> _scale

The transformers names with _multiplier will now be converted to the
_scale equivalent during conversion.

Branch: GraniteLM

Signed-off-by: Gabe Goodhart
---
 convert_hf_to_gguf.py       | 20 +++++++++++---------
 gguf-py/gguf/constants.py   |  6 +++---
 gguf-py/gguf/gguf_writer.py | 12 ++++++------
 3 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 8530557d8..ff4c9226f 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4090,20 +4090,22 @@ class GraniteModel(LlamaModel):
 
         - No head_dim support
         - New multiplier params:
-            - attention_multiplier
-            - embedding_multiplier
-            - residual_multiplier
+            - attention_scale
+            - embedding_scale
+            - residual_scale
         - logits_scaling
         """
         if head_dim := self.hparams.pop("head_dim", None):
             logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
         super().set_gguf_parameters()
-        if attention_multiplier := self.hparams.get("attention_multiplier"):
-            self.gguf_writer.add_attention_multiplier(attention_multiplier)
-        if embedding_multiplier := self.hparams.get("embedding_multiplier"):
-            self.gguf_writer.add_embedding_multiplier(embedding_multiplier)
-        if residual_multiplier := self.hparams.get("residual_multiplier"):
-            self.gguf_writer.add_residual_multiplier(residual_multiplier)
+        # NOTE: Convert _multiplier params to _scale params for naming
+        #   consistency
+        if attention_scale := self.hparams.get("attention_multiplier"):
+            self.gguf_writer.add_attention_scale(attention_scale)
+        if embedding_scale := self.hparams.get("embedding_multiplier"):
+            self.gguf_writer.add_embedding_scale(embedding_scale)
+        if residual_scale := self.hparams.get("residual_multiplier"):
+            self.gguf_writer.add_residual_scale(residual_scale)
         if logits_scaling := self.hparams.get("logits_scaling"):
             self.gguf_writer.add_logit_scale(logits_scaling)
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 88619094a..b36a60d49 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -97,8 +97,8 @@ class Keys:
         RESCALE_EVERY_N_LAYERS       = "{arch}.rescale_every_n_layers"
         TIME_MIX_EXTRA_DIM           = "{arch}.time_mix_extra_dim"
         TIME_DECAY_EXTRA_DIM         = "{arch}.time_decay_extra_dim"
-        RESIDUAL_MULTIPLIER          = "{arch}.residual_multiplier"
-        EMBEDDING_MULTIPLIER         = "{arch}.embedding_multiplier"
+        RESIDUAL_SCALE               = "{arch}.residual_scale"
+        EMBEDDING_SCALE              = "{arch}.embedding_scale"
 
     class Attention:
         HEAD_COUNT        = "{arch}.attention.head_count"
@@ -114,7 +114,7 @@ class Keys:
         KV_LORA_RANK      = "{arch}.attention.kv_lora_rank"
         REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
         SLIDING_WINDOW    = "{arch}.attention.sliding_window"
-        MULTIPLIER        = "{arch}.attention.multiplier"
+        SCALE             = "{arch}.attention.scale"
 
     class Rope:
         DIMENSION_COUNT      = "{arch}.rope.dimension_count"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index aed56ac96..bd059b45c 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -679,11 +679,11 @@ class GGUFWriter:
     def add_time_decay_extra_dim(self, dim: int) -> None:
         self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim)
 
-    def add_residual_multiplier(self, value: float) -> None:
-        self.add_float32(Keys.LLM.RESIDUAL_MULTIPLIER.format(arch=self.arch), value)
+    def add_residual_scale(self, value: float) -> None:
+        self.add_float32(Keys.LLM.RESIDUAL_SCALE.format(arch=self.arch), value)
 
-    def add_embedding_multiplier(self, value: float) -> None:
-        self.add_float32(Keys.LLM.EMBEDDING_MULTIPLIER.format(arch=self.arch), value)
+    def add_embedding_scale(self, value: float) -> None:
+        self.add_float32(Keys.LLM.EMBEDDING_SCALE.format(arch=self.arch), value)
 
     def add_wkv_head_size(self, size: int) -> None:
         self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
@@ -709,8 +709,8 @@ class GGUFWriter:
     def add_sliding_window(self, value: int) -> None:
         self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)
 
-    def add_attention_multiplier(self, value: float) -> None:
-        self.add_float32(Keys.Attention.MULTIPLIER.format(arch=self.arch), value)
+    def add_attention_scale(self, value: float) -> None:
+        self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value)
 
     def add_pooling_type(self, value: PoolingType) -> None:
         self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
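Editor's note (not part of the patch): the sketch below illustrates the resulting name mapping, assuming the Granite architecture string is "granite" and using purely hypothetical hparams values. After this change the converter still reads the Hugging Face _multiplier names from config.json but emits the renamed _scale GGUF keys through the writer methods shown above.

# Illustrative sketch only: hypothetical Granite hparams values and the
# GGUF metadata keys the renamed writer methods would emit for them,
# assuming the architecture string is "granite".
hparams = {
    "attention_multiplier": 0.0078125,  # hypothetical value
    "embedding_multiplier": 12.0,       # hypothetical value
    "residual_multiplier": 0.22,        # hypothetical value
    "logits_scaling": 8.0,              # hypothetical value
}

# HF config name -> GGUF key written by the corresponding writer method
key_map = {
    "attention_multiplier": "granite.attention.scale",  # add_attention_scale
    "embedding_multiplier": "granite.embedding_scale",  # add_embedding_scale
    "residual_multiplier":  "granite.residual_scale",   # add_residual_scale
    "logits_scaling":       "granite.logit_scale",      # add_logit_scale
}

for hf_name, gguf_key in key_map.items():
    if (value := hparams.get(hf_name)) is not None:
        print(f"{hf_name} -> {gguf_key} = {value}")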