add phi3 128k support in convert-hf-to-gguf

Authored by Wei Liu on 2024-05-01 15:15:12 +08:00, committed by Georgi Gerganov
parent 11474e756d
commit 8fa413d8b5
3 changed files with 59 additions and 11 deletions

convert-hf-to-gguf.py

@@ -14,6 +14,7 @@ from pathlib import Path
 from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
 
+import math
 import numpy as np
 import torch
@@ -1784,23 +1785,57 @@ class Phi3MiniModel(Model):
     def set_gguf_parameters(self):
         block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
 
-        rot_pct = 1.0
         n_embd = self.find_hparam(["hidden_size", "n_embd"])
         n_head = self.find_hparam(["num_attention_heads", "n_head"])
         rms_eps = self.find_hparam(["rms_norm_eps"])
+        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
+        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+        rope_dims = n_embd // n_head
 
         self.gguf_writer.add_name("Phi3")
-        self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
+        self.gguf_writer.add_context_length(max_pos_embds)
+        self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
         self.gguf_writer.add_embedding_length(n_embd)
-        self.gguf_writer.add_feed_forward_length(8192)
+        self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head)
         self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
-        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+        self.gguf_writer.add_rope_dimension_count(rope_dims)
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
 
+        # write rope scaling for long context (128k) model
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is None:
+            return
+
+        scale = max_pos_embds / orig_max_pos_embds
+
+        rope_scaling_type = rope_scaling.get('type', '').lower()
+        if len(rope_scaling_type) == 0:
+            raise KeyError('Missing the required key rope_scaling.type')
+
+        if rope_scaling_type == 'su':
+            attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
+        elif rope_scaling_type == 'yarn':
+            attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
+        else:
+            raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet')
+
+        self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
+
+        long_factors = rope_scaling.get('long_factor', None)
+        short_factors = rope_scaling.get('short_factor', None)
+
+        if long_factors is None or short_factors is None:
+            raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling.short_factor')
+
+        if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims // 2:
+            raise ValueError(f'The length of rope long and short factors must be {rope_dims // 2}')
+
+        self.gguf_writer.add_rope_scaling_freq_long_factors(long_factors)
+        self.gguf_writer.add_rope_scaling_freq_short_factors(short_factors)
 
 
 @Model.register("PlamoForCausalLM")
 class PlamoModel(Model):
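
As a sanity check on the 'su' branch above, here is the attention-factor computation applied to the published Phi-3-mini-128k hyperparameters; the two values are assumptions taken from the model's config.json (max_position_embeddings = 131072, original_max_position_embeddings = 4096), not part of this commit:

import math

# assumed Phi-3-mini-128k hyperparameters (from the model's config.json)
max_pos_embds = 131072       # max_position_embeddings (128k)
orig_max_pos_embds = 4096    # original_max_position_embeddings

scale = max_pos_embds / orig_max_pos_embds   # 32.0
attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds))
print(round(attn_factor, 4))                 # ~1.1902

So a 128k Phi-3 model gets its attention scores scaled by roughly 1.19 relative to the 4k base model.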

gguf-py/gguf/constants.py

@@ -61,6 +61,9 @@ class Keys:
         FREQ_BASE             = "{arch}.rope.freq_base"
         SCALING_TYPE          = "{arch}.rope.scaling.type"
         SCALING_FACTOR        = "{arch}.rope.scaling.factor"
+        SCALING_LONG_FACTORS  = "{arch}.rope.scaling.freq_long_factors"
+        SCALING_SHORT_FACTORS = "{arch}.rope.scaling.freq_short_factors"
+        SCALING_ATTN_FACTOR   = "{arch}.rope.scaling.attn_factor"
         SCALING_ORIG_CTX_LEN  = "{arch}.rope.scaling.original_context_length"
         SCALING_FINETUNED     = "{arch}.rope.scaling.finetuned"

@@ -780,6 +783,7 @@ class RopeScalingType(Enum):
     NONE   = 'none'
     LINEAR = 'linear'
     YARN   = 'yarn'
+    SU     = 'su'
 
 
 class PoolingType(IntEnum):
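
With Phi-3 registered under the architecture name "phi3" (assumed here from llama.cpp's MODEL_ARCH_NAMES mapping), the three new key templates expand to concrete GGUF metadata keys. A minimal sketch:

from gguf.constants import Keys

arch = "phi3"  # assumed architecture name for Phi-3 models
print(Keys.Rope.SCALING_LONG_FACTORS.format(arch=arch))   # phi3.rope.scaling.freq_long_factors
print(Keys.Rope.SCALING_SHORT_FACTORS.format(arch=arch))  # phi3.rope.scaling.freq_short_factors
print(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=arch))    # phi3.rope.scaling.attn_factor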

gguf-py/gguf/gguf_writer.py

@@ -433,6 +433,15 @@ class GGUFWriter:
     def add_rope_scaling_factor(self, value: float) -> None:
         self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)
 
+    def add_rope_scaling_freq_long_factors(self, value: Sequence[float]) -> None:
+        self.add_array(Keys.Rope.SCALING_LONG_FACTORS.format(arch=self.arch), value)
+
+    def add_rope_scaling_freq_short_factors(self, value: Sequence[float]) -> None:
+        self.add_array(Keys.Rope.SCALING_SHORT_FACTORS.format(arch=self.arch), value)
+
+    def add_rope_scaling_attn_factors(self, value: float) -> None:
+        self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value)
+
     def add_rope_scaling_orig_ctx_len(self, value: int) -> None:
         self.add_uint32(Keys.Rope.SCALING_ORIG_CTX_LEN.format(arch=self.arch), value)
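
A minimal usage sketch of the new writer methods, assuming arch "phi3" and rope_dims = 96 (so rope_dims // 2 = 48 long and 48 short factors); the output path, attn factor, and factor lists are purely illustrative, and a real converter would also write the architecture, tensors, and remaining metadata before finalizing the file:

from gguf import GGUFWriter

writer = GGUFWriter("phi3-mini-128k.gguf", arch="phi3")  # illustrative output path
writer.add_rope_scaling_orig_ctx_len(4096)               # original (pre-extension) context length
writer.add_rope_scaling_attn_factors(1.1902)             # 'su' factor as computed above
writer.add_rope_scaling_freq_long_factors([1.0] * 48)    # placeholder factors, rope_dims // 2 each
writer.add_rope_scaling_freq_short_factors([1.0] * 48)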