add phi3 128k support in convert-hf-to-gguf
This commit is contained in:
parent
11474e756d
commit
8fa413d8b5
3 changed files with 59 additions and 11 deletions
|
@ -14,6 +14,7 @@ from pathlib import Path
|
|||
from hashlib import sha256
|
||||
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
|
||||
|
||||
import math
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
|
@ -1784,23 +1785,57 @@ class Phi3MiniModel(Model):
|
|||
def set_gguf_parameters(self):
    """Write the Phi-3 hyperparameters to ``self.gguf_writer``.

    The base metadata (context length, original context length, embedding
    and feed-forward sizes, block and head counts, RMS-norm epsilon, rope
    dimension count, rope frequency base and file type) is always written.
    When the model config has a ``rope_scaling`` entry (the long-context
    128k variants), the attention factor and the long/short rope frequency
    factors are written as well.

    Raises:
        KeyError: ``rope_scaling`` lacks ``type``, ``long_factor`` or
            ``short_factor``.
        NotImplementedError: the scaling type is neither ``su`` nor ``yarn``.
        ValueError: the long and short factor lists do not each contain
            ``rope_dims // 2`` entries.
    """
    block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
    n_embd = self.find_hparam(["hidden_size", "n_embd"])
    n_head = self.find_hparam(["num_attention_heads", "n_head"])
    rms_eps = self.find_hparam(["rms_norm_eps"])
    max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
    orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
    rope_dims = n_embd // n_head

    self.gguf_writer.add_name("Phi3")
    self.gguf_writer.add_context_length(max_pos_embds)
    self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
    self.gguf_writer.add_embedding_length(n_embd)
    # Read the feed-forward size from the config instead of hard-coding it.
    self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
    self.gguf_writer.add_block_count(block_count)
    self.gguf_writer.add_head_count(n_head)
    self.gguf_writer.add_head_count_kv(n_head)
    self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
    self.gguf_writer.add_rope_dimension_count(rope_dims)
    self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
    self.gguf_writer.add_file_type(self.ftype)

    # write rope scaling for long context (128k) model
    rope_scaling = self.find_hparam(['rope_scaling'], True)
    if rope_scaling is None:
        return

    scale = max_pos_embds / orig_max_pos_embds

    # `or ''` also covers an explicit null type, so it reports the same
    # KeyError as a missing one instead of failing on `.lower()`.
    rope_scaling_type = (rope_scaling.get('type') or '').lower()
    if not rope_scaling_type:
        raise KeyError('Missing the required key rope_scaling.type')

    # The attention factor only deviates from 1.0 when the context is
    # actually extended (scale > 1).
    if rope_scaling_type == 'su':
        attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
    elif rope_scaling_type == 'yarn':
        attn_factor = (0.1 * math.log(scale) + 1.0) if scale > 1.0 else 1.0
    else:
        raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet')

    self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)

    long_factors = rope_scaling.get('long_factor', None)
    short_factors = rope_scaling.get('short_factor', None)

    if long_factors is None or short_factors is None:
        raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling.short_factor')

    # Integer division keeps the comparison and the message in whole
    # numbers (e.g. "48" rather than "48.0").
    expected_len = rope_dims // 2
    if len(long_factors) != expected_len or len(short_factors) != expected_len:
        raise ValueError(f'The length of rope long and short factors must be {expected_len}')

    self.gguf_writer.add_rope_scaling_freq_long_factors(long_factors)
    self.gguf_writer.add_rope_scaling_freq_short_factors(short_factors)
|
||||
|
||||
@Model.register("PlamoForCausalLM")
|
||||
class PlamoModel(Model):
|
||||
|
|
|
@ -57,12 +57,15 @@ class Keys:
|
|||
CAUSAL = "{arch}.attention.causal"
|
||||
|
||||
class Rope:
    """Key-name templates for rope (rotary position embedding) metadata.

    Each value contains an ``{arch}`` placeholder that is filled in with
    ``str.format(arch=...)`` before the key is written.
    """

    DIMENSION_COUNT = "{arch}.rope.dimension_count"
    FREQ_BASE = "{arch}.rope.freq_base"
    SCALING_TYPE = "{arch}.rope.scaling.type"
    SCALING_FACTOR = "{arch}.rope.scaling.factor"
    # Keys added for long-context rope scaling: long/short frequency
    # factors and the attention factor.
    SCALING_LONG_FACTORS = "{arch}.rope.scaling.freq_long_factors"
    SCALING_SHORT_FACTORS = "{arch}.rope.scaling.freq_short_factors"
    SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
    SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
    SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
|
||||
|
||||
class SSM:
|
||||
CONV_KERNEL = "{arch}.ssm.conv_kernel"
|
||||
|
@ -780,6 +783,7 @@ class RopeScalingType(Enum):
|
|||
NONE = 'none'
|
||||
LINEAR = 'linear'
|
||||
YARN = 'yarn'
|
||||
SU = 'su'
|
||||
|
||||
|
||||
class PoolingType(IntEnum):
|
||||
|
|
|
@ -433,6 +433,15 @@ class GGUFWriter:
|
|||
def add_rope_scaling_factor(self, value: float) -> None:
    """Write ``value`` as a float32 under the rope scaling-factor key for this writer's arch."""
    key = Keys.Rope.SCALING_FACTOR.format(arch=self.arch)
    self.add_float32(key, value)
|
||||
|
||||
def add_rope_scaling_freq_long_factors(self, value: Sequence[float]) -> None:
    """Write ``value`` as an array under the rope long-frequency-factors key for this writer's arch."""
    key = Keys.Rope.SCALING_LONG_FACTORS.format(arch=self.arch)
    self.add_array(key, value)
|
||||
|
||||
def add_rope_scaling_freq_short_factors(self, value: Sequence[float]) -> None:
    """Write ``value`` as an array under the rope short-frequency-factors key for this writer's arch."""
    key = Keys.Rope.SCALING_SHORT_FACTORS.format(arch=self.arch)
    self.add_array(key, value)
|
||||
|
||||
def add_rope_scaling_attn_factors(self, value: float) -> None:
    """Write the rope scaling attention factor as a single float32.

    Despite the plural method name, ``value`` is one scalar: it is passed
    straight to ``add_float32``, so the parameter is annotated ``float``
    rather than a sequence.
    """
    self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value)
|
||||
|
||||
def add_rope_scaling_orig_ctx_len(self, value: int) -> None:
    """Write ``value`` as a uint32 under the rope original-context-length key for this writer's arch."""
    key = Keys.Rope.SCALING_ORIG_CTX_LEN.format(arch=self.arch)
    self.add_uint32(key, value)
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue