un-hardcode max-alibi-bias
parent f42285f0e5
commit 2d4de517bb

2 changed files with 20 additions and 5 deletions
@@ -2938,6 +2938,7 @@ class T5Model(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
 @Model.register("JAISLMHeadModel")
 class JaisModel(Model):
     model_arch = gguf.MODEL_ARCH.JAIS
+
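For context, @Model.register is the converter's decorator-based registry: it maps the architecture string from the checkpoint's config.json ("JAISLMHeadModel") to the Model subclass that converts it. A minimal stand-in for that pattern (simplified stand-in names, not the converter's exact code):

_model_classes: dict[str, type] = {}

def register(*names: str):
    def wrapper(cls: type) -> type:
        for name in names:
            _model_classes[name] = cls  # e.g. "JAISLMHeadModel" -> JaisModel
        return cls
    return wrapper

@register("JAISLMHeadModel")
class JaisModel:
    pass

assert _model_classes["JAISLMHeadModel"] is JaisModel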
@@ -2984,13 +2985,27 @@ class JaisModel(Model):
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
         self.gguf_writer.add_file_type(self.ftype)
 
+        # Hack to populate self.tensor_names
+        all(self.get_tensors())
+        if 'transformer.relative_pe.slopes' not in self.tensor_names:
+            self.gguf_writer.add_max_alibi_bias(8.0)
+        # else set later
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
 
         tensors: list[tuple[str, Tensor]] = []
 
         # we don't need these
-        if name.endswith((".attn.bias", "relative_pe.slopes")):
+        if name.endswith((".attn.bias")):
+            return tensors
+
+        if name.endswith(("relative_pe.slopes")):
+            # calculate ALiBi bias
+            n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
+            first_val = float(data_torch._data[0])
+            alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
+            self.gguf_writer.add_max_alibi_bias(alibi_bias)
             return tensors
 
         if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
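Two details of the Python hunk above are easy to miss. First, self.tensor_names is only populated as a side effect of iterating get_tensors(), so all(self.get_tensors()) is there purely to drain the generator; the (name, tensor) tuples it yields are non-empty and therefore truthy, so all() never short-circuits. A minimal sketch of the idiom (stand-in names, not the converter's code):

tensor_names: set[str] = set()

def get_tensors():
    # the real method walks the checkpoint shards, recording names as it goes
    for name in ("transformer.wte.weight", "transformer.relative_pe.slopes"):
        tensor_names.add(name)  # the side effect we actually want
        yield name, object()    # truthy tuples, so all() keeps iterating

all(get_tensors())  # drains the generator; the boolean result is discarded
assert "transformer.relative_pe.slopes" in tensor_names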
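Second, the inversion itself. Under the usual ALiBi schedule the first head's slope is 2^(-max_bias / n), where n is n_head rounded down to a power of two, so max_bias = -log2(slopes[0]) * n, which is exactly what the hunk recovers from relative_pe.slopes. A self-contained check, assuming that slope layout and a hypothetical head count:

import math

def first_alibi_slope(n_head: int, max_bias: float = 8.0) -> float:
    # head 0 slope under the standard ALiBi schedule: 2^(-max_bias / n)
    n = 2 ** math.floor(math.log2(n_head))
    return 2.0 ** (-max_bias / n)

n_head = 40                                # hypothetical JAIS head count
n = 2 ** math.floor(math.log2(n_head))     # -> 32
slope0 = first_alibi_slope(n_head)         # -> 2 ** -0.25
assert -round(math.log2(slope0) * n) == 8  # recovers max_alibi_bias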
@@ -4902,8 +4902,8 @@ static void llm_load_hparams(
         case LLM_ARCH_JAIS:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                // TODO: become GGUF KV parameter
-                hparams.f_max_alibi_bias = 8.0f;
+                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
 
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1_3B; break;
                     case 40: model.type = e_model::MODEL_13B; break;
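On the loader side, the two-argument ml.get_key overload defaults to required, so a GGUF converted before this change (which lacks the key) would now fail to load rather than silently fall back to 8.0f; that is why the converter above always writes the key, either the 8.0 default or the value recovered from the slopes. The key lands in GGUF metadata as {arch}.attention.max_alibi_bias, i.e. jais.attention.max_alibi_bias here. A hedged sketch of checking the round-trip with gguf-py (path hypothetical; scalar decoding per my reading of ReaderField, where data holds indices into parts):

from gguf import GGUFReader

reader = GGUFReader("jais-13b.gguf")  # hypothetical converted model
field = reader.get_field("jais.attention.max_alibi_bias")
if field is None:
    print("key missing: file predates this commit")
else:
    print(float(field.parts[field.data[0]][0]))  # e.g. 8.0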