diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 4a8aec8cd..44061e286 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -2972,6 +2972,8 @@ class JaisModel(Model):
         else:
             assert False
 
+        self.max_alibi_bias = 8.0
+
     def set_vocab(self):
         self._set_vocab_gpt2()
 
@@ -2985,12 +2987,6 @@ class JaisModel(Model):
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
         self.gguf_writer.add_file_type(self.ftype)
 
-        # Hack to populate self.tensor_names
-        all(self.get_tensors())
-        if 'transformer.relative_pe.slopes' not in self.tensor_names:
-            self.gguf_writer.add_max_alibi_bias(8.0)
-        # else set later
-
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
 
@@ -3001,11 +2997,14 @@ class JaisModel(Model):
             return tensors
 
         if name.endswith(("relative_pe.slopes")):
-            # calculate ALiBi bias
+            # Calculate the max ALiBi bias (this is the inverse of the ALiBi slope calculation).
+            # Some other models have max_alibi_bias spelled out explicitly in their hyperparams,
+            # but Jais's PyTorch model simply precalculates the slope values and places them
+            # in relative_pe.slopes.
            n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
             first_val = float(data_torch._data[0])
-            alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
-            self.gguf_writer.add_max_alibi_bias(alibi_bias)
+            self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
+
             return tensors
 
         if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
@@ -3025,6 +3024,10 @@ class JaisModel(Model):
 
         return tensors
 
+    def write_tensors(self):
+        super().write_tensors()
+        self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
+
 
 ###### CONVERSION LOGIC ######
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 8549388f1..e5907ac50 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6942,6 +6942,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_BITNET:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
                 // output
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
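For reference, here is a minimal sketch of the slope/max-bias round trip that the new comment in `modify_tensors` describes. The helper names (`alibi_slopes`, `recover_max_bias`) are illustrative, not part of the converter, and the demo restricts itself to a power-of-two head count; the converter's `n_head_closest_log2` handles the general case by rounding `n_head` down to the closest power of two before inverting.

```python
# Sketch only, not part of the patch: forward ALiBi slope schedule and its
# inversion back to max_alibi_bias, assuming a power-of-two head count.
import math


def alibi_slopes(n_head: int, max_bias: float = 8.0) -> list[float]:
    # Forward direction (what Jais's PyTorch model precomputes): the standard
    # ALiBi schedule slope_i = 2^(-max_bias * i / n) for i = 1..n_head, where
    # n is n_head rounded down to the closest power of two.
    n = 2 ** math.floor(math.log2(n_head))
    return [2.0 ** (-max_bias * i / n) for i in range(1, n_head + 1)]


def recover_max_bias(first_slope: float, n_head: int) -> int:
    # Inverse direction (what modify_tensors does): since
    # slope_1 = 2^(-max_bias / n), we get max_bias = -log2(slope_1) * n.
    n = 2 ** math.floor(math.log2(n_head))
    return -round(math.log2(first_slope) * n)


slopes = alibi_slopes(32)                    # slope_1 = 2^(-8/32) ≈ 0.8409
assert recover_max_bias(slopes[0], 32) == 8  # recovers max_alibi_bias = 8
```

Because only the first slope is needed for the inversion, the recovered value is exact for any model whose slopes follow this schedule, which is why the patch can defer `add_max_alibi_bias` until `write_tensors` has seen (or not seen) `relative_pe.slopes`.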