Add Nemotron/Minitron GGUF Conversion & Inference Support (#8922)
* Add nemotron GGUF conversion & inference support * Fix formatting issues * Remove unnecessary write_tensors() * Update convert_hf_to_gguf.py Co-authored-by: compilade <git@compilade.net> * Update src/llama.cpp Co-authored-by: compilade <git@compilade.net> * Address comments by @compilade * Replace ggml_mul_mat()->llm_build_lora_mm() * Remove mutable variable * Use for bias tensors * Cover corner case for role_scaling not in config.json --------- Co-authored-by: compilade <git@compilade.net>
This commit is contained in:
parent
e3f6fd56b1
commit
2a24c8caa6
4 changed files with 271 additions and 11 deletions
|
@ -3740,6 +3740,47 @@ class ChatGLMModel(Model):
|
|||
name = name.removeprefix("transformer.")
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@Model.register("NemotronForCausalLM")
|
||||
class NemotronModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.NEMOTRON
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_sentencepiece()
|
||||
self.gguf_writer.add_pad_token_id(0)
|
||||
self.gguf_writer.add_unk_token_id(1)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
|
||||
f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
|
||||
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
|
||||
|
||||
# * Partial RoPE
|
||||
rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
|
||||
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
||||
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
||||
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
|
||||
|
||||
# * RopeScaling for Nemotron
|
||||
if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
else:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
|
||||
# model.layers.{l}.input_layernorm.weight
|
||||
# model.layers.{l}.post_attention_layernorm.weight
|
||||
# model.norm.weight
|
||||
if name.endswith("norm.weight"):
|
||||
data_torch = data_torch + 1
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue