Add Nemotron/Minitron GGUF Conversion & Inference Support (#8922)

* Add nemotron GGUF conversion & inference support

* Fix formatting issues

* Remove unnecessary write_tensors()

* Update convert_hf_to_gguf.py

Co-authored-by: compilade <git@compilade.net>

* Update src/llama.cpp

Co-authored-by: compilade <git@compilade.net>

* Address comments by @compilade

* Replace ggml_mul_mat()->llm_build_lora_mm()

* Remove mutable variable

* Use  for bias tensors

* Cover corner case for role_scaling not in config.json

---------

Co-authored-by: compilade <git@compilade.net>
This commit is contained in:
Yoshi Suhara 2024-08-15 19:23:33 -07:00 committed by GitHub
parent e3f6fd56b1
commit 2a24c8caa6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 271 additions and 11 deletions

View file

@ -13,7 +13,7 @@ class TensorNameMap:
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais
"transformer.word_embeddings", # falcon
"word_embeddings", # bloom
"model.embed_tokens", # llama-hf
"model.embed_tokens", # llama-hf nemotron
"tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert nomic-bert
"language_model.embedding.word_embeddings", # persimmon
@ -52,7 +52,7 @@ class TensorNameMap:
# Output
MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron
"output", # llama-pth bloom internlm2
"word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2
@ -75,6 +75,7 @@ class TensorNameMap:
"transformer.rms_norm", # Grok
"encoder.final_layernorm", # chatglm
"transformer.norm", # openelm
"model.norm", # nemotron
),
# Rope frequencies
@ -93,7 +94,7 @@ class TensorNameMap:
"transformer.h.{bid}.input_layernorm", # falcon7b
"h.{bid}.input_layernorm", # bloom
"transformer.h.{bid}.ln_mlp", # falcon40b
"model.layers.{bid}.input_layernorm", # llama-hf
"model.layers.{bid}.input_layernorm", # llama-hf nemotron
"layers.{bid}.attention_norm", # llama-pth
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
"model.layers.{bid}.ln1", # yi
@ -135,7 +136,7 @@ class TensorNameMap:
# Attention query
MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", # llama-hf
"model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron
"layers.{bid}.attention.wq", # llama-pth
"encoder.layer.{bid}.attention.self.query", # bert
"transformer.h.{bid}.attn.q_proj", # gpt-j
@ -146,7 +147,7 @@ class TensorNameMap:
# Attention key
MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", # llama-hf
"model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron
"layers.{bid}.attention.wk", # llama-pth
"encoder.layer.{bid}.attention.self.key", # bert
"transformer.h.{bid}.attn.k_proj", # gpt-j
@ -158,7 +159,7 @@ class TensorNameMap:
# Attention value
MODEL_TENSOR.ATTN_V: (
"model.layers.{bid}.self_attn.v_proj", # llama-hf
"model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron
"layers.{bid}.attention.wv", # llama-pth
"encoder.layer.{bid}.attention.self.value", # bert
"transformer.h.{bid}.attn.v_proj", # gpt-j
@ -175,7 +176,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.attn.out_proj", # mpt
"transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", # bloom
"model.layers.{bid}.self_attn.o_proj", # llama-hf
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron
"layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert
"transformer.h.{bid}.attn.out_proj", # gpt-j
@ -218,7 +219,7 @@ class TensorNameMap:
"transformer.h.{bid}.ln_2", # gpt2 refact qwen jais
"h.{bid}.post_attention_layernorm", # bloom
"transformer.blocks.{bid}.norm_2", # mpt
"model.layers.{bid}.post_attention_layernorm", # llama-hf
"model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron
"layers.{bid}.ffn_norm", # llama-pth
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
"model.layers.{bid}.ln2", # yi
@ -258,7 +259,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.ffn.up_proj", # mpt
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
"h.{bid}.mlp.dense_h_to_4h", # bloom
"model.layers.{bid}.mlp.up_proj", # llama-hf refact
"model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron
"layers.{bid}.feed_forward.w3", # llama-pth
"encoder.layer.{bid}.intermediate.dense", # bert
"transformer.h.{bid}.mlp.fc_in", # gpt-j
@ -329,7 +330,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.ffn.down_proj", # mpt
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
"h.{bid}.mlp.dense_4h_to_h", # bloom
"model.layers.{bid}.mlp.down_proj", # llama-hf
"model.layers.{bid}.mlp.down_proj", # llama-hf nemotron
"layers.{bid}.feed_forward.w2", # llama-pth
"encoder.layer.{bid}.output.dense", # bert
"transformer.h.{bid}.mlp.fc_out", # gpt-j