convert : fix layer names

This commit is contained in:
Georgi Gerganov 2023-08-14 21:06:07 +03:00
parent 01080a5a51
commit c35fc0bbb0
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
3 changed files with 57 additions and 57 deletions

View file

@ -95,7 +95,7 @@ else:
gguf_writer.add_architecture(llm_arch) gguf_writer.add_architecture(llm_arch)
gguf_writer.add_name(last_dir) gguf_writer.add_name(last_dir)
gguf_writer.add_file_type( "All tensors F32" if ftype == 0 else "Most tensors F16, some F32") gguf_writer.add_file_type("All tensors F32" if ftype == 0 else "Most tensors F16, some F32")
gguf_writer.add_source_hf_repo(hf_repo) gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_context_length(llm_arch, hparams["max_position_embeddings"]) gguf_writer.add_context_length(llm_arch, hparams["max_position_embeddings"])
gguf_writer.add_embedding_length(llm_arch, hparams["hidden_size"]) gguf_writer.add_embedding_length(llm_arch, hparams["hidden_size"])

View file

@ -626,7 +626,7 @@ struct gguf_file_loader {
hparams.n_embd = read_u32("llama.embedding_length"); hparams.n_embd = read_u32("llama.embedding_length");
hparams.n_ff = read_u32("llama.feed_forward_length"); hparams.n_ff = read_u32("llama.feed_forward_length");
hparams.n_head = read_u32("llama.attention.head_count"); hparams.n_head = read_u32("llama.attention.head_count");
hparams.n_layer = read_u32("llama.layer_count"); hparams.n_layer = read_u32("llama.block_count");
hparams.n_rot = read_u32("llama.rope.dimension_count"); hparams.n_rot = read_u32("llama.rope.dimension_count");
hparams.f_rms_norm_eps = read_f32("llama.attention.layer_norm_rms_epsilon"); hparams.f_rms_norm_eps = read_f32("llama.attention.layer_norm_rms_epsilon");
@ -1373,7 +1373,7 @@ static void llama_model_load_internal(
ml->ggml_ctx = ctx; ml->ggml_ctx = ctx;
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU); model.tok_embeddings = ml->get_tensor("token_embd.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
// "output" tensor // "output" tensor
{ {
@ -1394,8 +1394,8 @@ static void llama_model_load_internal(
backend_output = GGML_BACKEND_CPU; backend_output = GGML_BACKEND_CPU;
} }
model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm); model.norm = ml->get_tensor("output_norm.weight", {n_embd}, backend_norm);
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output); model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
if (backend_norm == GGML_BACKEND_GPU) { if (backend_norm == GGML_BACKEND_GPU) {
vram_weights += ggml_nbytes(model.norm); vram_weights += ggml_nbytes(model.norm);
} }
@ -1413,20 +1413,20 @@ static void llama_model_load_internal(
auto & layer = model.layers[i]; auto & layer = model.layers[i];
std::string layers_i = "layers." + std::to_string(i); std::string layers_i = "blk." + std::to_string(i);
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend); layer.attention_norm = ml->get_tensor(layers_i + ".attn_norm.weight", {n_embd}, backend);
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split); layer.wq = ml->get_tensor(layers_i + ".attn_q.weight", {n_embd, n_embd}, backend_split);
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split); layer.wk = ml->get_tensor(layers_i + ".attn_k.weight", {n_embd, n_embd_gqa}, backend_split);
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split); layer.wv = ml->get_tensor(layers_i + ".attn_v.weight", {n_embd, n_embd_gqa}, backend_split);
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split); layer.wo = ml->get_tensor(layers_i + ".attn_output.weight", {n_embd, n_embd}, backend_split);
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend); layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split); layer.w1 = ml->get_tensor(layers_i + ".ffn_gate.weight", {n_embd, n_ff}, backend_split);
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split); layer.w2 = ml->get_tensor(layers_i + ".ffn_down.weight", { n_ff, n_embd}, backend_split);
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split); layer.w3 = ml->get_tensor(layers_i + ".ffn_up.weight", {n_embd, n_ff}, backend_split);
if (backend == GGML_BACKEND_GPU) { if (backend == GGML_BACKEND_GPU) {
vram_weights += vram_weights +=

View file

@ -4,92 +4,92 @@ def get_tensor_namemap( n_blocks : int):
tensor_map = {} tensor_map = {}
# Token embeddings # Token embeddings
mapped_to = "token_embd" mapped_to = "token_embd"
tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox
tensor_map["transformer.wte"] = mapped_to # gpt2 mpt tensor_map["transformer.wte"] = mapped_to # gpt2 mpt
tensor_map["transformer.word_embeddings"] = mapped_to # falcon tensor_map["transformer.word_embeddings"] = mapped_to # falcon
tensor_map["model.embed_tokens"] = mapped_to # llama-hf tensor_map["model.embed_tokens"] = mapped_to # llama-hf
tensor_map["tok_embeddings"] = mapped_to # llama-pth tensor_map["tok_embeddings"] = mapped_to # llama-pth
# Position embeddings # Position embeddings
mapped_to = "pos_embd" mapped_to = "pos_embd"
tensor_map["transformer.wpe"] = mapped_to # gpt2 tensor_map["transformer.wpe"] = mapped_to # gpt2
# Output norm # Output norm
mapped_to = "output_norm" mapped_to = "output_norm"
tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox
tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon
tensor_map["transformer.norm_f"] = mapped_to # mpt tensor_map["transformer.norm_f"] = mapped_to # mpt
tensor_map["model.norm"] = mapped_to # llama-hf tensor_map["model.norm"] = mapped_to # llama-hf
tensor_map["norm"] = mapped_to # llama-pth tensor_map["norm"] = mapped_to # llama-pth
# Output # Output
mapped_to = "output" mapped_to = "output"
tensor_map["embed_out"] = mapped_to # gptneox tensor_map["embed_out"] = mapped_to # gptneox
tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf
tensor_map["output"] = mapped_to # llama-pth tensor_map["output"] = mapped_to # llama-pth
# Attention and fee-forward layer blocks # Attention and fee-forward layer blocks
for i in range(0,n_blocks): for i in range(0,n_blocks):
# Attention norm # Attention norm
mapped_to = "blk."+str(i)+".attn_norm" mapped_to = "blk."+str(i)+".attn_norm"
tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2 tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b
tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b
tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth
# Attention norm 2 # Attention norm 2
mapped_to = "blk."+str(i)+".attn_norm_2" mapped_to = "blk."+str(i)+".attn_norm_2"
tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b
# Attention query-key-value # Attention query-key-value
mapped_to = "blk."+str(i)+".attn_qkv" mapped_to = "blk."+str(i)+".attn_qkv"
tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2 tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon
# Attention query # Attention query
mapped_to = "blk."+str(i)+".attn_q" mapped_to = "blk."+str(i)+".attn_q"
tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth
# Attention key # Attention key
mapped_to = "blk."+str(i)+".attn_k" mapped_to = "blk."+str(i)+".attn_k"
tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth
# Attention value # Attention value
mapped_to = "blk."+str(i)+".attn_v" mapped_to = "blk."+str(i)+".attn_v"
tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth
# Attention output # Attention output
mapped_to = "blk."+str(i)+".attn_output" mapped_to = "blk."+str(i)+".attn_output"
tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2 tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth
# Feed-forward norm # Feed-forward norm
mapped_to = "blk."+str(i)+".ffn_norm" mapped_to = "blk."+str(i)+".ffn_norm"
tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2 tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt
tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth
# Feed-forward up # Feed-forward up
mapped_to = "blk."+str(i)+".ffn_up" mapped_to = "blk."+str(i)+".ffn_up"
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2 tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth
# Feed-forward gate # Feed-forward gate
mapped_to = "blk."+str(i)+".ffn_gate" mapped_to = "blk."+str(i)+".ffn_gate"
tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth
# Feed-forward down # Feed-forward down
mapped_to = "blk."+str(i)+".ffn_down" mapped_to = "blk."+str(i)+".ffn_down"
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2 tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth
return tensor_map return tensor_map