diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 47a108779..539808bcc 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -186,8 +186,6 @@ class MODEL_TENSOR(IntEnum):
     ATTN_Q_NORM    = auto()
     ATTN_K_NORM    = auto()
     LAYER_OUT_NORM = auto()
-    LAYER_NORM_1   = auto()
-    LAYER_NORM_2   = auto()
     SSM_IN         = auto()
     SSM_CONV1D     = auto()
     SSM_X          = auto()
@@ -276,8 +274,6 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.FFN_DOWN_EXP:   "blk.{bid}.ffn_down_exps",
     MODEL_TENSOR.FFN_UP_EXP:     "blk.{bid}.ffn_up_exps",
     MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
-    MODEL_TENSOR.LAYER_NORM_1:   "blk.{bid}.layer_norm_1",
-    MODEL_TENSOR.LAYER_NORM_2:   "blk.{bid}.layer_norm_2",
     MODEL_TENSOR.SSM_IN:         "blk.{bid}.ssm_in",
     MODEL_TENSOR.SSM_CONV1D:     "blk.{bid}.ssm_conv1d",
     MODEL_TENSOR.SSM_X:          "blk.{bid}.ssm_x",
@@ -430,8 +426,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.LAYER_OUT_NORM,
-        MODEL_TENSOR.LAYER_NORM_1,
-        MODEL_TENSOR.LAYER_NORM_2,
+        MODEL_TENSOR.ATTN_NORM_2,
     ],
     MODEL_ARCH.MPT: [
         MODEL_TENSOR.TOKEN_EMBD,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index ea139339c..81b4992a5 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -102,6 +102,7 @@ class TensorNameMap:
         # Attention norm 2
         MODEL_TENSOR.ATTN_NORM_2: (
             "transformer.h.{bid}.ln_attn", # falcon40b
+            "encoder.layer.{bid}.layer_norm_1", # jina-v2-code
         ),
 
         # Attention query-key-value
@@ -351,20 +352,9 @@ class TensorNameMap:
             "encoder.layers.{bid}.norm2",                 # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
             "encoder.layer.{bid}.mlp.layernorm",          # jina-bert-v2
-            "encoder.layer.{bid}.layer_norm_1",           # jina-v2-code
             "encoder.layer.{bid}.layer_norm_2"            # jina-v2-code
         ),
 
-
-        MODEL_TENSOR.LAYER_NORM_1: (
-            "encoder.layer.{bid}.layer_norm_1", # jina-v2-code
-        ),
-
-
-        MODEL_TENSOR.LAYER_NORM_2: (
-            "encoder.layer.{bid}.layer_norm_2", # jina-v2-code
-        ),
-
         MODEL_TENSOR.SSM_IN: (
             "model.layers.{bid}.in_proj",
             "backbone.layers.{bid}.mixer.in_proj",
diff --git a/llama.cpp b/llama.cpp
index 229b63a29..4662f1fdd 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -496,8 +496,6 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_KV_B,
     LLM_TENSOR_ATTN_Q_A_NORM,
     LLM_TENSOR_ATTN_KV_A_NORM,
-    LLM_TENSOR_LAYER_NORM_1,
-    LLM_TENSOR_LAYER_NORM_2,
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -719,8 +717,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
-            { LLM_TENSOR_LAYER_NORM_1,    "blk.%d.layer_norm_1" },
-            { LLM_TENSOR_LAYER_NORM_2,    "blk.%d.layer_norm_2" },
+            { LLM_TENSOR_ATTN_NORM_2,     "blk.%d.attn_norm_2" },
         },
     },
     {
@@ -2014,12 +2011,6 @@ struct llama_layer {
     struct ggml_tensor * layer_out_norm_b;
     struct ggml_tensor * ffn_norm_exps;
 
-    // extra normalization layers needed by `jina-embeddings-v2-base-code`
-    struct ggml_tensor * layer_norm_1;
-    struct ggml_tensor * layer_norm_1_b;
-    struct ggml_tensor * layer_norm_2;
-    struct ggml_tensor * layer_norm_2_b;
-
     // ff
     struct ggml_tensor * ffn_gate; // w1
     struct ggml_tensor * ffn_down; // w2
@@ -4680,7 +4671,8 @@ static void llm_load_vocab(
                 tokenizer_pre == "jina-es" ||
                 tokenizer_pre == "jina-de" ||
                 tokenizer_pre == "jina-v2-es" ||
-                tokenizer_pre == "jina-v2-de") {
+                tokenizer_pre == "jina-v2-de" ||
+                tokenizer_pre == "jina-v2-code") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "refact") {
@@ -5547,12 +5539,9 @@ static bool llm_load_tensors(
                     layer.attn_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
                     layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i),   {n_embd});
 
-                    layer.layer_norm_1   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_NORM_1, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    layer.layer_norm_1_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_NORM_1, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                    layer.layer_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    layer.layer_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_NORM_2, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
                     layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
                     layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
@@ -8516,12 +8505,8 @@ struct llm_build_context {
             // attention layer norm
             cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
 
-            if (model.layers[il].layer_norm_1 != nullptr) {
-                cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_norm_1, model.layers[il].layer_norm_1_b, LLM_NORM, cb, il);
-            }
-
-            if (model.layers[il].layer_norm_2 != nullptr) {
-                cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_norm_2, model.layers[il].layer_norm_2_b, LLM_NORM, cb, il);
+            if (model.layers[il].attn_norm_2 != nullptr) {
+                cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
             }
 
             struct ggml_tensor * ffn_inp = cur;
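For context, a minimal sketch (not part of the patch) of what the gguf-py change means in practice: with MODEL_TENSOR.ATTN_NORM_2 in the jina arch list and the new key in TensorNameMap, the jina-v2-code checkpoint tensor resolves to the generic attn_norm_2 name instead of the removed layer_norm_1/layer_norm_2 types. Assumes gguf-py from this tree is importable and that MODEL_ARCH.JINA_BERT_V2 is the architecture these checkpoints convert to.

# Illustrative sketch only, not part of the diff above.
from gguf.constants import MODEL_ARCH
from gguf.tensor_mapping import get_tensor_name_map

n_blocks = 4  # arbitrary block count, just for the example
name_map = get_tensor_name_map(MODEL_ARCH.JINA_BERT_V2, n_blocks)

# try_suffixes strips ".weight"/".bias" before the lookup and re-appends it.
print(name_map.get_name("encoder.layer.0.layer_norm_1.weight",
                        try_suffixes=(".weight", ".bias")))
# expected to print: blk.0.attn_norm_2.weight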