diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 2ba675ef0..1f7515e7c 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -2718,8 +2718,8 @@ class OlmoModel(Model):
 
 
 @Model.register("JinaBertModel", "JinaBertForMaskedLM")
-class JinaBertModel(BertModel):
-    model_arch = gguf.MODEL_ARCH.JINA_BERT
+class JinaBertV2Model(BertModel):
+    model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 702842ffe..71039fabb 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -118,7 +118,7 @@ class MODEL_ARCH(IntEnum):
     REFACT       = auto()
     BERT         = auto()
     NOMIC_BERT   = auto()
-    JINA_BERT    = auto()
+    JINA_BERT_V2 = auto()
     BLOOM        = auto()
     STABLELM     = auto()
     QWEN         = auto()
@@ -195,7 +195,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.REFACT:         "refact",
     MODEL_ARCH.BERT:           "bert",
     MODEL_ARCH.NOMIC_BERT:     "nomic-bert",
-    MODEL_ARCH.JINA_BERT:      "jina-bert",
+    MODEL_ARCH.JINA_BERT_V2:   "jina-bert-v2",
     MODEL_ARCH.BLOOM:          "bloom",
     MODEL_ARCH.STABLELM:       "stablelm",
     MODEL_ARCH.QWEN:           "qwen",
@@ -380,7 +380,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.LAYER_OUT_NORM,
     ],
-    MODEL_ARCH.JINA_BERT: [
+    MODEL_ARCH.JINA_BERT_V2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.TOKEN_EMBD_NORM,
         MODEL_TENSOR.TOKEN_TYPES,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 45a68fc06..8531b2f73 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -238,7 +238,7 @@ class TensorNameMap:
             "model.layers.{bid}.feed_forward.w3",           # internlm2
             "encoder.layers.{bid}.mlp.fc11",                # nomic-bert
             "model.layers.{bid}.mlp.c_fc",                  # starcoder2
-            "encoder.layer.{bid}.mlp.gated_layers_v",       # jina-bert
+            "encoder.layer.{bid}.mlp.gated_layers_v",       # jina-bert-v2
         ),
 
         MODEL_TENSOR.FFN_UP_EXP: (
@@ -265,7 +265,7 @@ class TensorNameMap:
             "model.layers.layers.{bid}.mlp.gate_proj",      # plamo
             "model.layers.{bid}.feed_forward.w1",           # internlm2
             "encoder.layers.{bid}.mlp.fc12",                # nomic-bert
-            "encoder.layer.{bid}.mlp.gated_layers_w",       # jina-bert
+            "encoder.layer.{bid}.mlp.gated_layers_w",       # jina-bert-v2
         ),
 
         MODEL_TENSOR.FFN_GATE_EXP: (
@@ -299,7 +299,7 @@ class TensorNameMap:
             "model.layers.{bid}.feed_forward.w2",           # internlm2
             "encoder.layers.{bid}.mlp.fc2",                 # nomic-bert
             "model.layers.{bid}.mlp.c_proj",                # starcoder2
-            "encoder.layer.{bid}.mlp.wo",                   # jina-bert
+            "encoder.layer.{bid}.mlp.wo",                   # jina-bert-v2
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -318,7 +318,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.q_layernorm",                       # persimmon
             "model.layers.{bid}.self_attn.q_norm",                            # cohere
             "transformer.blocks.{bid}.attn.q_ln",                             # sea-lion
-            "encoder.layer.{bid}.attention.self.layer_norm_q"                 # jina-bert
+            "encoder.layer.{bid}.attention.self.layer_norm_q"                 # jina-bert-v2
         ),
 
         MODEL_TENSOR.ATTN_K_NORM: (
@@ -326,7 +326,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.k_layernorm",                       # persimmon
             "model.layers.{bid}.self_attn.k_norm",                            # cohere
             "transformer.blocks.{bid}.attn.k_ln",                             # sea-lion
-            "encoder.layer.{bid}.attention.self.layer_norm_k"                 # jina-bert
+            "encoder.layer.{bid}.attention.self.layer_norm_k"                 # jina-bert-v2
         ),
 
         MODEL_TENSOR.ROPE_FREQS: (
@@ -337,7 +337,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.output.LayerNorm",         # bert
             "encoder.layers.{bid}.norm2",                   # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_3",   # Grok
-            "encoder.layer.{bid}.mlp.layernorm",            # jina-bert
+            "encoder.layer.{bid}.mlp.layernorm",            # jina-bert-v2
         ),
 
         MODEL_TENSOR.SSM_IN: (
diff --git a/llama.cpp b/llama.cpp
index 7460e3531..330df9de5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -205,7 +205,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
-    LLM_ARCH_JINA_BERT,
+    LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
@@ -241,7 +241,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT,          "refact"       },
     { LLM_ARCH_BERT,            "bert"         },
     { LLM_ARCH_NOMIC_BERT,      "nomic-bert"   },
-    { LLM_ARCH_JINA_BERT,       "jina-bert"    },
+    { LLM_ARCH_JINA_BERT_V2,    "jina-bert-v2"},
     { LLM_ARCH_BLOOM,           "bloom"        },
     { LLM_ARCH_STABLELM,        "stablelm"     },
     { LLM_ARCH_QWEN,            "qwen"         },
@@ -690,7 +690,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
         },
     },
     {
-        LLM_ARCH_JINA_BERT,
+        LLM_ARCH_JINA_BERT_V2,
         {
             { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
             { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
@@ -3893,7 +3893,7 @@ static void llm_load_hparams(
                         model.type = e_model::MODEL_335M; break; // bge-large
                 }
             } break;
-        case LLM_ARCH_JINA_BERT:
+        case LLM_ARCH_JINA_BERT_V2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
@@ -4137,7 +4137,7 @@
 
     model.ftype = ml.ftype;
 
-    if (hparams.f_max_alibi_bias > 0.0f && model.arch != LLM_ARCH_JINA_BERT) {
+    if (hparams.f_max_alibi_bias > 0.0f && model.arch != LLM_ARCH_JINA_BERT_V2) {
         hparams.need_kq_pos = true;
     }
 
@@ -5113,7 +5113,7 @@ static bool llm_load_tensors(
                        layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
                     }
                 } break;
-            case LLM_ARCH_JINA_BERT:
+            case LLM_ARCH_JINA_BERT_V2:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
                     model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings
@@ -7994,7 +7994,7 @@ struct llm_build_context {
         struct ggml_tensor * inpL;
         struct ggml_tensor * inp_pos = nullptr;
 
-        if (model.arch != LLM_ARCH_JINA_BERT) {
+        if (model.arch != LLM_ARCH_JINA_BERT_V2) {
            inp_pos = build_inp_pos();
         }
         struct ggml_tensor * inp_mean = build_inp_mean();
@@ -8027,7 +8027,7 @@ struct llm_build_context {
             struct ggml_tensor * Vcur;
 
             // self-attention
-            if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT) {
+            if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
                 Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
                 cb(Qcur, "Qcur", il);
 
@@ -8137,7 +8137,7 @@ struct llm_build_context {
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-            } else if (model.arch == LLM_ARCH_JINA_BERT) {
+            } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
                 cur = llm_build_ffn(ctx0, cur,
                         model.layers[il].ffn_up,   NULL,
                         model.layers[il].ffn_gate, NULL,
@@ -10544,7 +10544,7 @@ static struct ggml_cgraph * llama_build_graph(
                 result = llm.build_refact();
             } break;
         case LLM_ARCH_BERT:
-        case LLM_ARCH_JINA_BERT:
+        case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_NOMIC_BERT:
             {
                 result = llm.build_bert();
@@ -15473,7 +15473,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
         case LLM_ARCH_MAMBA:
-        case LLM_ARCH_JINA_BERT:
+        case LLM_ARCH_JINA_BERT_V2:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values