feat: add jina v2 support

parent 0fd6c1f015
commit 67022f14f2

4 changed files with 68 additions and 3 deletions
@@ -1783,6 +1783,16 @@ class NomicBertModel(BertModel):
             yield name, data
 
 
+@Model.register("JinaBertModel")
+class JinaBertModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.JINA_BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        assert self.hparams["position_embedding_type"] == "alibi"
+
+
 @Model.register("GemmaForCausalLM")
 class GemmaModel(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA
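For context on the converter hunk above: `@Model.register("JinaBertModel")` hooks the new class into a name-to-class registry keyed by the `architectures` entry of the Hugging Face config, and the `assert` rejects checkpoints that do not use ALiBi positions. A rough, self-contained sketch of that registration pattern; the simplified base class and the `from_model_architecture` helper below are illustrative assumptions, not the converter's actual code:

```python
# Sketch only: a simplified name -> class registry in the spirit of the
# converter's @Model.register decorator.

class Model:
    _registry: dict = {}

    def __init__(self, hparams: dict):
        self.hparams = hparams

    @classmethod
    def register(cls, *names: str):
        def wrapper(model_cls):
            for name in names:
                cls._registry[name] = model_cls
            return model_cls
        return wrapper

    @classmethod
    def from_model_architecture(cls, arch: str):
        try:
            return cls._registry[arch]
        except KeyError:
            raise NotImplementedError(f"Architecture {arch!r} not supported") from None


@Model.register("JinaBertModel")
class JinaBertModel(Model):
    def __init__(self, hparams: dict):
        super().__init__(hparams)
        # jina-bert v2 checkpoints use ALiBi positions; anything else is rejected early
        assert self.hparams["position_embedding_type"] == "alibi"


# The "architectures" entry from config.json selects the converter class:
converter_cls = Model.from_model_architecture("JinaBertModel")
model = converter_cls({"position_embedding_type": "alibi"})
print(type(model).__name__)  # JinaBertModel
```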
@@ -107,6 +107,7 @@ class MODEL_ARCH(IntEnum):
     REFACT     = auto()
     BERT       = auto()
     NOMIC_BERT = auto()
+    JINA_BERT  = auto()
     BLOOM      = auto()
     STABLELM   = auto()
     QWEN       = auto()
@@ -173,6 +174,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.REFACT:     "refact",
     MODEL_ARCH.BERT:       "bert",
     MODEL_ARCH.NOMIC_BERT: "nomic-bert",
+    MODEL_ARCH.JINA_BERT:  "jina-bert",
     MODEL_ARCH.BLOOM:      "bloom",
     MODEL_ARCH.STABLELM:   "stablelm",
     MODEL_ARCH.QWEN:       "qwen",
@@ -326,6 +328,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.LAYER_OUT_NORM,
     ],
+    MODEL_ARCH.JINA_BERT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.TOKEN_TYPES,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.LAYER_OUT_NORM,
+    ],
     MODEL_ARCH.MPT: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
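The tables touched above (the `MODEL_ARCH` enum, `MODEL_ARCH_NAMES`, and `MODEL_TENSORS`) are what tie the `"jina-bert"` architecture string to the set of tensors the format expects for it. A trimmed, self-contained sketch of that lookup, copying only entries relevant to this commit rather than importing `gguf`:

```python
# Trimmed, stand-alone copies of the entries added above (not an import of
# gguf-py), plus the reverse lookup from the architecture string stored in a
# GGUF file back to the tensor whitelist.
from enum import IntEnum, auto

class MODEL_ARCH(IntEnum):
    NOMIC_BERT = auto()
    JINA_BERT  = auto()

class MODEL_TENSOR(IntEnum):
    TOKEN_EMBD      = auto()
    TOKEN_EMBD_NORM = auto()
    TOKEN_TYPES     = auto()
    OUTPUT_NORM     = auto()
    ATTN_OUT_NORM   = auto()
    ATTN_Q          = auto()
    ATTN_K          = auto()
    ATTN_V          = auto()
    ATTN_OUT        = auto()
    FFN_DOWN        = auto()
    FFN_UP          = auto()
    LAYER_OUT_NORM  = auto()

MODEL_ARCH_NAMES = {
    MODEL_ARCH.NOMIC_BERT: "nomic-bert",
    MODEL_ARCH.JINA_BERT:  "jina-bert",
}

MODEL_TENSORS = {
    # jina-bert uses every tensor kind listed in this sketch
    MODEL_ARCH.JINA_BERT: list(MODEL_TENSOR),
}

def tensors_for(arch_name: str):
    """Map an architecture string (e.g. "jina-bert") to its allowed tensor kinds."""
    by_name = {name: arch for arch, name in MODEL_ARCH_NAMES.items()}
    return MODEL_TENSORS[by_name[arch_name]]

print([t.name for t in tensors_for("jina-bert")])
```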
@@ -207,6 +207,9 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.up_proj",                        # llama-hf refact
             "layers.{bid}.feed_forward.w3",                          # llama-pth
             "encoder.layer.{bid}.intermediate.dense",                # bert
+            "encoder.layer.{bid}.mlp.gated_layers",                  # jina-bert
+            "encoder.layer.{bid}.mlp.layernorm",                     # jina-bert
+            "encoder.layer.{bid}.mlp.wo",                            # jina-bert
             "transformer.h.{bid}.mlp.fc_in",                         # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
             "model.layers.{bid}.mlp.dense_h_to_4h",                  # persimmon
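The `{bid}` placeholder in the mappings above stands for the block index; at conversion time each Hugging Face tensor name is matched against these templates to find its GGUF name. A minimal sketch of that resolution with a hand-written two-entry table; the table, the suffix handling, and the return type are simplified illustrations, not the real `TensorNameMap`:

```python
# Sketch of {bid}-template name resolution, in the spirit of TensorNameMap.
import re

TEMPLATES = {
    "encoder.layer.{bid}.intermediate.dense": "blk.{bid}.ffn_up",  # bert
    "encoder.layer.{bid}.mlp.gated_layers":   "blk.{bid}.ffn_up",  # jina-bert
}

def to_gguf_name(hf_name: str):
    base, _, suffix = hf_name.rpartition(".")  # split off .weight / .bias
    for src, dst in TEMPLATES.items():
        pattern = "^" + re.escape(src).replace(re.escape("{bid}"), r"(\d+)") + "$"
        m = re.match(pattern, base)
        if m:
            return dst.format(bid=m.group(1)) + "." + suffix
    return None  # unmapped tensor

print(to_gguf_name("encoder.layer.0.mlp.gated_layers.weight"))  # blk.0.ffn_up.weight
```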
llama.cpp (42 changes)
@@ -201,6 +201,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_JINA_BERT,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
@@ -230,6 +231,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT,     "refact"     },
     { LLM_ARCH_BERT,       "bert"       },
     { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+    { LLM_ARCH_JINA_BERT,  "jina-bert"  },
     { LLM_ARCH_BLOOM,      "bloom"      },
     { LLM_ARCH_STABLELM,   "stablelm"   },
     { LLM_ARCH_QWEN,       "qwen"       },
@@ -607,6 +609,22 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_JINA_BERT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_BLOOM,
         {
@@ -3455,6 +3473,18 @@ static void llm_load_hparams(
                     model.type = e_model::MODEL_137M;
                 }
             } break;
+        case LLM_ARCH_JINA_BERT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+                switch (hparams.n_layer) {
+                    case 4:  model.type = e_model::MODEL_33M;  break; // jina-embeddings-small
+                    case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
+                }
+            } break;
         case LLM_ARCH_BLOOM:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
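The new `LLM_ARCH_JINA_BERT` case above reads four metadata keys and then infers the model size purely from the layer count. The same dispatch expressed as a plain Python sketch; the key names in `hparams` are illustrative stand-ins for the GGUF metadata the C++ loader resolves via `ml.get_key()`:

```python
# Plain-Python sketch of the LLM_ARCH_JINA_BERT hyper-parameter case above.

def classify_jina_bert(hparams: dict) -> str:
    # the four required keys mirror the ml.get_key() calls in the diff
    for key in ("f_norm_eps", "causal_attn", "n_vocab_type", "pooling_type"):
        if key not in hparams:
            raise KeyError(f"missing hparam: {key}")

    # model size is inferred from the layer count alone
    return {
        4:  "MODEL_33M (jina-embeddings-small)",
        12: "MODEL_137M (jina-embeddings-base)",
    }.get(hparams["n_layer"], "unknown")

print(classify_jina_bert({
    "f_norm_eps": 1e-12,
    "causal_attn": False,
    "n_vocab_type": 2,
    "pooling_type": "mean",
    "n_layer": 12,
}))  # -> MODEL_137M (jina-embeddings-base)
```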
@@ -4329,6 +4359,7 @@ static bool llm_load_tensors(
                 }
             } break;
         case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT:
         case LLM_ARCH_NOMIC_BERT:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -4346,7 +4377,7 @@ static bool llm_load_tensors(
 
                 auto & layer = model.layers[i];
 
-                if (model.arch == LLM_ARCH_BERT) {
+                if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT) {
                     layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                     layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias",   i), {n_embd});
 
@@ -4367,7 +4398,7 @@ static bool llm_load_tensors(
                 layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
                 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
 
-                if (model.arch == LLM_ARCH_BERT) {
+                if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT) {
                     layer.bo       = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
                     layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff});
 
@@ -6512,7 +6543,7 @@ struct llm_build_context {
         struct ggml_tensor * Vcur;
 
         // self-attention
-        if (model.arch == LLM_ARCH_BERT) {
+        if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT) {
             Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
             cb(Qcur, "Qcur", il);
 
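The widened condition above routes jina-bert through the BERT branch of the self-attention block, i.e. separate Q/K/V projections each followed by a learned bias (the other branch, not shown in this hunk, handles architectures without these biases). In NumPy terms, and ignoring ggml's tensor layout and graph building, that branch computes roughly the following; shapes and values are illustrative:

```python
# NumPy sketch of the biased Q/K/V projections used by the BERT / jina-bert
# branch above: q = ggml_add(ggml_mul_mat(wq, cur), bq), and likewise for k, v.
import numpy as np

def qkv_with_bias(x, wq, bq, wk, bk, wv, bv):
    # x: (n_tokens, n_embd); w*: (n_embd, n_embd); b*: (n_embd,)
    q = x @ wq + bq
    k = x @ wk + bk
    v = x @ wv + bv
    return q, k, v

rng = np.random.default_rng(0)
n_tokens, n_embd = 4, 8
x = rng.standard_normal((n_tokens, n_embd))

def rand_w():
    return rng.standard_normal((n_embd, n_embd))

def rand_b():
    return rng.standard_normal(n_embd)

q, k, v = qkv_with_bias(x, rand_w(), rand_b(), rand_w(), rand_b(), rand_w(), rand_b())
print(q.shape, k.shape, v.shape)  # (4, 8) (4, 8) (4, 8)
```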
@@ -8411,6 +8442,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_bert();
             } break;
+        case LLM_ARCH_JINA_BERT:
+            {
+                result = llm.build_bert();
+            } break;
         case LLM_ARCH_BLOOM:
             {
                 result = llm.build_bloom();
@@ -13094,6 +13129,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
         case LLM_ARCH_MAMBA:
+        case LLM_ARCH_JINA_BERT:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
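jina-bert joins the `LLAMA_ROPE_TYPE_NONE` group because, as the converter's assertion on `position_embedding_type == "alibi"` indicates, it positions tokens with ALiBi biases on the attention scores rather than by rotating Q/K. A small background sketch of the standard ALiBi bias (Press et al.), using the symmetric distance commonly used for encoder-style models; this is background math, not llama.cpp's implementation:

```python
# Background sketch of ALiBi: a per-head linear penalty on the attention
# scores replaces rotary position embeddings. n_heads is assumed to be a
# power of two, as in the original slope recipe.
import numpy as np

def alibi_bias(n_heads: int, n_tokens: int) -> np.ndarray:
    # head-specific slopes: 2^(-8/n), 2^(-16/n), ...
    slopes = 2.0 ** (-8.0 * np.arange(1, n_heads + 1) / n_heads)
    pos = np.arange(n_tokens)
    dist = -np.abs(pos[:, None] - pos[None, :])   # 0 on the diagonal, more negative with distance
    return slopes[:, None, None] * dist[None]     # (n_heads, n_tokens, n_tokens)

bias = alibi_bias(n_heads=8, n_tokens=4)
print(bias.shape)  # (8, 4, 4); added to the attention scores before softmax
```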