From cf1c1447e32a21d9ce5f0447eff9d8459e011a7f Mon Sep 17 00:00:00 2001
From: Joan Martinez
Date: Mon, 22 Apr 2024 13:05:26 +0200
Subject: [PATCH] fix: fix usage of ALIBI

LLM_ARCH_JINA_BERT does not build a KQ_pos tensor, so stop requiring one
for this arch (need_kq_pos) and compute the ALiBi bias inside
ggml_compute_forward_soft_max_f32 directly from the query/key distance
instead of reading it from the positions tensor. Also default
f_max_alibi_bias to 8.0f for JinaBERT and remove leftover debug prints
from the converter script.
---
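Notes (this section is ignored by git am): a minimal standalone sketch of
the bias this change produces, mirroring the m0/m1 slope construction
already present in ggml_compute_forward_soft_max_f32. The head count,
sequence length and max_bias values below are illustrative assumptions,
not taken from any particular model.

    /* alibi_sketch.c: print the per-head ALiBi bias -slope * |q - k| */
    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
        const int   n_head   = 8;    // assumed head count
        const int   n_seq    = 4;    // assumed sequence length
        const float max_bias = 8.0f; // matches f_max_alibi_bias in the patch

        // same slope construction as ggml.c: the first n_head_log2 heads
        // use powers of m0, the remaining heads use odd powers of m1
        const int   n_head_log2 = 1 << (int) floor(log2(n_head));
        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

        for (int h = 0; h < n_head; h++) {
            const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
            printf("head %d (slope %.6f):\n", h, slope);
            for (int q = 0; q < n_seq; q++) {     // query position (row i1)
                for (int k = 0; k < n_seq; k++) { // key position (column i)
                    // the patched kernel adds -slope * |q - k| to the logits
                    printf(" %9.6f", -slope * abs(q - k));
                }
                printf("\n");
            }
        }
        return 0;
    }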
 convert-hf-to-gguf.py |  5 -----
 ggml.c                | 16 +++++++---------
 llama.cpp             |  6 +++---
 3 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 218b136f9..9c01c296e 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -2175,17 +2175,12 @@ class JinaBertModel(BertModel):
         self.intermediate_size = self.hparams["intermediate_size"]

     def get_tensors(self):
-        import string
-        print(f'Intermediate SIZE: {self.intermediate_size}')
-
         for name, data in super().get_tensors():
             if 'gated_layers' in name:
-                print(f'name {name} => {data.shape}')
                 d1 = data[:self.intermediate_size, :]
                 name1 = name.replace('gated_layers', 'gated_layers_w')
                 d2 = data[self.intermediate_size:, :]
                 name2 = name.replace('gated_layers', 'gated_layers_v')
-                print(f'd1 {d1.shape}, d2 {d2.shape}')
                 yield name1, d1
                 yield name2, d2
                 continue
diff --git a/ggml.c b/ggml.c
index 793b67f4c..6ae51fd13 100644
--- a/ggml.c
+++ b/ggml.c
@@ -5406,10 +5406,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
         GGML_ASSERT(pos->ne[0] == a->ne[0]);
     }

-    if (max_bias > 0.0f) {
-        GGML_ASSERT(pos);
-    }
-
     bool is_node = false;

     if (a->grad) {
@@ -12241,11 +12237,11 @@ static void ggml_compute_forward_soft_max_f32(
     float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;

-    // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
-    float * pos = src2 ? (float *) src2->data : src0->data;
+    // src2 (KQ_pos) is optional; without it the ALiBi bias is computed from the query/key distance below
+    float * pos = src2 ? (float *) src2->data : NULL;

     for (int i1 = ir0; i1 < ir1; i1++) {
         float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float * dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
+        float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);

         // broadcast the mask across rows
         float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
@@ -12262,7 +12258,7 @@ static void ggml_compute_forward_soft_max_f32(
             const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);

             for (int i = 0; i < nc; i++) {
-                wp[i] = wp[i] + slope*pos[i];
+                wp[i] = wp[i] - slope*abs(i1%nc - i);
             }
         }

@@ -12478,7 +12474,7 @@ static void ggml_compute_forward_alibi_f32(
             for (int64_t j = 0; j < ne1; j++) {
                 float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
                 float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
-                pdst[0] = i * m_k + src[0];
+                pdst[0] = -1.0f * i * m_k;
             }
         }
     }
@@ -16111,6 +16107,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         return;
     }

+    fprintf(stdout, "Computing forward (%s) for tensor %s\n", GGML_OP_NAME[tensor->op], tensor->name);
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
@@ -16447,6 +16444,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 GGML_ASSERT(false);
             } break;
     }
+    fprintf(stdout, "After FORWARD %s (%p): Shape:%li, %li, %li, %li tensor: %9.6f, %9.6f, %9.6f, %9.6f \n", tensor->name, tensor, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ((float *)(tensor->data))[0], ((float *)(tensor->data))[1], ((float *)(tensor->data))[2], ((float *)(tensor->data))[3]);
 }

 ////////////////////////////////////////////////////////////////////////////////
diff --git a/llama.cpp b/llama.cpp
index eafabd48d..e52b39d12 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3795,6 +3795,7 @@
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+                hparams.f_max_alibi_bias = 8.0f;

                 switch (hparams.n_layer) {
                     case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
@@ -4001,7 +4002,7 @@

     model.ftype = ml.ftype;

-    if (hparams.f_max_alibi_bias > 0.0f) {
+    if (hparams.f_max_alibi_bias > 0.0f && model.arch != LLM_ARCH_JINA_BERT) {
         hparams.need_kq_pos = true;
     }

@@ -4519,7 +4520,6 @@
     model.layers.resize(n_layer);

     const auto tn = LLM_TN(model.arch);
-    //std::printf("JOAN HERE ARCH %i", model.arch);
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
@@ -7525,7 +7525,7 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = nullptr;

         if (model.arch != LLM_ARCH_JINA_BERT) {
-          inp_pos = build_inp_pos();
+            inp_pos = build_inp_pos();
         }
         struct ggml_tensor * inp_mean = build_inp_mean();
         struct ggml_tensor * inp_cls = build_inp_cls();
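-- 
To exercise the patched path end to end, something like the following
should work (both paths are placeholders for a local JinaBERT checkpoint
and the converted model):

    python3 convert-hf-to-gguf.py /path/to/jina-bert --outfile jina.gguf
    ./embedding -m jina.gguf -p "test sentence"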