From 996f7f4ec54308a5cff72f8a71e471c1bc27aa2c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 15 Feb 2024 13:46:26 +0200 Subject: [PATCH] ggml : support multi-sequence ALiBi (Metal) ggml-ci --- ggml-metal.m | 19 ++++++---- ggml-metal.metal | 14 ++++--- ggml.c | 30 ++++++++++++--- ggml.h | 4 +- llama.cpp | 77 +++++++++++++++++++++++++++----------- tests/test-backend-ops.cpp | 33 ++-------------- 6 files changed, 106 insertions(+), 71 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index a0d9aaaba..1c67334d1 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -809,7 +809,7 @@ static bool ggml_metal_graph_compute( id id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil; id id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil; - //id id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil; + id id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil; id id_dst = dst ? ggml_metal_get_buffer(dst, &offs_dst) : nil; //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); @@ -1201,12 +1201,17 @@ static bool ggml_metal_graph_compute( } else { [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; } - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; - [encoder setBytes:&scale length:sizeof(scale) atIndex:6]; - [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:7]; + if (id_src2) { + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; + } else { + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:2]; + } + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:4]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:5]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:6]; + [encoder setBytes:&scale length:sizeof(scale) atIndex:7]; + [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:8]; [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; diff --git a/ggml-metal.metal b/ggml-metal.metal index 4b6514014..c5da88e14 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -351,6 +351,7 @@ kernel void kernel_sum_rows( kernel void kernel_soft_max( device const float * src0, device const float * src1, + device const float * src2, device float * dst, constant int64_t & ne00, constant int64_t & ne01, @@ -369,6 +370,7 @@ kernel void kernel_soft_max( device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; device const float * pmask = src1 != src0 ? src1 + i01*ne00 : nullptr; + device const float * ppos = src2 != src0 ? src2 : nullptr; device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; float slope = 0.0f; @@ -390,7 +392,7 @@ kernel void kernel_soft_max( float lmax = -INFINITY; for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - lmax = MAX(lmax, psrc0[i00]*scale + slope*i00 + (pmask ? pmask[i00] : 0.0f)); + lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]); } // find the max value in the block @@ -415,7 +417,7 @@ kernel void kernel_soft_max( // parallel sum float lsum = 0.0f; for (int i00 = tpitg; i00 < ne00; i00 += ntg) { - const float exp_psrc0 = exp((psrc0[i00]*scale + slope*i00 + (pmask ? pmask[i00] : 0.0f)) - max_val); + const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val); lsum += exp_psrc0; pdst[i00] = exp_psrc0; } @@ -453,6 +455,7 @@ kernel void kernel_soft_max( kernel void kernel_soft_max_4( device const float * src0, device const float * src1, + device const float * src2, device float * dst, constant int64_t & ne00, constant int64_t & ne01, @@ -471,10 +474,9 @@ kernel void kernel_soft_max_4( device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); device const float4 * pmask = src1 != src0 ? (device const float4 *)(src1 + i01*ne00) : nullptr; + device const float4 * ppos = src2 != src0 ? (device const float4 *)(src2) : nullptr; device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); - const float4 s0(0.0f, 1.0f, 2.0f, 3.0f); - float slope = 0.0f; if (max_bias > 0.0f) { @@ -493,7 +495,7 @@ kernel void kernel_soft_max_4( float4 lmax4 = -INFINITY; for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { - lmax4 = fmax(lmax4, psrc4[i00]*scale + slope*(4*i00 + s0) + (pmask ? pmask[i00] : 0.0f)); + lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]); } const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); @@ -519,7 +521,7 @@ kernel void kernel_soft_max_4( // parallel sum float4 lsum4 = 0.0f; for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { - const float4 exp_psrc4 = exp((psrc4[i00]*scale + slope*(4*i00 + s0) + (pmask ? pmask[i00] : 0.0f)) - max_val); + const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val); lsum4 += exp_psrc4; pdst4[i00] = exp_psrc4; } diff --git a/ggml.c b/ggml.c index 6cd5e47e1..afaeb74b9 100644 --- a/ggml.c +++ b/ggml.c @@ -5060,16 +5060,28 @@ static struct ggml_tensor * ggml_soft_max_impl( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * mask, + struct ggml_tensor * pos, float scale, float max_bias, bool inplace) { GGML_ASSERT(ggml_is_contiguous(a)); + if (mask) { GGML_ASSERT(ggml_is_contiguous(mask)); GGML_ASSERT(ggml_is_matrix(mask)); GGML_ASSERT(ggml_can_repeat_rows(mask, a)); } + if (pos) { + GGML_ASSERT(ggml_is_vector(pos)); + GGML_ASSERT(pos->type == GGML_TYPE_F32); + GGML_ASSERT(pos->ne[0] == a->ne[0]); + } + + if (max_bias > 0.0f) { + GGML_ASSERT(pos); + } + bool is_node = false; if (a->grad) { @@ -5085,6 +5097,7 @@ static struct ggml_tensor * ggml_soft_max_impl( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = mask; + result->src[2] = pos; return result; } @@ -5092,22 +5105,23 @@ static struct ggml_tensor * ggml_soft_max_impl( struct ggml_tensor * ggml_soft_max( struct ggml_context * ctx, struct ggml_tensor * a) { - return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false); + return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false); } struct ggml_tensor * ggml_soft_max_inplace( struct ggml_context * ctx, struct ggml_tensor * a) { - return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true); + return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true); } struct ggml_tensor * ggml_soft_max_ext( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * mask, + struct ggml_tensor * pos, float scale, float max_bias) { - return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false); + return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false); } // ggml_soft_max_back @@ -11460,6 +11474,7 @@ static void ggml_compute_forward_soft_max_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, + const struct ggml_tensor * src2, struct ggml_tensor * dst) { assert(ggml_is_contiguous(dst)); assert(ggml_are_same_shape(src0, dst)); @@ -11503,6 +11518,8 @@ static void ggml_compute_forward_soft_max_f32( float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith; + float * pos = (float *) src2->data; + for (int i1 = ir0; i1 < ir1; i1++) { float * sp = (float *)((char *) src0->data + i1*src0->nb[1]); float * dp = (float *)((char *) dst->data + i1*dst->nb[1]); @@ -11522,7 +11539,7 @@ static void ggml_compute_forward_soft_max_f32( const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1); for (int i = 0; i < nc; i++) { - wp[i] = wp[i] + slope*i; + wp[i] = wp[i] + slope*pos[i]; } } @@ -11570,11 +11587,12 @@ static void ggml_compute_forward_soft_max( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, + const struct ggml_tensor * src2, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_soft_max_f32(params, src0, src1, dst); + ggml_compute_forward_soft_max_f32(params, src0, src1, src2, dst); } break; default: { @@ -15101,7 +15119,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_SOFT_MAX: { - ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_SOFT_MAX_BACK: { diff --git a/ggml.h b/ggml.h index b112b9917..9bd8b98db 100644 --- a/ggml.h +++ b/ggml.h @@ -1373,13 +1373,15 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); - // fused soft_max(a*scale + mask + ALiBi bias) + // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope)) // mask is optional + // pos is required when max_bias > 0.0f // max_bias = 0.0f for no ALiBi GGML_API struct ggml_tensor * ggml_soft_max_ext( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * mask, + struct ggml_tensor * pos, float scale, float max_bias); diff --git a/llama.cpp b/llama.cpp index bc9ada35f..1c58aba48 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1923,6 +1923,7 @@ struct llama_context { struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] struct ggml_tensor * inp_pos; // I32 [n_batch] struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch] + struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx] struct ggml_tensor * inp_K_shift; // I32 [n_ctx] struct ggml_tensor * inp_sum; // F32 [n_batch, n_batch] @@ -4782,6 +4783,7 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * wo_b, struct ggml_tensor * q_cur, struct ggml_tensor * kq_mask, + struct ggml_tensor * kq_pos, int64_t n_ctx, int32_t n_tokens, int32_t n_kv, @@ -4833,7 +4835,7 @@ static struct ggml_tensor * llm_build_kqv( } else #endif { - kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); + kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias); cb(kq, "kq_soft_max_ext", il); } @@ -4881,6 +4883,7 @@ static struct ggml_tensor * llm_build_kv( struct ggml_tensor * v_cur, struct ggml_tensor * q_cur, struct ggml_tensor * kq_mask, + struct ggml_tensor * kq_pos, int64_t n_ctx, int32_t n_tokens, int32_t kv_head, @@ -4899,7 +4902,7 @@ static struct ggml_tensor * llm_build_kv( struct ggml_tensor * cur; cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b, - q_cur, kq_mask, n_ctx, n_tokens, n_kv, kq_scale, cb, il); + q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il); cb(cur, "kqv_out", il); return cur; @@ -5082,7 +5085,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5212,6 +5215,10 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); + // positions of the tokens in the KV cache + struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); + cb(KQ_pos, "KQ_pos", -1); + // shift the entire K-cache if needed if (do_rope_shift) { llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); @@ -5262,7 +5269,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5386,7 +5393,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5485,7 +5492,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5690,7 +5697,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5752,6 +5759,10 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); + // positions of the tokens in the KV cache + struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); + cb(KQ_pos, "KQ_pos", -1); + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5779,7 +5790,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5879,7 +5890,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } else { // compute Q and K and RoPE them @@ -5910,7 +5921,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5982,6 +5993,10 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); + // positions of the tokens in the KV cache + struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); + cb(KQ_pos, "KQ_pos", -1); + inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, @@ -6015,7 +6030,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6075,6 +6090,10 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); cb(KQ_mask, "KQ_mask", -1); + // positions of the tokens in the KV cache + struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0); + cb(KQ_pos, "KQ_pos", -1); + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -6108,7 +6127,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6230,7 +6249,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6345,7 +6364,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6466,7 +6485,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6593,7 +6612,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); cb(cur, "kqv_out", il); } @@ -6696,7 +6715,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } struct ggml_tensor * sa_out = cur; @@ -6795,7 +6814,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6904,7 +6923,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -7022,7 +7041,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -7141,7 +7160,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -7273,7 +7292,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -7504,6 +7523,18 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } + { + const int64_t n_kv = kv_self.n; + + assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer)); + + float * data = (float *) lctx.inp_KQ_pos->data; + + for (int i = 0; i < n_kv; ++i) { + data[i] = float(lctx.kv_self.cells[i].pos); + } + } + { assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer)); float * data = (float *) lctx.inp_sum->data; @@ -11419,6 +11450,7 @@ struct llama_context * llama_new_context_with_model( ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch); ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch); + ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx); ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx); ctx->inp_sum = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch); @@ -11426,6 +11458,7 @@ struct llama_context * llama_new_context_with_model( ggml_set_name(ctx->inp_embd, "inp_embd"); ggml_set_name(ctx->inp_pos, "inp_pos"); ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask"); + ggml_set_name(ctx->inp_KQ_pos, "inp_KQ_pos"); ggml_set_name(ctx->inp_K_shift, "inp_K_shift"); ggml_set_name(ctx->inp_sum, "inp_sum"); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index e2e676ba7..88638c164 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1104,7 +1104,8 @@ struct test_soft_max : public test_case { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); ggml_tensor * b = nullptr; if (mask) { b = ggml_new_tensor_2d(ctx, type, ne[0], ne[1]); } - ggml_tensor * out = ggml_soft_max_ext(ctx, a, b, scale, max_bias); + ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne[0]); + ggml_tensor * out = ggml_soft_max_ext(ctx, a, b, c, scale, max_bias); return out; } }; @@ -1149,30 +1150,6 @@ struct test_rope : public test_case { } }; -// GGML_OP_ALIBI -struct test_alibi : public test_case { - const ggml_type type; - const std::array ne; - int n_past; - int n_head; - float bias_max; - - std::string vars() override { - return VARS_TO_STR5(type, ne, n_past, n_head, bias_max); - } - - test_alibi(ggml_type type = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 10}, - int n_past = 512, int n_head = 10, float bias_max = 0.5f) - : type(type), ne(ne), n_past(n_past), n_head(n_head), bias_max(bias_max) {} - - ggml_tensor * build_graph(ggml_context * ctx) override { - ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); - ggml_tensor * out = ggml_alibi(ctx, a, n_past, n_head, bias_max); - return out; - } -}; - // GGML_OP_POOL2D struct test_pool2d : public test_case { enum ggml_op_pool pool_type; @@ -1490,7 +1467,7 @@ struct test_moe : public test_case { ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur); - ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(n_embd), 0.0f); + ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, nullptr, 1.0f/sqrtf(n_embd), 0.0f); // select experts ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok); @@ -1619,7 +1596,6 @@ public: ggml_cpy(ctx, v_cur_t, v_cache_view); } - // if max_alibi_bias > 0 then apply ALiBi struct ggml_tensor * llm_build_kqv( struct ggml_context * ctx, struct ggml_tensor * k_l, @@ -1638,7 +1614,7 @@ public: struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); - kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, 0.0f); + kq = ggml_soft_max_ext(ctx, kq, kq_mask, nullptr, kq_scale, 0.0f); // split cached v into n_head heads struct ggml_tensor * v = @@ -2117,7 +2093,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512)); // neox (phi-2) } - test_cases.emplace_back(new test_alibi()); test_cases.emplace_back(new test_concat(GGML_TYPE_F32)); test_cases.emplace_back(new test_concat(GGML_TYPE_I32));