diff --git a/ggml/src/ggml-kompute/ggml-kompute.cpp b/ggml/src/ggml-kompute/ggml-kompute.cpp index 537cf1c09..28ceecfc4 100644 --- a/ggml/src/ggml-kompute/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute/ggml-kompute.cpp @@ -1126,28 +1126,37 @@ static void ggml_vk_mul_mat_q6_k( const std::shared_ptr& inB, const std::shared_ptr& out, uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1, - int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02 + int32_t ne00, int32_t ne01, int32_t ne02, + int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, + int32_t ne0, int32_t ne1, + uint32_t nb01, uint32_t nb02, uint32_t nb03, + uint32_t nb11, uint32_t nb12, uint32_t nb13, + uint32_t r2, uint32_t r3 ) { const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv, kp::shader_data::op_mul_mat_q6_k_comp_spv_len); struct PushConstants { uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne10, ne0, ne1, ne01, gqa; + int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12; + uint32_t nb01, nb02, nb03, nb11, nb12, nb13; + uint32_t r2, r3; } pushConsts { inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne10, ne0, ne1, ne01, ne12/ne02 + ne00, ne10, ne0, ne1, ne01, ne02, ne12, + nb01, nb02, nb03, nb11, nb12, nb13, + r2, r3 }; std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) { - const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts}); + const uint32_t local_x = 2; + const uint32_t local_y = ggml_vk_current_device().subgroupSize; + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}, {local_x, local_y}, {pushConsts}); } else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}); + s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}); s_algo->setPushConstants({pushConsts}); s_algo->updateDescriptors(s_kompute_context->pool.get()); } @@ -1450,8 +1459,8 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons switch (op->src[0]->type) { case GGML_TYPE_F32: - case GGML_TYPE_Q6_K: return op->ne[3] == 1; + case GGML_TYPE_Q6_K: case GGML_TYPE_F16: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: @@ -1729,7 +1738,8 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml case GGML_TYPE_Q6_K: ggml_vk_mul_mat_q6_k( seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02 + ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, + nb01, nb02, nb03, nb11, nb12, nb13, r2, r3 ); break; default: { diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp index c9baebdf4..d331d1a70 100644 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +++ b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp @@ -21,7 +21,16 @@ layout (push_constant) uniform parameter { int ne0; int ne1; int ne01; - int gqa; + int ne02; + int ne12; + uint nb01; + uint nb02; + uint nb03; + uint nb11; + uint nb12; + uint nb13; + uint r2; + uint r3; } pcs; void main() { @@ -34,12 +43,15 @@ void main() { const uint r0 = gl_WorkGroupID.x; const uint r1 = gl_WorkGroupID.y; - const uint r2 = gl_WorkGroupID.z; + const uint im = gl_WorkGroupID.z; const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID); - const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0); - const uint x = row * nb + offset0; // Based from inA without base offset - const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB + + const uint i12 = im%pcs.ne12; + const uint i13 = im/pcs.ne12; + + const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK); + const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff; float sumf = 0; @@ -89,6 +101,6 @@ void main() { const float tot = subgroupAdd(sumf); if (subgroupElect()) { - out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot; + out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot; } }