Add unary and binary op shader templates

0cc4m 2024-02-11 08:41:59 +01:00
parent 2a0cf851d4
commit 5e0a9a2d37
3 changed files with 885 additions and 648 deletions

File diff suppressed because it is too large


@@ -215,10 +215,18 @@ struct vk_op_push_constants {
     float param2;
 };
 
-struct vk_op_cpy_push_constants {
+struct vk_op_unary_push_constants {
     uint32_t ne;
-    uint32_t ne00; uint32_t ne01; uint32_t nb00; uint32_t nb01; uint32_t nb02;
-    uint32_t ne10; uint32_t ne11; uint32_t nb10; uint32_t nb11; uint32_t nb12;
+    uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
+    uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
+    uint32_t d_offset;
+};
+
+struct vk_op_binary_push_constants {
+    uint32_t ne;
+    uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
+    uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
+    uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
     uint32_t d_offset;
 };
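The new layouts follow ggml's convention: the ne* fields carry logical element counts and the nb* fields carry strides already divided down from bytes into elements, one set per tensor (src0 and dst for unary ops; src0, src1 and dst for binary ops). A rough, self-contained C++ sketch of that packing, using a toy tensor struct and made-up sizes rather than the real ggml_tensor:

    // Toy stand-ins; the real code uses ggml_tensor and vk_op_unary_push_constants.
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    struct toy_tensor {
        int64_t ne[4];      // element counts per dimension
        size_t  nb[4];      // byte strides per dimension
        size_t  type_size;  // bytes per element
    };

    int main() {
        // hypothetical non-contiguous f32 view: 4 x 3 x 2, rows padded to 8 elements
        toy_tensor src = {{4, 3, 2, 1}, {4, 8 * 4, 8 * 4 * 3, 8 * 4 * 3 * 2}, 4};

        const uint32_t ne   = uint32_t(src.ne[0] * src.ne[1] * src.ne[2] * src.ne[3]);
        const uint32_t nb00 = uint32_t(src.nb[0] / src.type_size);  // stride in elements
        const uint32_t nb01 = uint32_t(src.nb[1] / src.type_size);
        const uint32_t nb02 = uint32_t(src.nb[2] / src.type_size);
        const uint32_t nb03 = uint32_t(src.nb[3] / src.type_size);

        printf("ne=%u src element strides: %u %u %u %u\n", ne, nb00, nb01, nb02, nb03);
        return 0;
    }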
@@ -1027,9 +1035,9 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
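The copy pipelines keep the 512-invocation workgroup; only the push-constant size changes to the new unary layout. The dispatch side hands over the total element count, so the workgroup count is the usual rounded-up division, sketched here for illustration (not the actual ggml_vk_dispatch_pipeline internals):

    #include <cstdint>
    #include <cstdio>

    // ceil-divide the element count by the workgroup size to get the group count
    static uint32_t group_count(uint32_t ne, uint32_t wg_size) {
        return (ne + wg_size - 1) / wg_size;
    }

    int main() {
        printf("%u\n", group_count(1000, 512));  // 2 workgroups cover 1000 elements
        return 0;
    }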
@@ -2197,7 +2205,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
     GGML_ASSERT(false);
 }
 
-static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out, ggml_type buffer_type, bool aligned=true) {
+static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out, ggml_type buffer_type) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
     std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
@@ -2205,18 +2213,16 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
     const int tensor_type_size = ggml_type_size(tensor->type);
     const int dst_type_size = ggml_type_size(buffer_type);
 
-    const uint32_t ne = tensor->ne[0] * tensor->ne[1] * tensor->ne[2];
+    const uint32_t ne = ggml_nelements(tensor);
 
-    const uint32_t nb2 = aligned ? ggml_vk_align_size(dst_type_size * tensor->ne[0] * tensor->ne[1], ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size : tensor->ne[0] * tensor->ne[1];
-
-    const vk_op_cpy_push_constants pc = {
+    const vk_op_unary_push_constants pc = {
         (uint32_t)ne,
-        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size,
-        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], 1, (uint32_t)tensor->ne[0], nb2,
+        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size,
+        (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], 1, (uint32_t)tensor->ne[0], (uint32_t)(tensor->ne[0] * tensor->ne[1]), (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]),
         0,
     };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_cpy_push_constants), &pc, { ne, 1, 1 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 });
 }
 
 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
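With the destination of the contiguity copy described directly in the push constants, its strides are just cumulative products of the element counts, which is what makes the old aligned nb2 computation unnecessary. A minimal sketch of that stride rule with hypothetical sizes:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // hypothetical destination shape ne0 x ne1 x ne2
        const uint32_t ne0 = 4, ne1 = 3, ne2 = 2;
        // contiguous element strides: 1, ne0, ne0*ne1, ne0*ne1*ne2
        const uint32_t nb[4] = {1, ne0, ne0 * ne1, ne0 * ne1 * ne2};
        printf("contiguous strides: %u %u %u %u\n", nb[0], nb[1], nb[2], nb[3]);  // 1 4 12 24
        return 0;
    }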
@@ -2362,7 +2368,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     }
 
     if (x_non_contig) {
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }, dst->type, false);
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }, dst->type);
     } else if (load_x || qx_needs_dequant) {
         if (load_x) {
             // copy data to device
@@ -3240,10 +3246,10 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
     const int src0_type_size = ggml_type_size(src0->type);
     const int dst_type_size = ggml_type_size(dst->type);
     const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
-    ggml_vk_op_f32<vk_op_cpy_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_CPY, {
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_CPY, {
         (uint32_t)ggml_nelements(src0),
-        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size,
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size,
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         d_offset,
     });
 }
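d_offset compensates for storage-buffer offset alignment: only the aligned part of the tensor's offset can go into the descriptor, so the remainder is passed to the shader as an element offset into data_d. An illustrative calculation with assumed numbers (not ggml's actual buffer-binding code):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint64_t byte_offset   = 4160;  // assumed offset of dst within its buffer
        const uint64_t min_align     = 256;   // assumed minStorageBufferOffsetAlignment
        const uint64_t dst_type_size = 4;     // f32

        const uint64_t bind_offset = byte_offset - byte_offset % min_align;               // aligned descriptor offset
        const uint32_t d_offset    = uint32_t((byte_offset % min_align) / dst_type_size); // element remainder for the shader

        printf("bind at byte %llu, d_offset=%u elements\n",
               (unsigned long long) bind_offset, d_offset);  // bind at 4096, d_offset=16
        return 0;
    }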


@@ -1509,6 +1509,103 @@ layout (push_constant) uniform parameter
 } p;
 """
 
+generic_unary_op_head = """#version 450
+
+#extension GL_EXT_shader_16bit_storage : require
+
+layout (push_constant) uniform parameter
+{
+    uint ne;
+    uint ne00; uint ne01; uint ne02; uint nb00; uint nb01; uint nb02; uint nb03;
+    uint ne10; uint ne11; uint ne12; uint nb10; uint nb11; uint nb12; uint nb13;
+    uint d_offset;
+} p;
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+uint src0_idx(uint idx) {
+    const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
+    const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
+    const uint i02 = (idx - i03_offset) / (p.ne01*p.ne00);
+    const uint i02_offset = i02*p.ne01*p.ne00;
+    const uint i01 = (idx - i03_offset - i02_offset) / p.ne00;
+    const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
+    return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00;
+}
+
+uint dst_idx(uint idx) {
+    const uint i13 = idx / (p.ne12*p.ne11*p.ne10);
+    const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
+    const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10);
+    const uint i12_offset = i12*p.ne11*p.ne10;
+    const uint i11 = (idx - i13_offset - i12_offset) / p.ne10;
+    const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
+    return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
+}
+
+void main() {
+    if (gl_GlobalInvocationID.x >= p.ne) {
+        return;
+    }
+"""
+
+generic_binary_op_head = """#version 450
+
+#extension GL_EXT_shader_16bit_storage : require
+
+layout (push_constant) uniform parameter
+{
+    uint ne;
+    uint ne00; uint ne01; uint ne02; uint nb00; uint nb01; uint nb02; uint nb03;
+    uint ne10; uint ne11; uint ne12; uint nb10; uint nb11; uint nb12; uint nb13;
+    uint ne20; uint ne21; uint ne22; uint nb20; uint nb21; uint nb22; uint nb23;
+    uint d_offset;
+} p;
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+uint src0_idx(uint idx) {
+    const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
+    const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
+    const uint i02 = (idx - i03_offset) / (p.ne01*p.ne00);
+    const uint i02_offset = i02*p.ne01*p.ne00;
+    const uint i01 = (idx - i03_offset - i02_offset) / p.ne00;
+    const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
+    return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00;
+}
+
+uint src1_idx(uint idx) {
+    const uint i13 = idx / (p.ne12*p.ne11*p.ne10);
+    const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
+    const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10);
+    const uint i12_offset = i12*p.ne11*p.ne10;
+    const uint i11 = (idx - i13_offset - i12_offset) / p.ne10;
+    const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
+    return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
+}
+
+uint dst_idx(uint idx) {
+    const uint i23 = idx / (p.ne22*p.ne21*p.ne20);
+    const uint i23_offset = i23 * p.ne22*p.ne21*p.ne20;
+    const uint i22 = (idx - i23_offset) / (p.ne21*p.ne20);
+    const uint i22_offset = i22*p.ne21*p.ne20;
+    const uint i21 = (idx - i23_offset - i22_offset) / p.ne20;
+    const uint i20 = idx - i23_offset - i22_offset - i21*p.ne20;
+    return i23*p.nb23 + i22*p.nb22 + i21*p.nb21 + i20*p.nb20;
+}
+
+void main() {
+    if (gl_GlobalInvocationID.x >= p.ne) {
+        return;
+    }
+"""
+
 # MUL F32
 mul_body = """layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
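All of the *_idx helpers above share one pattern: decompose the flat invocation index using the logical sizes ne*, then re-linearize with the physical element strides nb*, which is what lets one shader body handle non-contiguous tensors. A host-side C++ sketch of the same arithmetic with illustrative sizes:

    #include <cstdint>
    #include <cstdio>

    // Mirror of the GLSL src0_idx/dst_idx logic: flat index -> (i3, i2, i1, i0)
    // via the logical sizes, then re-linearized with the element strides.
    uint32_t strided_index(uint32_t idx,
                           uint32_t ne0, uint32_t ne1, uint32_t ne2,
                           uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3) {
        const uint32_t i3 = idx / (ne2 * ne1 * ne0);
        idx -= i3 * ne2 * ne1 * ne0;
        const uint32_t i2 = idx / (ne1 * ne0);
        idx -= i2 * ne1 * ne0;
        const uint32_t i1 = idx / ne0;
        const uint32_t i0 = idx - i1 * ne0;
        return i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0;
    }

    int main() {
        // 4 x 3 x 2 tensor whose rows are padded to 8 elements (non-contiguous)
        for (uint32_t idx = 0; idx < 4 * 3 * 2; ++idx) {
            printf("%u -> %u\n", idx, strided_index(idx, 4, 3, 2, 1, 8, 8 * 3, 8 * 3 * 2));
        }
        return 0;
    }

On the GPU the same decomposition runs once per invocation, with gl_GlobalInvocationID.x as the flat index.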
@@ -1600,45 +1697,13 @@ void main() {
 """
 
 # CPY
-cpy_src = """#version 450
-
-#extension GL_EXT_shader_16bit_storage : require
-
-layout (push_constant) uniform parameter
-{
-    uint ne;
-    uint ne00; uint ne01; uint nb00; uint nb01; uint nb02;
-    uint ne10; uint ne11; uint nb10; uint nb11; uint nb12;
-    uint d_offset;
-} p;
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
-
-void main() {
-    if (gl_GlobalInvocationID.x >= p.ne) {
-        return;
-    }
-
-    const uint i02 = gl_GlobalInvocationID.x / (p.ne00*p.ne01);
-    const uint i01 = (gl_GlobalInvocationID.x - i02*p.ne01*p.ne00) / p.ne00;
-    const uint i00 = gl_GlobalInvocationID.x - i02*p.ne01*p.ne00 - i01*p.ne00;
-    const uint a_idx = i00*p.nb00 + i01*p.nb01 + i02*p.nb02;
-
-    const uint i12 = gl_GlobalInvocationID.x / (p.ne10*p.ne11);
-    const uint i11 = (gl_GlobalInvocationID.x - i12*p.ne11*p.ne10) / p.ne10;
-    const uint i10 = gl_GlobalInvocationID.x - i12*p.ne11*p.ne10 - i11*p.ne10;
-    const uint d_idx = i10*p.nb10 + i11*p.nb11 + i12*p.nb12;
-"""
-
 cpy_end = """
-    data_d[p.d_offset + d_idx] = D_TYPE(data_a[a_idx]);
+    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
 }
 """
 
 # Causes an optimization error otherwise
 cpy_f16_f16_end = """
-    data_d[p.d_offset + d_idx] = data_a[a_idx];
+    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = data_a[src0_idx(gl_GlobalInvocationID.x)];
 }
 """
@@ -2298,9 +2363,9 @@ async def main():
     tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
     tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
 
-    tasks.append(string_to_spv("cpy_f32_f32", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}))
-    tasks.append(string_to_spv("cpy_f32_f16", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
-    tasks.append(string_to_spv("cpy_f16_f16", f"{cpy_src}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
+    tasks.append(string_to_spv("cpy_f32_f32", f"{generic_unary_op_head}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}))
+    tasks.append(string_to_spv("cpy_f32_f16", f"{generic_unary_op_head}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
+    tasks.append(string_to_spv("cpy_f16_f16", f"{generic_unary_op_head}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
 
     tasks.append(string_to_spv("add_f32", f"{generic_head}\n{shader_f32}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))