Merge branch 'ggerganov:master' into server-chat-templates

This commit is contained in:
MaggotHATE 2024-11-13 12:15:29 +05:00 committed by GitHub
commit a1ee42d521
13 changed files with 144 additions and 27 deletions


@@ -196,6 +196,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_pad_f32;
     vk_pipeline pipeline_repeat_f32;
     vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
+    vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16;
     vk_pipeline pipeline_norm_f32;
     vk_pipeline pipeline_group_norm_f32;
     vk_pipeline pipeline_rms_norm_f32;
@@ -722,6 +723,12 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
         std::lock_guard<std::mutex> guard(compile_count_mutex);
         assert(compile_count > 0);
         compile_count--;
+
+        // "Progress bar" for shader compiles
+        static uint32_t total_compile_count = 0;
+        if ((total_compile_count++ % 10) == 0) {
+            std::cerr << ".";
+        }
     }
     compile_count_cond.notify_all();
 }
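
Note: the dot-throttling above is simple modulo counting; a standalone C++ replay (my sketch, not part of the patch) shows the behavior. The function-local static is only safe because the real code increments it while holding compile_count_mutex.

    #include <cstdint>
    #include <iostream>

    // Prints one dot per ten calls, mirroring the hunk above.
    static void report_compile_progress() {
        static uint32_t total_compile_count = 0;
        if ((total_compile_count++ % 10) == 0) {
            std::cerr << ".";
        }
    }

    int main() {
        for (int i = 0; i < 100; ++i) report_compile_progress(); // prints 10 dots
        std::cerr << "Done!" << std::endl;
    }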
@@ -1200,6 +1207,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
 static void ggml_vk_load_shaders(vk_device& device) {
     VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
 
+    std::cerr << "ggml_vulkan: Compiling shaders";
+
     // mulmat
     std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
     std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size };
@@ -1759,6 +1768,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
@@ -1817,6 +1830,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     for (auto &c : compiles) {
         c.wait();
     }
+    std::cerr << "Done!" << std::endl;
 }
 
 static vk_device ggml_vk_get_device(size_t idx) {
@@ -3061,18 +3075,34 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) {
-    if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
-        return ctx->device->pipeline_cpy_f32_f32;
+static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) {
+    // Choose "contiguous copy" shader if src/dst are contiguous
+    bool contig = ggml_is_contiguous(src) && (!dst || ggml_is_contiguous(dst));
+
+    if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f32_f32;
+        } else {
+            return ctx->device->pipeline_cpy_f32_f32;
+        }
     }
-    if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
-        return ctx->device->pipeline_cpy_f32_f16;
+    if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f32_f16;
+        } else {
+            return ctx->device->pipeline_cpy_f32_f16;
+        }
     }
-    if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
-        return ctx->device->pipeline_cpy_f16_f16;
+    if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
+        if (contig) {
+            return ctx->device->pipeline_contig_cpy_f16_f16;
+        } else {
+            return ctx->device->pipeline_cpy_f16_f16;
+        }
     }
 
-    std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
+    std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
     GGML_ABORT("fatal error");
 }
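
Note: the contig fast path hinges on ggml_is_contiguous(). As a rough sketch (my own helper, not the ggml source, which also handles quantized block sizes), a tensor qualifies when its strides describe one dense row-major block, which is what lets contig_copy.comp index data_a/data_d linearly:

    #include <cstddef>
    #include <cstdint>

    // Hypothetical stand-in for ggml_is_contiguous: strides must match a dense layout.
    bool is_dense(const int64_t ne[4], const size_t nb[4], size_t type_size) {
        size_t expected = type_size;  // nb[0] == element size
        for (int i = 0; i < 4; ++i) {
            if (nb[i] != expected) {
                return false;
            }
            expected *= ne[i];        // next stride = product of the lower dims
        }
        return true;
    }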
@@ -3082,6 +3112,15 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
     const int tensor_type_size = ggml_type_size(tensor->type);
 
     const uint32_t ne = ggml_nelements(tensor);
 
+    std::array<uint32_t, 3> elements;
+    if (ne > 262144) {
+        elements = { 512, 512, CEIL_DIV(ne, 262144) };
+    } else if (ne > 512) {
+        elements = { 512, CEIL_DIV(ne, 512), 1 };
+    } else {
+        elements = { ne, 1, 1 };
+    }
+
     const vk_op_unary_push_constants pc = {
         (uint32_t)ne,
@@ -3091,7 +3130,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
         0.0f, 0.0f,
     };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements);
 }
 
 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
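
Note: the elements split above caps x and y at 512, so large tensors spill into y and then z; a standalone replay (my arithmetic, not part of the patch) checks that the grid always covers ne elements, with the shader's idx >= p.ne guard masking the overshoot:

    #include <array>
    #include <cassert>
    #include <cstdint>

    uint32_t ceil_div(uint32_t a, uint32_t b) { return (a + b - 1) / b; } // like ggml's CEIL_DIV

    std::array<uint32_t, 3> split(uint32_t ne) {
        if (ne > 262144) return { 512, 512, ceil_div(ne, 262144) }; // 262144 == 512*512
        if (ne > 512)    return { 512, ceil_div(ne, 512), 1 };
        return { ne, 1, 1 };
    }

    int main() {
        for (uint32_t ne : { 1u, 512u, 513u, 262144u, 262145u, 1000000u }) {
            const std::array<uint32_t, 3> e = split(ne);
            assert(uint64_t(e[0]) * e[1] * e[2] >= ne); // e.g. ne = 1000000 -> {512, 512, 4}
        }
    }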
@@ -3176,12 +3215,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     vk_pipeline to_fp16_vk_1 = nullptr;
 
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -3361,10 +3400,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     vk_pipeline to_fp16_vk_0 = nullptr;
     vk_pipeline to_fp16_vk_1 = nullptr;
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -3745,12 +3784,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
     vk_pipeline to_fp16_vk_1 = nullptr;
 
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -3938,10 +3977,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     vk_pipeline to_fp16_vk_0 = nullptr;
     vk_pipeline to_fp16_vk_1 = nullptr;
     if (x_non_contig) {
-        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
+        to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type);
     }
     if (y_non_contig) {
-        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type);
+        to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type);
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
@@ -4148,7 +4187,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
     case GGML_OP_CPY:
     case GGML_OP_CONT:
     case GGML_OP_DUP:
-        return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type);
+        return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type);
     case GGML_OP_NORM:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_norm_f32;
@@ -4281,7 +4320,6 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
     case GGML_OP_DIV:
     case GGML_OP_CONCAT:
     case GGML_OP_UPSCALE:
-    case GGML_OP_SCALE:
     case GGML_OP_SQR:
     case GGML_OP_SIN:
     case GGML_OP_COS:


@@ -3,6 +3,8 @@
 #include "types.comp"
 #include "generic_unary_head.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
     const uint idx = get_idx();


@@ -0,0 +1,42 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+
+#extension GL_EXT_control_flow_attributes : require
+
+const uint num_threads = 128;
+
+layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
+
+void main() {
+    uint idx = get_idx();
+
+    // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
+    const uint num_iter = 4;
+
+    // fast path for when all four iterations are in-bounds
+    if (idx + (num_iter-1)*num_threads < p.ne) {
+        [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+#ifndef OPTIMIZATION_ERROR_WORKAROUND
+            data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
+#else
+            data_d[p.d_offset + idx] = data_a[idx];
+#endif
+            idx += num_threads;
+        }
+    } else {
+        [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+            if (idx >= p.ne) {
+                continue;
+            }
+#ifndef OPTIMIZATION_ERROR_WORKAROUND
+            data_d[p.d_offset + idx] = D_TYPE(data_a[idx]);
+#else
+            data_d[p.d_offset + idx] = data_a[idx];
+#endif
+            idx += num_threads;
+        }
+    }
+}
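
Note: the fast path is sound because each thread visits idx, idx + 128, idx + 256, idx + 384; if the last of those is in bounds, all four are, so the per-iteration bounds check can be skipped. A throwaway C++ verification (my sketch, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
        const uint32_t num_threads = 128, num_iter = 4; // 128 * 4 == 512, matching wg_denoms
        for (uint32_t ne = 1; ne <= 4096; ++ne) {
            for (uint32_t idx = 0; idx < 512; ++idx) {
                if (idx + (num_iter - 1) * num_threads < ne) { // the shader's fast-path test
                    for (uint32_t i = 0; i < num_iter; ++i) {
                        assert(idx + i * num_threads < ne);    // every iteration in bounds
                    }
                }
            }
        }
    }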


@@ -3,6 +3,8 @@
 #include "types.comp"
 #include "generic_unary_head.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
     const uint idx = get_idx();


@@ -3,6 +3,8 @@
 #include "types.comp"
 #include "generic_unary_head.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
     const uint idx = get_idx();


@@ -1,4 +1,5 @@
 #extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_control_flow_attributes : require
 
 layout (push_constant) uniform parameter
 {
@@ -9,8 +10,6 @@ layout (push_constant) uniform parameter
     float param1; float param2;
 } p;
 
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
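
Note: moving the local_size declaration out of this shared header is what lets each shader pick its own workgroup width: the generic unary shaders re-declare the old 512, while contig_copy.comp and scale.comp use num_threads = 128 with four elements per thread.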


@@ -3,6 +3,8 @@
 #include "types.comp"
 #include "generic_unary_head.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
     const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
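
Note: this explicit flattening (z * 262144 + y * 512 + x) matches the {512, 512, CEIL_DIV(ne, 262144)} dispatch computed in ggml_vk_cpy_to_contiguous() above, and is presumably the same calculation get_idx() performs for the shaders that use it.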


@@ -3,6 +3,8 @@
 #include "types.comp"
 #include "generic_unary_head.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 uint src0_idx_mod(uint idx) {
     const uint i13 = idx / (p.ne12*p.ne11*p.ne10);
     const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;


@@ -3,12 +3,22 @@
 #include "types.comp"
 #include "generic_unary_head.comp"
 
+const uint num_threads = 128;
+
+layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
-    const uint idx = get_idx();
+    uint idx = get_idx();
 
-    if (idx >= p.ne) {
-        return;
+    // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
+    const uint num_iter = 4;
+
+    [[unroll]] for (uint i = 0; i < num_iter; ++i) {
+        if (idx >= p.ne) {
+            continue;
+        }
+        data_d[p.d_offset + idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) * FLOAT_TYPE(p.param1));
+        idx += num_threads;
     }
-
-    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1));
 }
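
Note: the old body addressed elements through dst_idx()/src0_idx(), which tolerated non-contiguous tensors; the new strided loop indexes linearly, which is presumably why GGML_OP_SCALE is dropped from ggml_vk_op_supports_incontiguous() in the ggml-vulkan.cpp hunk above.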


@@ -3,6 +3,8 @@
 #include "types.comp"
 #include "generic_unary_head.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
     const uint idx = get_idx();


@@ -3,6 +3,8 @@
 #include "types.comp"
 #include "generic_unary_head.comp"
 
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
 void main() {
     const uint idx = get_idx();


@@ -350,6 +350,9 @@ void process_shaders() {
     string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
     string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
+    string_to_spv("contig_cpy_f32_f32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("contig_cpy_f32_f16", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
 
     string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
     string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});


@@ -681,6 +681,7 @@ struct test_case {
         // run
         int64_t total_time_us = 0;
         int64_t total_mem = 0;
+        int total_runs = 0;
         do {
            int64_t start_time = ggml_time_us();
@@ -688,6 +689,7 @@ struct test_case {
            int64_t end_time = ggml_time_us();
 
            total_time_us += end_time - start_time;
            total_mem += mem;
+           total_runs += n_runs;
        } while (total_time_us < 1000*1000); // run for at least 1 second
@@ -717,7 +719,7 @@ struct test_case {
        } else {
            printf("%8zu kB/run - \033[1;34m%7.2f GB/s\033[0m",
                   op_size(out) / 1024,
-                  mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
+                  total_mem / (total_time_us / 1e6) / 1024.0 / 1024.0 / 1024.0);
        }
        printf("\n");
@@ -2740,6 +2742,13 @@ struct test_flash_attn_ext : public test_case {
         return 5e-4;
     }
 
+    uint64_t op_flops(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        // Just counting matmul costs:
+        // Q*K^T is nb x hs x kv, P*V is nb x kv x hs, per head
+        return 2 * 2 * nh * nb * hs * kv;
+    }
+
     test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8,
                         bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
         : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {}
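
Note: with the default shapes (hs = 128, nh = 32, kv = 96, nb = 8) this evaluates to 2 * 2 * 32 * 8 * 128 * 96 = 12,582,912 FLOPs: one factor of 2 for the multiply-add, one for the two matmuls (Q*K^T and P*V), ignoring softmax and masking work as the comment notes.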
@@ -3779,6 +3788,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
     test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
 
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
+
     for (int bs : {1, 512}) {
         for (ggml_type type_a : all_types) {
             for (ggml_type type_b : {GGML_TYPE_F32}) {